1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_affinity.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
45 #endif /* defined(KMP_GOMP_COMPAT) */
46 
47 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
48 #if OMP_50_ENABLED
49     "5.0 (201611)";
50 #elif OMP_45_ENABLED
51     "4.5 (201511)";
52 #elif OMP_40_ENABLED
53     "4.0 (201307)";
54 #else
55     "3.1 (201107)";
56 #endif
57 
58 #ifdef KMP_DEBUG
59 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
63 
64 /* ------------------------------------------------------------------------ */
65 /* ------------------------------------------------------------------------ */
66 
67 kmp_info_t __kmp_monitor;
68 
69 /* ------------------------------------------------------------------------ */
70 /* ------------------------------------------------------------------------ */
71 
72 /* Forward declarations */
73 
74 void __kmp_cleanup( void );
75 
76 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
77 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
78 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
79 static void __kmp_partition_places( kmp_team_t *team, int update_master_only=0 );
80 #endif
81 static void __kmp_do_serial_initialize( void );
82 void __kmp_fork_barrier( int gtid, int tid );
83 void __kmp_join_barrier( int gtid );
84 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
85 
86 #ifdef USE_LOAD_BALANCE
87 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
88 #endif
89 
90 static int __kmp_expand_threads(int nWish, int nNeed);
91 #if KMP_OS_WINDOWS
92 static int __kmp_unregister_root_other_thread( int gtid );
93 #endif
94 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
95 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
96 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
97 
98 /* ------------------------------------------------------------------------ */
99 /* ------------------------------------------------------------------------ */
100 
101 /* Calculate the identifier of the current thread */
102 /* fast (and somewhat portable) way to get a unique */
103 /* identifier for the executing thread.             */
104 /* returns KMP_GTID_DNE if we haven't been assigned a gtid   */
105 
106 int
107 __kmp_get_global_thread_id( )
108 {
109     int i;
110     kmp_info_t   **other_threads;
111     size_t         stack_data;
112     char          *stack_addr;
113     size_t         stack_size;
114     char          *stack_base;
115 
116     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
117                       __kmp_nth, __kmp_all_nth ));
118 
119     /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
120              parallel region, this function returns KMP_GTID_DNE to force serial_initialize by the
121              caller.  Every call-site must handle KMP_GTID_DNE, or else guarantee that
122              __kmp_init_gtid is set, for this to work.  */
123 
124     if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
125 
126 #ifdef KMP_TDATA_GTID
127     if ( TCR_4(__kmp_gtid_mode) >= 3) {
128         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
129         return __kmp_gtid;
130     }
131 #endif
132     if ( TCR_4(__kmp_gtid_mode) >= 2) {
133         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
134         return __kmp_gtid_get_specific();
135     }
136     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
137 
138     stack_addr    = (char*) & stack_data;
139     other_threads = __kmp_threads;
140 
141     /*
142         ATT: The code below is a source of potential bugs due to unsynchronized access to
143         __kmp_threads array. For example:
144             1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
145             2. Current thread is suspended by OS.
146             3. Another thread unregisters and finishes (debug versions of free() may fill memory
147                with something like 0xEF).
148             4. Current thread is resumed.
149             5. Current thread reads junk from *thr.
150         TODO: Fix it.
151         --ln
152     */
153 
154     for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
155 
156         kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157         if( !thr ) continue;
158 
159         stack_size =  (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
160         stack_base =  (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
161 
162         /* stack grows down -- search through all of the active threads */
163 
164         if( stack_addr <= stack_base ) {
165             size_t stack_diff = stack_base - stack_addr;
166 
167             if( stack_diff <= stack_size ) {
168                 /* The only way we can be closer than the allocated */
169                 /* stack size is if we are running on this thread. */
170                 KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
171                 return i;
172             }
173         }
174     }
175 
176     /* get specific to try and determine our gtid */
177     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
178                       "thread, using TLS\n" ));
179     i = __kmp_gtid_get_specific();
180 
181     /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
182 
183     /* if we haven't been assigned a gtid, return that (negative) code */
184     if( i<0 ) return i;
185 
186     /* dynamically updated stack window for uber threads to avoid get_specific call */
187     if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
188         KMP_FATAL( StackOverflow, i );
189     }
190 
191     stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
192     if( stack_addr > stack_base ) {
193         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
194         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195           other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
196     } else {
197         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
198     }
199 
200     /* Reprint stack bounds for ubermaster since they have been refined */
201     if ( __kmp_storage_map ) {
202         char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
203         char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
204         __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
205                                       other_threads[i]->th.th_info.ds.ds_stacksize,
206                                       "th_%d stack (refinement)", i );
207     }
208     return i;
209 }
210 
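// A minimal, standalone sketch (not part of the runtime) of the stack-range
// lookup that __kmp_get_global_thread_id() performs above. The thread table
// here (g_stack_base / g_stack_size / g_nthreads) is hypothetical; the real
// code reads __kmp_threads[i]->th.th_info.ds. Compile-guarded so it cannot
// affect the build.
#if 0
#include <cstddef>

static char  *g_stack_base[64];   // highest address of each thread's stack
static size_t g_stack_size[64];   // size of each thread's stack
static int    g_nthreads;

// Returns the index of the thread whose stack contains 'addr', or -1.
// Stacks grow down, so thread i's stack occupies
// [g_stack_base[i] - g_stack_size[i], g_stack_base[i]].
static int find_thread_by_stack_addr(const char *addr) {
  for (int i = 0; i < g_nthreads; ++i) {
    if (addr <= g_stack_base[i] &&
        (size_t)(g_stack_base[i] - addr) <= g_stack_size[i])
      return i;
  }
  return -1;
}
#endif
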
211 int
212 __kmp_get_global_thread_id_reg( )
213 {
214     int gtid;
215 
216     if ( !__kmp_init_serial ) {
217         gtid = KMP_GTID_DNE;
218     } else
219 #ifdef KMP_TDATA_GTID
220     if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
221         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
222         gtid = __kmp_gtid;
223     } else
224 #endif
225     if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
226         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
227         gtid = __kmp_gtid_get_specific();
228     } else {
229         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
230         gtid = __kmp_get_global_thread_id();
231     }
232 
233     /* we must be a new uber master sibling thread */
234     if( gtid == KMP_GTID_DNE ) {
235         KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
236                         "Registering a new gtid.\n" ));
237         __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
238         if( !__kmp_init_serial ) {
239             __kmp_do_serial_initialize();
240             gtid = __kmp_gtid_get_specific();
241         } else {
242             gtid = __kmp_register_root(FALSE);
243         }
244         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
245         /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
246     }
247 
248     KMP_DEBUG_ASSERT( gtid >=0 );
249 
250     return gtid;
251 }
252 
253 /* caller must hold forkjoin_lock */
254 void
255 __kmp_check_stack_overlap( kmp_info_t *th )
256 {
257     int f;
258     char *stack_beg = NULL;
259     char *stack_end = NULL;
260     int gtid;
261 
262     KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
263     if ( __kmp_storage_map ) {
264         stack_end = (char *) th->th.th_info.ds.ds_stackbase;
265         stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
266 
267         gtid = __kmp_gtid_from_thread( th );
268 
269         if (gtid == KMP_GTID_MONITOR) {
270             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271                                      "th_%s stack (%s)", "mon",
272                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
273         } else {
274             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
275                                      "th_%d stack (%s)", gtid,
276                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
277         }
278     }
279 
280     /* No point in checking ubermaster threads since they use refinement and cannot overlap */
281     gtid = __kmp_gtid_from_thread( th );
282     if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
283     {
284         KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
285         if ( stack_beg == NULL ) {
286             stack_end = (char *) th->th.th_info.ds.ds_stackbase;
287             stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
288         }
289 
290         for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
291             kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
292 
293             if( f_th && f_th != th ) {
294                 char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295                 char *other_stack_beg = other_stack_end -
296                                         (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297                 if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298                    (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300                     /* Print the other stack values before the abort */
301                     if ( __kmp_storage_map )
302                         __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
303                             (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
304                             "th_%d stack (overlapped)",
305                                                  __kmp_gtid_from_thread( f_th ) );
306 
307                     __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
308                 }
309             }
310         }
311     }
312     KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
313 }
314 
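// A minimal, standalone sketch (not part of the runtime) of the interval test
// behind __kmp_check_stack_overlap() above: two stacks collide when their
// address ranges intersect. The half-open-range helper and the names used
// here are hypothetical. Compile-guarded so it cannot affect the build.
#if 0
#include <cstdio>

// True if [beg_a, end_a) and [beg_b, end_b) share at least one byte.
static bool ranges_overlap(const char *beg_a, const char *end_a,
                           const char *beg_b, const char *end_b) {
  return beg_a < end_b && beg_b < end_a;
}

int main() {
  char buf[256];
  // Two sub-ranges of the same buffer that overlap by 32 bytes => prints 1.
  std::printf("%d\n", ranges_overlap(buf, buf + 128, buf + 96, buf + 224));
  return 0;
}
#endif
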
315 
316 /* ------------------------------------------------------------------------ */
317 
318 /* ------------------------------------------------------------------------ */
319 
320 void
321 __kmp_infinite_loop( void )
322 {
323     static int done = FALSE;
324 
325     while (! done) {
326         KMP_YIELD( 1 );
327     }
328 }
329 
330 #define MAX_MESSAGE     512
331 
332 void
333 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
334     char buffer[MAX_MESSAGE];
335     va_list ap;
336 
337     va_start( ap, format);
338     KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
339     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
340     __kmp_vprintf( kmp_err, buffer, ap );
341 #if KMP_PRINT_DATA_PLACEMENT
342     int node;
343     if(gtid >= 0) {
344         if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
345             if( __kmp_storage_map_verbose ) {
346                 node = __kmp_get_host_node(p1);
347                 if(node < 0)  /* doesn't work, so don't try this next time */
348                     __kmp_storage_map_verbose = FALSE;
349                 else {
350                     char *last;
351                     int lastNode;
352                     int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354                     const int page_size = KMP_GET_PAGE_SIZE();
355 
356                     p1 = (void *)( (size_t)p1 & ~((size_t)page_size - 1) );
357                     p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)page_size - 1) );
358                     if(localProc >= 0)
359                         __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid, localProc>>1);
360                     else
361                         __kmp_printf_no_lock("  GTID %d\n", gtid);
362 # if KMP_USE_PRCTL
363 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
364                     do {
365                         last = p1;
366                         lastNode = node;
367                         /* This loop collates adjacent pages with the same host node. */
368                         do {
369                             p1 = (char*)p1 + page_size;
370                         } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
371                         __kmp_printf_no_lock("    %p-%p memNode %d\n", last,
372                                              (char*)p1 - 1, lastNode);
373                     } while(p1 <= p2);
374 # else
375                     __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
376                                          (char*)p1 + (page_size - 1), __kmp_get_host_node(p1));
377                     if(p1 < p2)  {
378                         __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                              (char*)p2 + (page_size - 1), __kmp_get_host_node(p2));
380                     }
381 # endif
382                 }
383             }
384         } else
385             __kmp_printf_no_lock("  %s\n", KMP_I18N_STR( StorageMapWarning ) );
386     }
387 #endif /* KMP_PRINT_DATA_PLACEMENT */
388     __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
389 }
390 
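// A minimal, standalone sketch (not part of the runtime) of the page-alignment
// mask used above: 'addr & ~(page_size - 1)' rounds an address down to a page
// boundary, which requires page_size to be a power of two. PAGE_SIZE here is a
// hypothetical stand-in for KMP_GET_PAGE_SIZE(). Compile-guarded.
#if 0
#include <cstddef>
#include <cstdint>
#include <cstdio>

static const std::size_t PAGE_SIZE = 4096;

static void *page_align_down(void *p) {
  return (void *)((std::uintptr_t)p & ~(std::uintptr_t)(PAGE_SIZE - 1));
}

int main() {
  int x;
  std::printf("%p -> %p\n", (void *)&x, page_align_down(&x));
  return 0;
}
#endif
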
391 void
392 __kmp_warn( char const * format, ... )
393 {
394     char buffer[MAX_MESSAGE];
395     va_list ap;
396 
397     if ( __kmp_generate_warnings == kmp_warnings_off ) {
398         return;
399     }
400 
401     va_start( ap, format );
402 
403     KMP_SNPRINTF( buffer, sizeof(buffer), "OMP warning: %s\n", format );
404     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
405     __kmp_vprintf( kmp_err, buffer, ap );
406     __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
407 
408     va_end( ap );
409 }
410 
411 void
412 __kmp_abort_process()
413 {
414 
415     // Later threads may stall here, but that's ok because abort() will kill them.
416     __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
417 
418     if ( __kmp_debug_buf ) {
419         __kmp_dump_debug_buffer();
420     }; // if
421 
422     if ( KMP_OS_WINDOWS ) {
423         // Let other threads know of abnormal termination and prevent deadlock
424         // if abort happened during library initialization or shutdown
425         __kmp_global.g.g_abort = SIGABRT;
426 
427         /*
428             On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly testing.
429             Unfortunately, we cannot reliably suppress pop-up error boxes. _set_abort_behavior()
430             works well, but that function is not available in VS7 (this is not a problem for the DLL,
431             but it is a problem for the static OpenMP RTL). SetErrorMode (and so the timelimit utility)
432             does not help, at least in some versions of the MS C RTL.
433 
434             The following sequence seems to be the only way to simulate abort() and avoid the pop-up
435             error box.
436         */
437         raise( SIGABRT );
438         _exit( 3 );    // Just in case, if signal ignored, exit anyway.
439     } else {
440         abort();
441     }; // if
442 
443     __kmp_infinite_loop();
444     __kmp_release_bootstrap_lock( & __kmp_exit_lock );
445 
446 } // __kmp_abort_process
447 
448 void
449 __kmp_abort_thread( void )
450 {
451     // TODO: Eliminate g_abort global variable and this function.
452     // In case of abort just call abort(), it will kill all the threads.
453     __kmp_infinite_loop();
454 } // __kmp_abort_thread
455 
456 /* ------------------------------------------------------------------------ */
457 
458 /*
459  * Print out the storage map for the major kmp_info_t thread data structures
460  * that are allocated together.
461  */
462 
463 static void
464 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
465 {
466     __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
467 
468     __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
469                              "th_%d.th_info", gtid );
470 
471     __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
472                              "th_%d.th_local", gtid );
473 
474     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
475                              sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
476 
477     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
478                              &thr->th.th_bar[bs_plain_barrier+1],
479                              sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
480 
481     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
482                              &thr->th.th_bar[bs_forkjoin_barrier+1],
483                              sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
484 
485     #if KMP_FAST_REDUCTION_BARRIER
486         __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
487                              &thr->th.th_bar[bs_reduction_barrier+1],
488                              sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
489     #endif // KMP_FAST_REDUCTION_BARRIER
490 }
491 
492 /*
493  * Print out the storage map for the major kmp_team_t team data structures
494  * that are allocated together.
495  */
496 
497 static void
498 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
499 {
500     int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
501     __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
502                              header, team_id );
503 
504     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
505                              sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
506 
507 
508     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
509                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
510 
511     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
512                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
513 
514     #if KMP_FAST_REDUCTION_BARRIER
515         __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
516                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
517     #endif // KMP_FAST_REDUCTION_BARRIER
518 
519     __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520                              sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
521 
522     __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
523                              sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
524 
525     __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
526                              sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
527                              header, team_id );
528 
529 
530     __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
531                              sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
532 }
533 
534 static void __kmp_init_allocator() {}
535 static void __kmp_fini_allocator() {}
536 
537 /* ------------------------------------------------------------------------ */
538 
539 #ifdef KMP_DYNAMIC_LIB
540 # if KMP_OS_WINDOWS
541 
542 static void
543 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
544     // TODO: Change to __kmp_break_bootstrap_lock().
545     __kmp_init_bootstrap_lock( lck ); // make the lock released
546 }
547 
548 static void
549 __kmp_reset_locks_on_process_detach( int gtid_req ) {
550     int i;
551     int thread_count;
552 
553     // PROCESS_DETACH is expected to be called by a thread
554     // that executes ProcessExit() or FreeLibrary().
555     // The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary),
556     // so it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
557     // However, in practice some threads can still be alive here, even though they are about
558     // to be terminated.  The threads in the array with ds_thread==0 are the most suspicious,
559     // so it may actually not be safe to access __kmp_threads[].
560 
561     // TODO: does it make sense to check __kmp_roots[] ?
562 
563     // Let's check that there are no other alive threads registered with the OMP lib.
564     while( 1 ) {
565         thread_count = 0;
566         for( i = 0; i < __kmp_threads_capacity; ++i ) {
567             if( !__kmp_threads ) continue;
568             kmp_info_t* th = __kmp_threads[ i ];
569             if( th == NULL ) continue;
570             int gtid = th->th.th_info.ds.ds_gtid;
571             if( gtid == gtid_req ) continue;
572             if( gtid < 0 ) continue;
573             DWORD exit_val;
574             int alive = __kmp_is_thread_alive( th, &exit_val );
575             if( alive ) {
576                 ++thread_count;
577             }
578         }
579         if( thread_count == 0 ) break; // success
580     }
581 
582     // Assume that I'm alone.
583 
584     // Now it is probably safe to check and reset the locks.
585     // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
586     __kmp_reset_lock( &__kmp_forkjoin_lock );
587     #ifdef KMP_DEBUG
588     __kmp_reset_lock( &__kmp_stdio_lock );
589     #endif // KMP_DEBUG
590 }
591 
592 BOOL WINAPI
593 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
594     //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
595 
596     switch( fdwReason ) {
597 
598         case DLL_PROCESS_ATTACH:
599             KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
600 
601             return TRUE;
602 
603         case DLL_PROCESS_DETACH:
604             KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
605                         __kmp_gtid_get_specific() ));
606 
607             if( lpReserved != NULL )
608             {
609                 // lpReserved is used for telling the difference:
610                 //  lpReserved == NULL when FreeLibrary() was called,
611                 //  lpReserved != NULL when the process terminates.
612                 // When FreeLibrary() is called, worker threads remain alive.
613                 // So they will release the forkjoin lock by themselves.
614                 // When the process terminates, worker threads disappear triggering
615                 // the problem of unreleased forkjoin lock as described below.
616 
617                 // A worker thread can take the forkjoin lock.
618                 // The problem comes up if that worker thread becomes dead
619                 // before it releases the forkjoin lock.
620                 // The forkjoin lock remains taken, while the thread
621                 // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
622                 // will try to take the forkjoin lock and will always fail,
623                 // so that the application will never finish [normally].
624                 // This scenario is possible if __kmpc_end() has not been executed.
625                 // This is not a corner case; it covers common situations:
626                 // - the main function was compiled by an alternative compiler;
627                 // - the main function was compiled by icl but without /Qopenmp (application with plugins);
628                 // - the application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP;
629                 // - an alive foreign thread prevented __kmpc_end from doing cleanup.
630 
631                 // This is a hack to work around the problem.
632                 // TODO: !!! to figure out something better.
633                 __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
634             }
635 
636             __kmp_internal_end_library( __kmp_gtid_get_specific() );
637 
638             return TRUE;
639 
640         case DLL_THREAD_ATTACH:
641             KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
642 
643             /* if we wanted to register new sibling threads every time one attaches,
644              * we would call __kmp_get_gtid() here */
645             return TRUE;
646 
647         case DLL_THREAD_DETACH:
648             KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
649                         __kmp_gtid_get_specific() ));
650 
651             __kmp_internal_end_thread( __kmp_gtid_get_specific() );
652             return TRUE;
653     }
654 
655     return TRUE;
656 }
657 
658 # endif /* KMP_OS_WINDOWS */
659 #endif /* KMP_DYNAMIC_LIB */
660 
661 
662 /* ------------------------------------------------------------------------ */
663 
664 /* Change the library type to "status" and return the old type */
665 /* called from within initialization routines where __kmp_initz_lock is held */
666 int
667 __kmp_change_library( int status )
668 {
669     int old_status;
670 
671     old_status = __kmp_yield_init & 1;  // check whether KMP_LIBRARY=throughput (even init count)
672 
673     if (status) {
674         __kmp_yield_init |= 1;  // throughput => turnaround (odd init count)
675     }
676     else {
677         __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
678     }
679 
680     return old_status;  // return previous setting of whether KMP_LIBRARY=throughput
681 }
682 
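// A minimal, standalone sketch (not part of the runtime) of the low-bit
// encoding used by __kmp_change_library() above: an even counter means
// KMP_LIBRARY=throughput, an odd counter means turnaround. 'yield_init' and
// 'change_library' are hypothetical stand-ins. Compile-guarded.
#if 0
#include <cstdio>

static int yield_init = 8; // even => throughput

static int change_library(int turnaround) {
  int old_turnaround = yield_init & 1; // previous mode bit (1 == turnaround)
  if (turnaround)
    yield_init |= 1;  // make the count odd  => turnaround
  else
    yield_init &= ~1; // make the count even => throughput
  return old_turnaround;
}

int main() {
  std::printf("was turnaround: %d\n", change_library(1)); // prints 0
  std::printf("was turnaround: %d\n", change_library(0)); // prints 1
  return 0;
}
#endif
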
683 /* ------------------------------------------------------------------------ */
684 /* ------------------------------------------------------------------------ */
685 
686 /* __kmp_parallel_deo --
687  * Wait until it's our turn.
688  */
689 void
690 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
691 {
692     int gtid = *gtid_ref;
693 #ifdef BUILD_PARALLEL_ORDERED
694     kmp_team_t *team = __kmp_team_from_gtid( gtid );
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697     if( __kmp_env_consistency_check ) {
698         if( __kmp_threads[gtid]->th.th_root->r.r_active )
699 #if KMP_USE_DYNAMIC_LOCK
700             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
701 #else
702             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
703 #endif
704     }
705 #ifdef BUILD_PARALLEL_ORDERED
706     if( !team->t.t_serialized ) {
707         KMP_MB();
708         KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
709         KMP_MB();
710     }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
713 
714 /* __kmp_parallel_dxo --
715  * Signal the next task.
716  */
717 
718 void
719 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
720 {
721     int gtid = *gtid_ref;
722 #ifdef BUILD_PARALLEL_ORDERED
723     int tid =  __kmp_tid_from_gtid( gtid );
724     kmp_team_t *team = __kmp_team_from_gtid( gtid );
725 #endif /* BUILD_PARALLEL_ORDERED */
726 
727     if( __kmp_env_consistency_check ) {
728         if( __kmp_threads[gtid]->th.th_root->r.r_active )
729             __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
730     }
731 #ifdef BUILD_PARALLEL_ORDERED
732     if ( ! team->t.t_serialized ) {
733         KMP_MB();       /* Flush all pending memory write invalidates.  */
734 
735         /* use the tid of the next thread in this team */
736         /* TODO: replace with a general release procedure */
737         team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
738 
739 #if OMPT_SUPPORT && OMPT_BLAME
740         if (ompt_enabled &&
741             ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
742             /* accept blame for "ordered" waiting */
743             kmp_info_t *this_thread = __kmp_threads[gtid];
744             ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
745                 this_thread->th.ompt_thread_info.wait_id);
746         }
747 #endif
748 
749         KMP_MB();       /* Flush all pending memory write invalidates.  */
750     }
751 #endif /* BUILD_PARALLEL_ORDERED */
752 }
753 
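// A minimal, standalone sketch (not part of the runtime) of the turn-taking
// scheme behind __kmp_parallel_deo()/__kmp_parallel_dxo() above: each thread
// spins until the shared counter equals its tid, runs the ordered body, then
// passes the turn to (tid + 1) % nproc. 'ordered_turn' is a hypothetical
// stand-in for team->t.t_ordered.dt.t_value. Compile-guarded.
#if 0
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

static std::atomic<int> ordered_turn{0};

static void enter_ordered(int tid) {           // __kmp_parallel_deo analogue
  while (ordered_turn.load(std::memory_order_acquire) != tid)
    std::this_thread::yield();                 // spin until it is our turn
}

static void exit_ordered(int tid, int nproc) { // __kmp_parallel_dxo analogue
  ordered_turn.store((tid + 1) % nproc, std::memory_order_release);
}

int main() {
  const int nproc = 4;
  std::vector<std::thread> team;
  for (int tid = 0; tid < nproc; ++tid)
    team.emplace_back([=] {
      enter_ordered(tid);
      std::printf("ordered body executed by tid %d\n", tid); // prints 0,1,2,3
      exit_ordered(tid, nproc);
    });
  for (std::thread &t : team)
    t.join();
  return 0;
}
#endif
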
754 /* ------------------------------------------------------------------------ */
755 /* ------------------------------------------------------------------------ */
756 
757 /* ------------------------------------------------------------------------ */
758 /* ------------------------------------------------------------------------ */
759 
760 /* The BARRIER for a SINGLE process section is always explicit   */
761 
762 int
763 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
764 {
765     int status;
766     kmp_info_t *th;
767     kmp_team_t *team;
768 
769     if( ! TCR_4(__kmp_init_parallel) )
770         __kmp_parallel_initialize();
771 
772     th   = __kmp_threads[ gtid ];
773     team = th->th.th_team;
774     status = 0;
775 
776     th->th.th_ident = id_ref;
777 
778     if ( team->t.t_serialized ) {
779         status = 1;
780     } else {
781         kmp_int32 old_this = th->th.th_local.this_construct;
782 
783         ++th->th.th_local.this_construct;
784         /* try to set team count to thread count--success means thread got the
785            single block
786         */
787         /* TODO: Should this be acquire or release? */
788         if (team->t.t_construct == old_this) {
789             status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
790                                                  th->th.th_local.this_construct);
791         }
792 #if USE_ITT_BUILD
793         if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
794 #if OMP_40_ENABLED
795             th->th.th_teams_microtask == NULL &&
796 #endif
797             team->t.t_active_level == 1 )
798         {   // Only report metadata by master of active team at level 1
799             __kmp_itt_metadata_single( id_ref );
800         }
801 #endif /* USE_ITT_BUILD */
802     }
803 
804     if( __kmp_env_consistency_check ) {
805         if (status && push_ws) {
806             __kmp_push_workshare( gtid, ct_psingle, id_ref );
807         } else {
808             __kmp_check_workshare( gtid, ct_psingle, id_ref );
809         }
810     }
811 #if USE_ITT_BUILD
812     if ( status ) {
813         __kmp_itt_single_start( gtid );
814     }
815 #endif /* USE_ITT_BUILD */
816     return status;
817 }
818 
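// A minimal, standalone sketch (not part of the runtime) of the
// compare-and-swap idea used in __kmp_enter_single() above: every thread
// advances its private construct counter, but only the one whose CAS moves
// the shared team counter forward wins the single block. 'construct_count'
// and 'enter_single' are hypothetical stand-ins. Compile-guarded.
#if 0
#include <atomic>

static std::atomic<int> construct_count{0};

// 'this_construct' is the calling thread's private per-construct counter.
// Returns true for exactly one thread per single region.
static bool enter_single(int &this_construct) {
  int old_this = this_construct++;   // value all threads agree on for this region
  int expected = old_this;
  return construct_count.compare_exchange_strong(expected, this_construct);
}
#endif
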
819 void
820 __kmp_exit_single( int gtid )
821 {
822 #if USE_ITT_BUILD
823     __kmp_itt_single_end( gtid );
824 #endif /* USE_ITT_BUILD */
825     if( __kmp_env_consistency_check )
826         __kmp_pop_workshare( gtid, ct_psingle, NULL );
827 }
828 
829 
830 /*
831  * Determine whether we can go parallel or must use a serialized parallel region,
832  * and how many threads we can use.
833  * set_nthreads is the number of threads requested for the team.
834  * Returns 1 if we should serialize or use only one thread,
835  * otherwise the number of threads to use.
836  * The forkjoin lock is held by the caller.
837  */
838 static int
839 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
840    int master_tid, int set_nthreads
841 #if OMP_40_ENABLED
842   , int enter_teams
843 #endif /* OMP_40_ENABLED */
844 )
845 {
846     int capacity;
847     int new_nthreads;
848     KMP_DEBUG_ASSERT( __kmp_init_serial );
849     KMP_DEBUG_ASSERT( root && parent_team );
850 
851     //
852     // If dyn-var is set, dynamically adjust the number of desired threads,
853     // according to the method specified by dynamic_mode.
854     //
855     new_nthreads = set_nthreads;
856     if ( ! get__dynamic_2( parent_team, master_tid ) ) {
857         ;
858     }
859 #ifdef USE_LOAD_BALANCE
860     else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
861         new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
862         if ( new_nthreads == 1 ) {
863             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
864               master_tid ));
865             return 1;
866         }
867         if ( new_nthreads < set_nthreads ) {
868             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
869               master_tid, new_nthreads ));
870         }
871     }
872 #endif /* USE_LOAD_BALANCE */
873     else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
874         new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
875           : root->r.r_hot_team->t.t_nproc);
876         if ( new_nthreads <= 1 ) {
877             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
878               master_tid ));
879             return 1;
880         }
881         if ( new_nthreads < set_nthreads ) {
882             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
883               master_tid, new_nthreads ));
884         }
885         else {
886             new_nthreads = set_nthreads;
887         }
888     }
889     else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
890         if ( set_nthreads > 2 ) {
891             new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
892             new_nthreads = ( new_nthreads % set_nthreads ) + 1;
893             if ( new_nthreads == 1 ) {
894                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
895                   master_tid ));
896                 return 1;
897             }
898             if ( new_nthreads < set_nthreads ) {
899                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
900                   master_tid, new_nthreads ));
901             }
902         }
903     }
904     else {
905         KMP_ASSERT( 0 );
906     }
907 
908     //
909     // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
910     //
911     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
912       root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
913         int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
914           root->r.r_hot_team->t.t_nproc );
915         if ( tl_nthreads <= 0 ) {
916             tl_nthreads = 1;
917         }
918 
919         //
920         // If dyn-var is false, emit a 1-time warning.
921         //
922         if ( ! get__dynamic_2( parent_team, master_tid )
923           && ( ! __kmp_reserve_warn ) ) {
924             __kmp_reserve_warn = 1;
925             __kmp_msg(
926                 kmp_ms_warning,
927                 KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
928                 KMP_HNT( Unset_ALL_THREADS ),
929                 __kmp_msg_null
930             );
931         }
932         if ( tl_nthreads == 1 ) {
933             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
934               master_tid ));
935             return 1;
936         }
937         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
938           master_tid, tl_nthreads ));
939         new_nthreads = tl_nthreads;
940     }
941 
942     //
943     // Check if the threads array is large enough, or needs expanding.
944     //
945     // See comment in __kmp_register_root() about the adjustment if
946     // __kmp_threads[0] == NULL.
947     //
948     capacity = __kmp_threads_capacity;
949     if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
950         --capacity;
951     }
952     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
953       root->r.r_hot_team->t.t_nproc ) > capacity ) {
954         //
955         // Expand the threads array.
956         //
957         int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
958           root->r.r_hot_team->t.t_nproc ) - capacity;
959         int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
960         if ( slotsAdded < slotsRequired ) {
961             //
962             // The threads array was not expanded enough.
963             //
964             new_nthreads -= ( slotsRequired - slotsAdded );
965             KMP_ASSERT( new_nthreads >= 1 );
966 
967             //
968             // If dyn-var is false, emit a 1-time warning.
969             //
970             if ( ! get__dynamic_2( parent_team, master_tid )
971               && ( ! __kmp_reserve_warn ) ) {
972                 __kmp_reserve_warn = 1;
973                 if ( __kmp_tp_cached ) {
974                     __kmp_msg(
975                         kmp_ms_warning,
976                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
977                         KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
978                         KMP_HNT( PossibleSystemLimitOnThreads ),
979                         __kmp_msg_null
980                     );
981                 }
982                 else {
983                     __kmp_msg(
984                         kmp_ms_warning,
985                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
986                         KMP_HNT( SystemLimitOnThreads ),
987                         __kmp_msg_null
988                     );
989                 }
990             }
991         }
992     }
993 
994     if ( new_nthreads == 1 ) {
995         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
996                         __kmp_get_gtid(), set_nthreads ) );
997         return 1;
998     }
999 
1000     KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
1001                     __kmp_get_gtid(), new_nthreads, set_nthreads ));
1002     return new_nthreads;
1003 }
1004 
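// A minimal, standalone sketch (not part of the runtime) of the clamping
// arithmetic __kmp_reserve_threads() applies above when honoring a global
// thread limit. The names (max_threads, active_threads, reusable_threads)
// are hypothetical stand-ins for __kmp_max_nth, __kmp_nth and the
// hot-team/master adjustment. Compile-guarded.
#if 0
#include <cstdio>

// How many of the 'requested' threads may be reserved without letting the
// process exceed 'max_threads'. 'reusable_threads' are threads the new team
// reuses (the master, or an existing hot team), so they do not add to the
// global count. Never returns less than 1.
static int clamp_reservation(int requested, int max_threads,
                             int active_threads, int reusable_threads) {
  if (active_threads + requested - reusable_threads <= max_threads)
    return requested;                 // fits as requested
  int allowed = max_threads - active_threads + reusable_threads;
  return allowed > 1 ? allowed : 1;   // clamp, but never below one thread
}

int main() {
  // 60 threads already running, limit 64, master is reusable: only 5 fit.
  std::printf("%d\n", clamp_reservation(8, 64, 60, 1)); // prints 5
  return 0;
}
#endif
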
1005 /* ------------------------------------------------------------------------ */
1006 /* ------------------------------------------------------------------------ */
1007 
1008 /* allocate threads from the thread pool and assign them to the new team */
1009 /* we are assured that there are enough threads available, because we
1010  * checked for that earlier while holding the forkjoin lock */
1011 
1012 static void
1013 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1014                          kmp_info_t *master_th, int master_gtid )
1015 {
1016     int         i;
1017     int use_hot_team;
1018 
1019     KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1020     KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1021     KMP_MB();
1022 
1023     /* first, let's setup the master thread */
1024     master_th->th.th_info.ds.ds_tid  = 0;
1025     master_th->th.th_team            = team;
1026     master_th->th.th_team_nproc      = team->t.t_nproc;
1027     master_th->th.th_team_master     = master_th;
1028     master_th->th.th_team_serialized = FALSE;
1029     master_th->th.th_dispatch        = & team->t.t_dispatch[ 0 ];
1030 
1031     /* make sure we are not the optimized hot team */
1032 #if KMP_NESTED_HOT_TEAMS
1033     use_hot_team = 0;
1034     kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1035     if( hot_teams ) {  // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1036         int level = team->t.t_active_level - 1;    // index in array of hot teams
1037         if( master_th->th.th_teams_microtask ) {    // are we inside the teams?
1038             if( master_th->th.th_teams_size.nteams > 1 ) {
1039                 ++level; // level was not increased in teams construct for team_of_masters
1040             }
1041             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1042                 master_th->th.th_teams_level == team->t.t_level ) {
1043                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
1044             }            // team->t.t_level will be increased inside parallel
1045         }
1046         if( level < __kmp_hot_teams_max_level ) {
1047             if( hot_teams[level].hot_team ) {
1048                 // hot team has already been allocated for given level
1049                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1050                 use_hot_team = 1; // the team is ready to use
1051             } else {
1052                 use_hot_team = 0; // AC: threads are not allocated yet
1053                 hot_teams[level].hot_team = team; // remember new hot team
1054                 hot_teams[level].hot_team_nth = team->t.t_nproc;
1055             }
1056         } else {
1057             use_hot_team = 0;
1058         }
1059     }
1060 #else
1061     use_hot_team = team == root->r.r_hot_team;
1062 #endif
1063     if ( !use_hot_team ) {
1064 
1065         /* install the master thread */
1066         team->t.t_threads[ 0 ]    = master_th;
1067         __kmp_initialize_info( master_th, team, 0, master_gtid );
1068 
1069         /* now, install the worker threads */
1070         for ( i=1 ;  i < team->t.t_nproc ; i++ ) {
1071 
1072             /* fork or reallocate a new thread and install it in team */
1073             kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1074             team->t.t_threads[ i ] = thr;
1075             KMP_DEBUG_ASSERT( thr );
1076             KMP_DEBUG_ASSERT( thr->th.th_team == team );
1077             /* align team and thread arrived states */
1078             KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1079                             __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1080                             __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1081                             team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1082                             team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1083 #if OMP_40_ENABLED
1084             thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1085             thr->th.th_teams_level     = master_th->th.th_teams_level;
1086             thr->th.th_teams_size      = master_th->th.th_teams_size;
1087 #endif
1088             { // Initialize threads' barrier data.
1089                 int b;
1090                 kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1091                 for ( b = 0; b < bs_last_barrier; ++ b ) {
1092                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
1093                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1094 #if USE_DEBUGGER
1095                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1096 #endif
1097                 }; // for b
1098             }
1099         }
1100 
1101 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1102         __kmp_partition_places( team );
1103 #endif
1104 
1105     }
1106 
1107     KMP_MB();
1108 }
1109 
1110 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1111 //
1112 // Propagate any changes to the floating point control registers out to the team.
1113 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1114 // so we don't make changes unless they are needed.
1115 //
1116 inline static void
1117 propagateFPControl(kmp_team_t * team)
1118 {
1119     if ( __kmp_inherit_fp_control ) {
1120         kmp_int16 x87_fpu_control_word;
1121         kmp_uint32 mxcsr;
1122 
1123         // Get master values of FPU control flags (both X87 and vector)
1124         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1125         __kmp_store_mxcsr( &mxcsr );
1126         mxcsr &= KMP_X86_MXCSR_MASK;
1127 
1128         // There is no point looking at t_fp_control_saved here.
1129         // If it is TRUE, we still have to update the values if they are different from those we now have.
1130         // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1131         // that the values in the team are the same as those we have.
1132         // So, this code achieves what we need whether or not t_fp_control_saved is true.
1133         // By checking whether the value needs updating we avoid unnecessary writes that would put the
1134         // cache-line into a written state, causing all threads in the team to have to read it again.
1135         KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1136         KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1137         // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1138         // So we must ensure it is correct.
1139         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1140     }
1141     else {
1142         // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1143         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1144     }
1145 }
1146 
1147 // Do the opposite, setting the hardware registers to the updated values from the team.
1148 inline static void
1149 updateHWFPControl(kmp_team_t * team)
1150 {
1151     if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1152         //
1153         // Only reset the fp control regs if they have been changed in the team
1154         // of the parallel region that we are exiting.
1155         //
1156         kmp_int16 x87_fpu_control_word;
1157         kmp_uint32 mxcsr;
1158         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1159         __kmp_store_mxcsr( &mxcsr );
1160         mxcsr &= KMP_X86_MXCSR_MASK;
1161 
1162         if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1163             __kmp_clear_x87_fpu_status_word();
1164             __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1165         }
1166 
1167         if ( team->t.t_mxcsr != mxcsr ) {
1168             __kmp_load_mxcsr( &team->t.t_mxcsr );
1169         }
1170     }
1171 }
1172 #else
1173 # define propagateFPControl(x) ((void)0)
1174 # define updateHWFPControl(x)  ((void)0)
1175 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1176 
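// A minimal sketch (not part of the runtime) of the "check before you write"
// pattern that KMP_CHECK_UPDATE serves in propagateFPControl() above: writing
// a shared field only when the value actually changes keeps the team's cache
// line from being needlessly invalidated in every other thread. The template
// below is a hypothetical illustration, not the real macro. Compile-guarded.
#if 0
template <typename T>
static inline void check_update(volatile T &dst, T src) {
  if (dst != src) // the read is cheap; an unconditional store would dirty the line
    dst = src;
}
#endif
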
1177 static void
1178 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1179 
1180 /*
1181  * Run a parallel region that has been serialized, so it runs in a team consisting only of the single master thread.
1182  */
1183 void
1184 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1185 {
1186     kmp_info_t *this_thr;
1187     kmp_team_t *serial_team;
1188 
1189     KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1190 
1191     /* Skip all this code for autopar serialized loops since it results in
1192        unacceptable overhead */
1193     if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1194         return;
1195 
1196     if( ! TCR_4( __kmp_init_parallel ) )
1197         __kmp_parallel_initialize();
1198 
1199     this_thr     = __kmp_threads[ global_tid ];
1200     serial_team  = this_thr->th.th_serial_team;
1201 
1202     /* utilize the serialized team held by this thread */
1203     KMP_DEBUG_ASSERT( serial_team );
1204     KMP_MB();
1205 
1206     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1207         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1208         KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1209         KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1210                         global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1211         this_thr->th.th_task_team = NULL;
1212     }
1213 
1214 #if OMP_40_ENABLED
1215     kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1216     if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1217         proc_bind = proc_bind_false;
1218     }
1219     else if ( proc_bind == proc_bind_default ) {
1220         //
1221         // No proc_bind clause was specified, so use the current value
1222         // of proc-bind-var for this parallel region.
1223         //
1224         proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1225     }
1226     //
1227     // Reset for next parallel region
1228     //
1229     this_thr->th.th_set_proc_bind = proc_bind_default;
1230 #endif /* OMP_40_ENABLED */
1231 
1232     if( this_thr->th.th_team != serial_team ) {
1233         // Nested level will be an index in the nested nthreads array
1234         int level = this_thr->th.th_team->t.t_level;
1235 
1236         if( serial_team->t.t_serialized ) {
1237             /* this serial team was already used
1238              * TODO: increase performance by making these locks more specific */
1239             kmp_team_t *new_team;
1240 
1241             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1242 
1243 #if OMPT_SUPPORT
1244             ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1245 #endif
1246 
1247             new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1248 #if OMPT_SUPPORT
1249                                            ompt_parallel_id,
1250 #endif
1251 #if OMP_40_ENABLED
1252                                            proc_bind,
1253 #endif
1254                                            & this_thr->th.th_current_task->td_icvs,
1255                                            0 USE_NESTED_HOT_ARG(NULL) );
1256             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1257             KMP_ASSERT( new_team );
1258 
1259             /* setup new serialized team and install it */
1260             new_team->t.t_threads[0] = this_thr;
1261             new_team->t.t_parent = this_thr->th.th_team;
1262             serial_team = new_team;
1263             this_thr->th.th_serial_team = serial_team;
1264 
1265             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1266                             global_tid, serial_team ) );
1267 
1268 
1269             /* TODO the above breaks the requirement that if we run out of
1270              * resources, then we can still guarantee that serialized teams
1271              * are ok, since we may need to allocate a new one */
1272         } else {
1273             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1274                             global_tid, serial_team ) );
1275         }
1276 
1277         /* we have to initialize this serial team */
1278         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1279         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1280         KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1281         serial_team->t.t_ident         = loc;
1282         serial_team->t.t_serialized    = 1;
1283         serial_team->t.t_nproc         = 1;
1284         serial_team->t.t_parent        = this_thr->th.th_team;
1285         serial_team->t.t_sched         = this_thr->th.th_team->t.t_sched;
1286         this_thr->th.th_team           = serial_team;
1287         serial_team->t.t_master_tid    = this_thr->th.th_info.ds.ds_tid;
1288 
1289         KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d curtask=%p\n",
1290                         global_tid, this_thr->th.th_current_task ) );
1291         KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1292         this_thr->th.th_current_task->td_flags.executing = 0;
1293 
1294         __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1295 
1296         /* TODO: GEH: do the ICVs work for nested serialized teams?  Don't we need an implicit task for
1297            each serialized task represented by team->t.t_serialized? */
1298         copy_icvs(
1299                   & this_thr->th.th_current_task->td_icvs,
1300                   & this_thr->th.th_current_task->td_parent->td_icvs );
1301 
1302         // Thread value exists in the nested nthreads array for the next nested level
1303         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1304             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1305         }
1306 
1307 #if OMP_40_ENABLED
1308         if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1309             this_thr->th.th_current_task->td_icvs.proc_bind
1310                 = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1311         }
1312 #endif /* OMP_40_ENABLED */
1313 
1314 #if USE_DEBUGGER
1315         serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1316 #endif
1317         this_thr->th.th_info.ds.ds_tid = 0;
1318 
1319         /* set thread cache values */
1320         this_thr->th.th_team_nproc     = 1;
1321         this_thr->th.th_team_master    = this_thr;
1322         this_thr->th.th_team_serialized = 1;
1323 
1324         serial_team->t.t_level        = serial_team->t.t_parent->t.t_level + 1;
1325         serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1326 
1327         propagateFPControl (serial_team);
1328 
1329         /* check if we need to allocate dispatch buffers stack */
1330         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1331         if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1332             serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1333                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1334         }
1335         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1336 
1337 #if OMPT_SUPPORT
1338         ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1339         __ompt_team_assign_id(serial_team, ompt_parallel_id);
1340 #endif
1341 
1342         KMP_MB();
1343 
1344     } else {
1345         /* this serialized team is already being used,
1346          * that's fine, just add another nested level */
1347         KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1348         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1349         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1350         ++ serial_team->t.t_serialized;
1351         this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1352 
1353         // Nested level will be an index in the nested nthreads array
1354         int level = this_thr->th.th_team->t.t_level;
1355         // Thread value exists in the nested nthreads array for the next nested level
1356         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1357             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1358         }
1359         serial_team->t.t_level++;
1360         KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1361                         global_tid, serial_team, serial_team->t.t_level ) );
1362 
1363         /* allocate/push dispatch buffers stack */
1364         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1365         {
1366             dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1367                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1368             disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1369             serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1370         }
1371         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1372 
1373         KMP_MB();
1374     }
1375 
1376     if ( __kmp_env_consistency_check )
1377         __kmp_push_parallel( global_tid, NULL );
1378 
1379 }
1380 
1381 /* most of the work for a fork */
1382 /* return true if we really went parallel, false if serialized */
1383 int
1384 __kmp_fork_call(
1385     ident_t   * loc,
1386     int         gtid,
1387     enum fork_context_e  call_context, // Intel, GNU, ...
1388     kmp_int32   argc,
1389 #if OMPT_SUPPORT
1390     void       *unwrapped_task,
1391 #endif
1392     microtask_t microtask,
1393     launch_t    invoker,
1394 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1395 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1396     va_list   * ap
1397 #else
1398     va_list     ap
1399 #endif
1400     )
1401 {
1402     void          **argv;
1403     int             i;
1404     int             master_tid;
1405     int             master_this_cons;
1406     kmp_team_t     *team;
1407     kmp_team_t     *parent_team;
1408     kmp_info_t     *master_th;
1409     kmp_root_t     *root;
1410     int             nthreads;
1411     int             master_active;
1412     int             master_set_numthreads;
1413     int             level;
1414 #if OMP_40_ENABLED
1415     int             active_level;
1416     int             teams_level;
1417 #endif
1418 #if KMP_NESTED_HOT_TEAMS
1419     kmp_hot_team_ptr_t **p_hot_teams;
1420 #endif
1421     { // KMP_TIME_BLOCK
1422     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1423     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1424 
1425     KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1426     if ( __kmp_stkpadding > 0 &&  __kmp_root[gtid] != NULL ) {
1427         /* Some systems prefer the stack for the root thread(s) to start with */
1428         /* some gap from the parent stack to prevent false sharing. */
1429         void *dummy = KMP_ALLOCA(__kmp_stkpadding);
        /* The two lines below keep this allocation from being optimized out */
1431         if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1432             __kmp_stkpadding += (short)((kmp_int64)dummy);
1433     }
1434 
1435     /* initialize if needed */
1436     KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1437     if( ! TCR_4(__kmp_init_parallel) )
1438         __kmp_parallel_initialize();
1439 
1440     /* setup current data */
1441     master_th     = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1442     parent_team   = master_th->th.th_team;
1443     master_tid    = master_th->th.th_info.ds.ds_tid;
1444     master_this_cons = master_th->th.th_local.this_construct;
1445     root          = master_th->th.th_root;
1446     master_active = root->r.r_active;
1447     master_set_numthreads = master_th->th.th_set_nproc;
1448 
1449 #if OMPT_SUPPORT
1450     ompt_parallel_id_t ompt_parallel_id;
1451     ompt_task_id_t ompt_task_id;
1452     ompt_frame_t *ompt_frame;
1453     ompt_task_id_t my_task_id;
1454     ompt_parallel_id_t my_parallel_id;
1455 
1456     if (ompt_enabled) {
1457         ompt_parallel_id = __ompt_parallel_id_new(gtid);
1458         ompt_task_id = __ompt_get_task_id_internal(0);
1459         ompt_frame = __ompt_get_task_frame_internal(0);
1460     }
1461 #endif
1462 
1463     // Nested level will be an index in the nested nthreads array
1464     level         = parent_team->t.t_level;
    active_level  = parent_team->t.t_active_level; // used to launch non-serialized teams even if nesting is not allowed
1466 #if OMP_40_ENABLED
    teams_level    = master_th->th.th_teams_level; // needed to check for nesting inside a teams construct
1468 #endif
1469 #if KMP_NESTED_HOT_TEAMS
1470     p_hot_teams   = &master_th->th.th_hot_teams;
1471     if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1472         *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1473                 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1474         (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
1475         (*p_hot_teams)[0].hot_team_nth = 1; // it is either actual or not needed (when active_level > 0)
1476     }
1477 #endif
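    // Note: nested hot teams keep the worker threads of inner parallel regions
    // alive between regions (up to __kmp_hot_teams_max_level levels), avoiding
    // repeated thread allocation; the per-level array above is allocated lazily
    // on the master's first fork.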
1478 
1479 #if OMPT_SUPPORT
1480     if (ompt_enabled &&
1481         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1482         int team_size = master_set_numthreads;
1483 
1484         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1485             ompt_task_id, ompt_frame, ompt_parallel_id,
1486             team_size, unwrapped_task, OMPT_INVOKER(call_context));
1487     }
1488 #endif
1489 
1490     master_th->th.th_ident = loc;
1491 
1492 #if OMP_40_ENABLED
1493     if ( master_th->th.th_teams_microtask &&
1494          ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1495         // AC: This is start of parallel that is nested inside teams construct.
1496         //     The team is actual (hot), all workers are ready at the fork barrier.
1497         //     No lock needed to initialize the team a bit, then free workers.
1498         parent_team->t.t_ident = loc;
1499         __kmp_alloc_argv_entries( argc, parent_team, TRUE );
1500         parent_team->t.t_argc  = argc;
1501         argv = (void**)parent_team->t.t_argv;
1502         for( i=argc-1; i >= 0; --i )
1503 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1504 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1505             *argv++ = va_arg( *ap, void * );
1506 #else
1507             *argv++ = va_arg( ap, void * );
1508 #endif
        /* Increment our nested depth level, but do not increase the serialization count */
1510         if ( parent_team == master_th->th.th_serial_team ) {
1511             // AC: we are in serialized parallel
1512             __kmpc_serialized_parallel(loc, gtid);
1513             KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
            parent_team->t.t_serialized--; // AC: need this so that inquiry functions
                                           //     work correctly; will be restored at join time
1516 
1517 #if OMPT_SUPPORT
1518             void *dummy;
1519             void **exit_runtime_p;
1520 
1521             ompt_lw_taskteam_t lw_taskteam;
1522 
1523             if (ompt_enabled) {
1524                 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1525                     unwrapped_task, ompt_parallel_id);
1526                 lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1527                 exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1528 
1529                 __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1530 
1531 #if OMPT_TRACE
1532                 /* OMPT implicit task begin */
1533                 my_task_id = lw_taskteam.ompt_task_info.task_id;
1534                 my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1535                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1536                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1537                         my_parallel_id, my_task_id);
1538                 }
1539 #endif
1540 
1541                 /* OMPT state */
1542                 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1543             } else {
1544                 exit_runtime_p = &dummy;
1545             }
1546 #endif
1547 
1548             {
1549                 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1550                 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1551                 __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1552 #if OMPT_SUPPORT
1553                                         , exit_runtime_p
1554 #endif
1555                                         );
1556             }
1557 
1558 #if OMPT_SUPPORT
1559             *exit_runtime_p = NULL;
1560             if (ompt_enabled) {
1561 #if OMPT_TRACE
1562                 lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1563 
1564                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1565                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1566                         ompt_parallel_id, ompt_task_id);
1567                 }
1568 
1569                 __ompt_lw_taskteam_unlink(master_th);
                // reset/clear the task id only after unlinking the task
1571                 lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1572 #endif
1573 
1574                 if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1575                     ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1576                         ompt_parallel_id, ompt_task_id,
1577                         OMPT_INVOKER(call_context));
1578                 }
1579                 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1580             }
1581 #endif
1582             return TRUE;
1583         }
1584 
1585         parent_team->t.t_pkfn  = microtask;
1586 #if OMPT_SUPPORT
1587         parent_team->t.ompt_team_info.microtask = unwrapped_task;
1588 #endif
1589         parent_team->t.t_invoke = invoker;
1590         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1591         parent_team->t.t_active_level ++;
1592         parent_team->t.t_level ++;
1593 
1594         /* Change number of threads in the team if requested */
        if ( master_set_numthreads ) {   // The parallel region has a num_threads clause
            if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
                // AC: can only reduce the number of threads dynamically, cannot increase it
1598                 kmp_info_t **other_threads = parent_team->t.t_threads;
1599                 parent_team->t.t_nproc = master_set_numthreads;
1600                 for ( i = 0; i < master_set_numthreads; ++i ) {
1601                     other_threads[i]->th.th_team_nproc = master_set_numthreads;
1602                 }
1603                 // Keep extra threads hot in the team for possible next parallels
1604             }
1605             master_th->th.th_set_nproc = 0;
1606         }
1607 
#if USE_DEBUGGER
        if ( __kmp_debugging ) {    // Let debugger override number of threads.
            int nth = __kmp_omp_num_threads( loc );
            if ( nth > 0 ) {        // 0 means debugger does not want to change number of threads.
                master_set_numthreads = nth;
            }; // if
        }; // if
#endif
1616 
1617         KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1618         __kmp_internal_fork( loc, gtid, parent_team );
1619         KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1620 
1621         /* Invoke microtask for MASTER thread */
1622         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1623                     gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1624 
1625         {
1626             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1627             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1628             if (! parent_team->t.t_invoke( gtid )) {
1629                 KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1630             }
1631         }
1632         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1633             gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1634         KMP_MB();       /* Flush all pending memory write invalidates.  */
1635 
1636         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1637 
1638         return TRUE;
1639     } // Parallel closely nested in teams construct
1640 #endif /* OMP_40_ENABLED */
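
    // Illustration (not part of the runtime, a sketch of user code): the branch
    // above handles a parallel region nested directly inside a teams construct,
    // where the existing (hot) team created for teams is reused, e.g.:
    //
    //     #pragma omp teams num_teams(2) thread_limit(8)
    //     {
    //         #pragma omp parallel   // taken care of above: the hot team is
    //         { }                    // re-initialized and its workers released
    //     }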
1641 
1642 #if KMP_DEBUG
1643     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1644         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1645     }
1646 #endif
1647 
1648     if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1649         nthreads = 1;
1650     } else {
1651 #if OMP_40_ENABLED
1652         int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1653 #endif
1654         nthreads = master_set_numthreads ?
1655             master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1656 
        // Check if we need to take the forkjoin lock (not needed for a serialized parallel region outside of a teams construct).
        // This code was moved here from __kmp_reserve_threads() to speed up nested serialized parallel regions.
1659         if (nthreads > 1) {
1660             if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1661 #if OMP_40_ENABLED
1662                 && !enter_teams
1663 #endif /* OMP_40_ENABLED */
1664             ) ) || ( __kmp_library == library_serial ) ) {
1665                 KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1666                                 gtid, nthreads ));
1667                 nthreads = 1;
1668             }
1669         }
1670         if ( nthreads > 1 ) {
1671             /* determine how many new threads we can use */
1672             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1673 
1674             nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1675 #if OMP_40_ENABLED
/* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
   but each can only have 1 thread if nesting is disabled. If teams is called from a serial region,
   then the teams and their threads should be created regardless of the nesting setting. */
1679                                          , enter_teams
1680 #endif /* OMP_40_ENABLED */
1681                                          );
1682             if ( nthreads == 1 ) {
1683                 // Free lock for single thread execution here;
1684                 // for multi-thread execution it will be freed later
1685                 // after team of threads created and initialized
1686                 __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1687             }
1688         }
1689     }
1690     KMP_DEBUG_ASSERT( nthreads > 0 );
1691 
1692     /* If we temporarily changed the set number of threads then restore it now */
1693     master_th->th.th_set_nproc = 0;
1694 
1695     /* create a serialized parallel region? */
1696     if ( nthreads == 1 ) {
1697         /* josh todo: hypothetical question: what do we do for OS X*? */
1698 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1699         void *   args[ argc ];
1700 #else
1701         void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1702 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1703 
1704         KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1705 
1706         __kmpc_serialized_parallel(loc, gtid);
1707 
1708         if ( call_context == fork_context_intel ) {
1709             /* TODO this sucks, use the compiler itself to pass args! :) */
1710             master_th->th.th_serial_team->t.t_ident = loc;
1711 #if OMP_40_ENABLED
1712             if ( !ap ) {
1713                 // revert change made in __kmpc_serialized_parallel()
1714                 master_th->th.th_serial_team->t.t_level--;
1715                 // Get args from parent team for teams construct
1716 
1717 #if OMPT_SUPPORT
1718                 void *dummy;
1719                 void **exit_runtime_p;
1720 
1721                 ompt_lw_taskteam_t lw_taskteam;
1722 
1723                 if (ompt_enabled) {
1724                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1725                         unwrapped_task, ompt_parallel_id);
1726                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1727                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1728 
1729                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1730 
1731 #if OMPT_TRACE
1732                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1733                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1734                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1735                             ompt_parallel_id, my_task_id);
1736                     }
1737 #endif
1738 
1739                     /* OMPT state */
1740                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1741                 } else {
1742                     exit_runtime_p = &dummy;
1743                 }
1744 #endif
1745 
1746                 {
1747                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1748                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1749                     __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1750 #if OMPT_SUPPORT
1751                         , exit_runtime_p
1752 #endif
1753                     );
1754                 }
1755 
1756 #if OMPT_SUPPORT
1757                 *exit_runtime_p = NULL;
1758                 if (ompt_enabled) {
1759                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1760 
1761 #if OMPT_TRACE
1762                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1763                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1764                             ompt_parallel_id, ompt_task_id);
1765                     }
1766 #endif
1767 
1768                     __ompt_lw_taskteam_unlink(master_th);
                    // reset/clear the task id only after unlinking the task
1770                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1771 
1772                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1773                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1774                             ompt_parallel_id, ompt_task_id,
1775                             OMPT_INVOKER(call_context));
1776                     }
1777                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1778                 }
1779 #endif
1780             } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1781                 KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1782                 team = master_th->th.th_team;
1783                 //team->t.t_pkfn = microtask;
1784                 team->t.t_invoke = invoker;
1785                 __kmp_alloc_argv_entries( argc, team, TRUE );
1786                 team->t.t_argc = argc;
1787                 argv = (void**) team->t.t_argv;
1788                 if ( ap ) {
1789                     for( i=argc-1; i >= 0; --i )
1790 // TODO: revert workaround for Intel(R) 64 tracker #96
1791 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1792                         *argv++ = va_arg( *ap, void * );
1793 # else
1794                         *argv++ = va_arg( ap, void * );
1795 # endif
1796                 } else {
1797                     for( i=0; i < argc; ++i )
1798                         // Get args from parent team for teams construct
1799                         argv[i] = parent_team->t.t_argv[i];
1800                 }
1801                 // AC: revert change made in __kmpc_serialized_parallel()
1802                 //     because initial code in teams should have level=0
1803                 team->t.t_level--;
1804                 // AC: call special invoker for outer "parallel" of the teams construct
1805                 {
1806                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1807                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1808                     invoker(gtid);
1809                 }
1810             } else {
1811 #endif /* OMP_40_ENABLED */
1812                 argv = args;
1813                 for( i=argc-1; i >= 0; --i )
1814 // TODO: revert workaround for Intel(R) 64 tracker #96
1815 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1816                     *argv++ = va_arg( *ap, void * );
1817 #else
1818                     *argv++ = va_arg( ap, void * );
1819 #endif
1820                 KMP_MB();
1821 
1822 #if OMPT_SUPPORT
1823                 void *dummy;
1824                 void **exit_runtime_p;
1825 
1826                 ompt_lw_taskteam_t lw_taskteam;
1827 
1828                 if (ompt_enabled) {
1829                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1830                         unwrapped_task, ompt_parallel_id);
1831                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1832                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1833 
1834                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1835 
1836 #if OMPT_TRACE
1837                     /* OMPT implicit task begin */
1838                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1839                     my_parallel_id = ompt_parallel_id;
1840                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1841                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1842                             my_parallel_id, my_task_id);
1843                     }
1844 #endif
1845 
1846                     /* OMPT state */
1847                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1848                 } else {
1849                     exit_runtime_p = &dummy;
1850                 }
1851 #endif
1852 
1853                 {
1854                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1855                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1856                     __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1857 #if OMPT_SUPPORT
1858                         , exit_runtime_p
1859 #endif
1860                     );
1861                 }
1862 
1863 #if OMPT_SUPPORT
1864                 *exit_runtime_p = NULL;
1865                 if (ompt_enabled) {
1866 #if OMPT_TRACE
1867                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1868 
1869                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1870                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1871                             my_parallel_id, my_task_id);
1872                     }
1873 #endif
1874 
1875                     __ompt_lw_taskteam_unlink(master_th);
                    // reset/clear the task id only after unlinking the task
1877                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1878 
1879                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1880                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1881                             ompt_parallel_id, ompt_task_id,
1882                             OMPT_INVOKER(call_context));
1883                     }
1884                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1885                 }
1886 #endif
1887 #if OMP_40_ENABLED
1888             }
1889 #endif /* OMP_40_ENABLED */
1890         }
1891         else if ( call_context == fork_context_gnu ) {
1892 #if OMPT_SUPPORT
1893             ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1894                 __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1895             __ompt_lw_taskteam_init(lwt, master_th, gtid,
1896                 unwrapped_task, ompt_parallel_id);
1897 
1898             lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1899             lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1900             __ompt_lw_taskteam_link(lwt, master_th);
1901 #endif
1902 
1903             // we were called from GNU native code
1904             KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1905             return FALSE;
1906         }
1907         else {
1908             KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1909         }
1910 
1911 
1912         KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1913         KMP_MB();
1914         return FALSE;
1915     }
1916 
    // GEH: only modify the executing flag in the non-serialized case;
    //      the serialized case is handled in __kmpc_serialized_parallel
1919     KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1920                   parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1921                   master_th->th.th_current_task->td_icvs.max_active_levels ) );
1922     // TODO: GEH - cannot do this assertion because root thread not set up as executing
1923     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1924     master_th->th.th_current_task->td_flags.executing = 0;
1925 
1926 #if OMP_40_ENABLED
1927     if ( !master_th->th.th_teams_microtask || level > teams_level )
1928 #endif /* OMP_40_ENABLED */
1929     {
1930         /* Increment our nested depth level */
1931         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1932     }
1933 
1934     // See if we need to make a copy of the ICVs.
1935     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1936     if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1937         nthreads_icv = __kmp_nested_nth.nth[level+1];
1938     }
1939     else {
1940         nthreads_icv = 0;  // don't update
1941     }
1942 
1943 #if OMP_40_ENABLED
1944     // Figure out the proc_bind_policy for the new team.
1945     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1946     kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1947     if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1948         proc_bind = proc_bind_false;
1949     }
1950     else {
1951         if (proc_bind == proc_bind_default) {
1952             // No proc_bind clause specified; use current proc-bind-var for this parallel region
1953             proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1954         }
        /* else: The proc_bind policy was specified explicitly on the parallel clause. This
           overrides proc-bind-var for this parallel region, but does not change proc-bind-var itself. */
1957         // Figure the value of proc-bind-var for the child threads.
1958         if ((level+1 < __kmp_nested_proc_bind.used)
1959             && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1960             proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1961         }
1962     }
1963 
1964     // Reset for next parallel region
1965     master_th->th.th_set_proc_bind = proc_bind_default;
1966 #endif /* OMP_40_ENABLED */
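    // Illustration: __kmp_nested_proc_bind is typically populated from a comma-separated
    // OMP_PROC_BIND list, e.g. OMP_PROC_BIND=spread,close; proc_bind selects the binding
    // used for this fork, while proc_bind_icv (when not proc_bind_default) becomes the
    // proc-bind-var ICV of the child threads.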
1967 
1968     if ((nthreads_icv > 0)
1969 #if OMP_40_ENABLED
1970         || (proc_bind_icv != proc_bind_default)
1971 #endif /* OMP_40_ENABLED */
1972         ) {
1973         kmp_internal_control_t new_icvs;
1974         copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1975         new_icvs.next = NULL;
1976         if (nthreads_icv > 0) {
1977             new_icvs.nproc = nthreads_icv;
1978         }
1979 
1980 #if OMP_40_ENABLED
1981         if (proc_bind_icv != proc_bind_default) {
1982             new_icvs.proc_bind = proc_bind_icv;
1983         }
1984 #endif /* OMP_40_ENABLED */
1985 
1986         /* allocate a new parallel team */
1987         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1988         team = __kmp_allocate_team(root, nthreads, nthreads,
1989 #if OMPT_SUPPORT
1990                                    ompt_parallel_id,
1991 #endif
1992 #if OMP_40_ENABLED
1993                                    proc_bind,
1994 #endif
1995                                    &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1996     } else {
1997         /* allocate a new parallel team */
1998         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1999         team = __kmp_allocate_team(root, nthreads, nthreads,
2000 #if OMPT_SUPPORT
2001                                    ompt_parallel_id,
2002 #endif
2003 #if OMP_40_ENABLED
2004                                    proc_bind,
2005 #endif
2006                                    &master_th->th.th_current_task->td_icvs, argc
2007                                    USE_NESTED_HOT_ARG(master_th) );
2008     }
2009     KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2010 
2011     /* setup the new team */
2012     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2013     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2014     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2015     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2016     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2017 #if OMPT_SUPPORT
2018     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2019 #endif
2020     KMP_CHECK_UPDATE(team->t.t_invoke, invoker);  /* TODO move this to root, maybe */
2021     // TODO: parent_team->t.t_level == INT_MAX ???
2022 #if OMP_40_ENABLED
2023     if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2024 #endif /* OMP_40_ENABLED */
2025         int new_level = parent_team->t.t_level + 1;
2026         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2027         new_level = parent_team->t.t_active_level + 1;
2028         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2029 #if OMP_40_ENABLED
2030     } else {
2031         // AC: Do not increase parallel level at start of the teams construct
2032         int new_level = parent_team->t.t_level;
2033         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2034         new_level = parent_team->t.t_active_level;
2035         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2036     }
2037 #endif /* OMP_40_ENABLED */
2038     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2039     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || team->t.t_sched.chunk != new_sched.chunk)
2040         team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
2041 
2042 #if OMP_40_ENABLED
2043     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2044 #endif
2045 
2046     // Update the floating point rounding in the team if required.
2047     propagateFPControl(team);
2048 
2049     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
        // Set the master's task team to the new team's task team. Unless this is a hot team, it should be NULL.
2051 #if 0
2052         // Patch out an assertion that trips while the runtime seems to operate correctly.
2053         // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2054         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2055 #endif
2056         KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2057                       __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2058                       parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2059 
2060         if ( active_level || master_th->th.th_task_team ) {
2061             // Take a memo of master's task_state
2062             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2063             if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2064                 kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2065                 kmp_uint8 *old_stack, *new_stack;
2066                 kmp_uint32 i;
2067                 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2068                 for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2069                     new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2070                 }
2071                 for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2072                     new_stack[i] = 0;
2073                 }
2074                 old_stack = master_th->th.th_task_state_memo_stack;
2075                 master_th->th.th_task_state_memo_stack = new_stack;
2076                 master_th->th.th_task_state_stack_sz = new_size;
2077                 __kmp_free(old_stack);
2078             }
2079             // Store master's task_state on stack
2080             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2081             master_th->th.th_task_state_top++;
2082 #if KMP_NESTED_HOT_TEAMS
2083             if (team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if nested hot team
2084                 master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2085             }
2086             else {
2087 #endif
2088                 master_th->th.th_task_state = 0;
2089 #if KMP_NESTED_HOT_TEAMS
2090             }
2091 #endif
2092         }
2093 #if !KMP_NESTED_HOT_TEAMS
2094         KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2095 #endif
2096     }
2097 
2098     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2099                 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2100     KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2101                       ( team->t.t_master_tid == 0 &&
2102                         ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2103     KMP_MB();
2104 
2105     /* now, setup the arguments */
2106     argv = (void**)team->t.t_argv;
2107 #if OMP_40_ENABLED
2108     if ( ap ) {
2109 #endif /* OMP_40_ENABLED */
2110         for ( i=argc-1; i >= 0; --i ) {
2111 // TODO: revert workaround for Intel(R) 64 tracker #96
2112 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2113             void *new_argv = va_arg(*ap, void *);
2114 #else
2115             void *new_argv = va_arg(ap, void *);
2116 #endif
2117             KMP_CHECK_UPDATE(*argv, new_argv);
2118             argv++;
2119         }
2120 #if OMP_40_ENABLED
2121     } else {
2122         for ( i=0; i < argc; ++i ) {
2123             // Get args from parent team for teams construct
2124             KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2125         }
2126     }
2127 #endif /* OMP_40_ENABLED */
2128 
2129     /* now actually fork the threads */
2130     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2131     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2132         root->r.r_active = TRUE;
2133 
2134     __kmp_fork_team_threads( root, team, master_th, gtid );
2135     __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2136 
2137 #if OMPT_SUPPORT
2138     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2139 #endif
2140 
2141     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2142 
2143 #if USE_ITT_BUILD
2144     if ( team->t.t_active_level == 1 // only report frames at level 1
2145 # if OMP_40_ENABLED
2146         && !master_th->th.th_teams_microtask // not in teams construct
2147 # endif /* OMP_40_ENABLED */
2148     ) {
2149 #if USE_ITT_NOTIFY
2150         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2151              ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2152         {
2153             kmp_uint64 tmp_time = 0;
2154             if ( __itt_get_timestamp_ptr )
2155                 tmp_time = __itt_get_timestamp();
2156             // Internal fork - report frame begin
2157             master_th->th.th_frame_time  = tmp_time;
2158             if ( __kmp_forkjoin_frames_mode == 3 )
2159                 team->t.t_region_time = tmp_time;
2160         } else // only one notification scheme (either "submit" or "forking/joined", not both)
2161 #endif /* USE_ITT_NOTIFY */
2162         if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2163              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2164         { // Mark start of "parallel" region for VTune.
2165             __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2166         }
2167     }
2168 #endif /* USE_ITT_BUILD */
2169 
2170     /* now go on and do the work */
2171     KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2172     KMP_MB();
2173     KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2174                   root, team, master_th, gtid));
2175 
2176 #if USE_ITT_BUILD
2177     if ( __itt_stack_caller_create_ptr ) {
2178         team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2179     }
2180 #endif /* USE_ITT_BUILD */
2181 
2182 #if OMP_40_ENABLED
    if ( ap )   // AC: skip __kmp_internal_fork for the teams construct; let only the master threads execute
2184 #endif /* OMP_40_ENABLED */
2185     {
2186         __kmp_internal_fork( loc, gtid, team );
2187         KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2188                       root, team, master_th, gtid));
2189     }
2190 
2191     if (call_context == fork_context_gnu) {
2192         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2193         return TRUE;
2194     }
2195 
2196     /* Invoke microtask for MASTER thread */
2197     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2198                 gtid, team->t.t_id, team->t.t_pkfn ) );
2199     }  // END of timer KMP_fork_call block
2200 
2201     {
2202         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2203         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2204         if (! team->t.t_invoke( gtid )) {
2205             KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2206         }
2207     }
2208     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2209         gtid, team->t.t_id, team->t.t_pkfn ) );
2210     KMP_MB();       /* Flush all pending memory write invalidates.  */
2211 
2212     KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2213 
2214 #if OMPT_SUPPORT
2215     if (ompt_enabled) {
2216         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2217     }
2218 #endif
2219 
2220     return TRUE;
2221 }
2222 
2223 #if OMPT_SUPPORT
2224 static inline void
2225 __kmp_join_restore_state(
2226     kmp_info_t *thread,
2227     kmp_team_t *team)
2228 {
2229     // restore state outside the region
2230     thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2231         ompt_state_work_serial : ompt_state_work_parallel);
2232 }
2233 
2234 static inline void
2235 __kmp_join_ompt(
2236     kmp_info_t *thread,
2237     kmp_team_t *team,
2238     ompt_parallel_id_t parallel_id,
2239     fork_context_e fork_context)
2240 {
2241     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2242     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2243         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2244             parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2245     }
2246 
2247     task_info->frame.reenter_runtime_frame = NULL;
2248     __kmp_join_restore_state(thread,team);
2249 }
2250 #endif
2251 
2252 void
2253 __kmp_join_call(ident_t *loc, int gtid
2254 #if OMPT_SUPPORT
2255                , enum fork_context_e fork_context
2256 #endif
2257 #if OMP_40_ENABLED
2258                , int exit_teams
2259 #endif /* OMP_40_ENABLED */
2260 )
2261 {
2262     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2263     kmp_team_t     *team;
2264     kmp_team_t     *parent_team;
2265     kmp_info_t     *master_th;
2266     kmp_root_t     *root;
2267     int             master_active;
2268     int             i;
2269 
2270     KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2271 
2272     /* setup current data */
2273     master_th     = __kmp_threads[ gtid ];
2274     root          = master_th->th.th_root;
2275     team          = master_th->th.th_team;
2276     parent_team   = team->t.t_parent;
2277 
2278     master_th->th.th_ident = loc;
2279 
2280 #if OMPT_SUPPORT
2281     if (ompt_enabled) {
2282         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2283     }
2284 #endif
2285 
2286 #if KMP_DEBUG
2287     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2288         KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2289                          __kmp_gtid_from_thread( master_th ), team,
2290                          team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2291         KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2292     }
2293 #endif
2294 
2295     if( team->t.t_serialized ) {
2296 #if OMP_40_ENABLED
2297         if ( master_th->th.th_teams_microtask ) {
2298             // We are in teams construct
2299             int level = team->t.t_level;
2300             int tlevel = master_th->th.th_teams_level;
2301             if ( level == tlevel ) {
2302                 // AC: we haven't incremented it earlier at start of teams construct,
2303                 //     so do it here - at the end of teams construct
2304                 team->t.t_level++;
2305             } else if ( level == tlevel + 1 ) {
2306                 // AC: we are exiting parallel inside teams, need to increment serialization
2307                 //     in order to restore it in the next call to __kmpc_end_serialized_parallel
2308                 team->t.t_serialized++;
2309             }
2310         }
2311 #endif /* OMP_40_ENABLED */
2312         __kmpc_end_serialized_parallel( loc, gtid );
2313 
2314 #if OMPT_SUPPORT
2315         if (ompt_enabled) {
2316             __kmp_join_restore_state(master_th, parent_team);
2317         }
2318 #endif
2319 
2320         return;
2321     }
2322 
2323     master_active = team->t.t_master_active;
2324 
2325 #if OMP_40_ENABLED
2326     if (!exit_teams)
2327 #endif /* OMP_40_ENABLED */
2328     {
        // AC: No barrier for internal teams at exit from the teams construct.
        //     But there is a barrier for the external team (league).
2331         __kmp_internal_join( loc, gtid, team );
2332     }
2333 #if OMP_40_ENABLED
2334     else {
2335         master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2336     }
2337 #endif /* OMP_40_ENABLED */
2338 
2339     KMP_MB();
2340 
2341 #if OMPT_SUPPORT
2342     ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2343 #endif
2344 
2345 #if USE_ITT_BUILD
2346     if ( __itt_stack_caller_create_ptr ) {
2347         __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2348     }
2349 
2350     // Mark end of "parallel" region for VTune.
2351     if ( team->t.t_active_level == 1
2352 # if OMP_40_ENABLED
2353         && !master_th->th.th_teams_microtask /* not in teams construct */
2354 # endif /* OMP_40_ENABLED */
2355     ) {
2356         master_th->th.th_ident = loc;
2357         // only one notification scheme (either "submit" or "forking/joined", not both)
2358         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2359             __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2360                                     0, loc, master_th->th.th_team_nproc, 1 );
2361         else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2362             ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2363             __kmp_itt_region_joined( gtid );
2364     } // active_level == 1
2365 #endif /* USE_ITT_BUILD */
2366 
2367 #if OMP_40_ENABLED
2368     if ( master_th->th.th_teams_microtask &&
2369          !exit_teams &&
2370          team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2371          team->t.t_level == master_th->th.th_teams_level + 1 ) {
        // AC: We need to leave the team structure intact at the end of a parallel
        //     region inside the teams construct, so that the same (hot) team is
        //     reused by the next parallel region; only adjust the nesting levels.
2375 
2376         /* Decrement our nested depth level */
2377         team->t.t_level --;
2378         team->t.t_active_level --;
2379         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2380 
2381         /* Restore number of threads in the team if needed */
2382         if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2383             int old_num = master_th->th.th_team_nproc;
2384             int new_num = master_th->th.th_teams_size.nth;
2385             kmp_info_t **other_threads = team->t.t_threads;
2386             team->t.t_nproc = new_num;
2387             for ( i = 0; i < old_num; ++i ) {
2388                 other_threads[i]->th.th_team_nproc = new_num;
2389             }
            // Adjust the state of the previously unused threads of the team
2391             for ( i = old_num; i < new_num; ++i ) {
2392                 // Re-initialize thread's barrier data.
2393                 int b;
2394                 kmp_balign_t * balign = other_threads[i]->th.th_bar;
2395                 for ( b = 0; b < bs_last_barrier; ++ b ) {
2396                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
2397                     KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2398 #if USE_DEBUGGER
2399                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2400 #endif
2401                 }
2402                 if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2403                     // Synchronize thread's task state
2404                     other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2405                 }
2406             }
2407         }
2408 
2409 #if OMPT_SUPPORT
2410         if (ompt_enabled) {
2411             __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2412         }
2413 #endif
2414 
2415         return;
2416     }
2417 #endif /* OMP_40_ENABLED */
2418 
2419     /* do cleanup and restore the parent team */
    master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2421     master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2422 
2423     master_th->th.th_dispatch =
2424                 & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2425 
2426     /* jc: The following lock has instructions with REL and ACQ semantics,
2427        separating the parallel user code called in this parallel region
2428        from the serial user code called after this function returns.
2429     */
2430     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2431 
2432 #if OMP_40_ENABLED
2433     if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2434 #endif /* OMP_40_ENABLED */
2435     {
2436         /* Decrement our nested depth level */
2437         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2438     }
2439     KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2440 
2441 #if OMPT_SUPPORT && OMPT_TRACE
2442     if(ompt_enabled){
2443         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2444         if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2445              ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2446                parallel_id, task_info->task_id);
2447         }
2448         task_info->frame.exit_runtime_frame = NULL;
2449         task_info->task_id = 0;
2450     }
2451 #endif
2452 
2453     KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2454                    0, master_th, team ) );
2455     __kmp_pop_current_task_from_thread( master_th );
2456 
2457 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2458     //
2459     // Restore master thread's partition.
2460     //
2461     master_th->th.th_first_place = team->t.t_first_place;
2462     master_th->th.th_last_place = team->t.t_last_place;
2463 #endif /* OMP_40_ENABLED */
2464 
2465     updateHWFPControl (team);
2466 
2467     if ( root->r.r_active != master_active )
2468         root->r.r_active = master_active;
2469 
2470     __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2471 
    /* This race was fun to find. Make sure the following is inside the critical
     * region; otherwise assertions may fail occasionally since the old team
     * may be reallocated and the hierarchy appears inconsistent. It is
     * actually safe to run and won't cause any bugs, but will cause those
     * assertion failures. It's only one dereference and assignment, so we might
     * as well keep it in the critical region. */
2478     master_th->th.th_team        =   parent_team;
2479     master_th->th.th_team_nproc  =   parent_team->t.t_nproc;
2480     master_th->th.th_team_master =   parent_team->t.t_threads[0];
2481     master_th->th.th_team_serialized = parent_team->t.t_serialized;
2482 
2483     /* restore serialized team, if need be */
2484     if( parent_team->t.t_serialized &&
2485         parent_team != master_th->th.th_serial_team &&
2486         parent_team != root->r.r_root_team ) {
2487             __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2488             master_th->th.th_serial_team = parent_team;
2489     }
2490 
2491     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2492         if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2493             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2494             // Remember master's state if we re-use this nested hot team
2495             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2496             --master_th->th.th_task_state_top; // pop
2497             // Now restore state at this level
2498             master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2499         }
2500         // Copy the task team from the parent team to the master thread
2501         master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2502         KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2503                         __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2504     }
2505 
    // TODO: GEH - cannot do this assertion because root thread not set up as executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
    master_th->th.th_current_task->td_flags.executing = 1;
2509 
2510     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2511 
2512 #if OMPT_SUPPORT
2513     if (ompt_enabled) {
2514         __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2515     }
2516 #endif
2517 
2518     KMP_MB();
2519     KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2520 }
2521 
2522 /* ------------------------------------------------------------------------ */
2523 /* ------------------------------------------------------------------------ */
2524 
2525 /* Check whether we should push an internal control record onto the
2526    serial team stack.  If so, do it.  */
2527 void
2528 __kmp_save_internal_controls ( kmp_info_t * thread )
2529 {
2530 
2531     if ( thread->th.th_team != thread->th.th_serial_team ) {
2532         return;
2533     }
2534     if (thread->th.th_team->t.t_serialized > 1) {
2535         int push = 0;
2536 
2537         if (thread->th.th_team->t.t_control_stack_top == NULL) {
2538             push = 1;
2539         } else {
2540             if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2541                  thread->th.th_team->t.t_serialized ) {
2542                 push = 1;
2543             }
2544         }
2545         if (push) {  /* push a record on the serial team's stack */
2546             kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2547 
2548             copy_icvs( control, & thread->th.th_current_task->td_icvs );
2549 
2550             control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2551 
2552             control->next = thread->th.th_team->t.t_control_stack_top;
2553             thread->th.th_team->t.t_control_stack_top = control;
2554         }
2555     }
2556 }
2557 
2558 /* Changes set_nproc */
2559 void
2560 __kmp_set_num_threads( int new_nth, int gtid )
2561 {
2562     kmp_info_t *thread;
2563     kmp_root_t *root;
2564 
2565     KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2566     KMP_DEBUG_ASSERT( __kmp_init_serial );
2567 
2568     if (new_nth < 1)
2569         new_nth = 1;
2570     else if (new_nth > __kmp_max_nth)
2571         new_nth = __kmp_max_nth;
2572 
2573     KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2574     thread = __kmp_threads[gtid];
2575 
2576     __kmp_save_internal_controls( thread );
2577 
2578     set__nproc( thread, new_nth );
2579 
2580     //
2581     // If this omp_set_num_threads() call will cause the hot team size to be
2582     // reduced (in the absence of a num_threads clause), then reduce it now,
2583     // rather than waiting for the next parallel region.
2584     //
2585     root = thread->th.th_root;
2586     if ( __kmp_init_parallel && ( ! root->r.r_active )
2587       && ( root->r.r_hot_team->t.t_nproc > new_nth )
2588 #if KMP_NESTED_HOT_TEAMS
2589       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2590 #endif
2591     ) {
2592         kmp_team_t *hot_team = root->r.r_hot_team;
2593         int f;
2594 
2595         __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2596 
2597         // Release the extra threads we don't need any more.
2598         for ( f = new_nth;  f < hot_team->t.t_nproc; f++ ) {
2599             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2600             if ( __kmp_tasking_mode != tskm_immediate_exec) {
2601                 // When decreasing team size, threads no longer in the team should unref task team.
2602                 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2603             }
2604             __kmp_free_thread( hot_team->t.t_threads[f] );
2605             hot_team->t.t_threads[f] =  NULL;
2606         }
2607         hot_team->t.t_nproc = new_nth;
2608 #if KMP_NESTED_HOT_TEAMS
2609         if( thread->th.th_hot_teams ) {
2610             KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2611             thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2612         }
2613 #endif
2614 
2615         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2616 
2617         //
2618         // Update the t_nproc field in the threads that are still active.
2619         //
2620         for( f=0 ; f < new_nth; f++ ) {
2621             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2622             hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2623         }
        // Special flag to indicate that the team size was changed via an omp_set_num_threads() call
2625         hot_team->t.t_size_changed = -1;
2626     }
2627 }
2628 
2629 /* Changes max_active_levels */
2630 void
2631 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2632 {
2633     kmp_info_t *thread;
2634 
2635     KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2636     KMP_DEBUG_ASSERT( __kmp_init_serial );
2637 
2638     // validate max_active_levels
2639     if( max_active_levels < 0 ) {
2640         KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2641         // We ignore this call if the user has specified a negative value.
2642         // The current setting won't be changed. The last valid setting will be used.
2643         // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2644         KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2645         return;
2646     }
2647     if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2648         // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2649         // We allow a zero value. (implementation defined behavior)
2650     } else {
2651         KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT  );
2652         max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
        // The current upper limit is MAX_INT. (implementation-defined behavior)
        // If the input exceeds the upper limit, we correct the input to the upper limit. (implementation-defined behavior)
        // In fact, the flow should never reach this point as long as the upper limit is MAX_INT.
2656     }
2657     KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2658 
2659     thread = __kmp_threads[ gtid ];
2660 
2661     __kmp_save_internal_controls( thread );
2662 
2663     set__max_active_levels( thread, max_active_levels );
2664 
2665 }
2666 
2667 /* Gets max_active_levels */
2668 int
2669 __kmp_get_max_active_levels( int gtid )
2670 {
2671     kmp_info_t *thread;
2672 
2673     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2674     KMP_DEBUG_ASSERT( __kmp_init_serial );
2675 
2676     thread = __kmp_threads[ gtid ];
2677     KMP_DEBUG_ASSERT( thread->th.th_current_task );
2678     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2679         gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2680     return thread->th.th_current_task->td_icvs.max_active_levels;
2681 }
2682 
2683 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2684 void
2685 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2686 {
2687     kmp_info_t *thread;
2688 //    kmp_team_t *team;
2689 
2690     KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2691     KMP_DEBUG_ASSERT( __kmp_init_serial );
2692 
2693     // Check if the kind parameter is valid, correct if needed.
2694     // Valid parameters should fit in one of two intervals - standard or extended:
2695     //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2696     // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2697     if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2698        ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2699     {
2700         // TODO: Hint needs attention in case we change the default schedule.
2701         __kmp_msg(
2702             kmp_ms_warning,
2703             KMP_MSG( ScheduleKindOutOfRange, kind ),
2704             KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2705             __kmp_msg_null
2706         );
2707         kind = kmp_sched_default;
2708         chunk = 0;         // ignore chunk value in case of bad kind
2709     }
2710 
2711     thread = __kmp_threads[ gtid ];
2712 
2713     __kmp_save_internal_controls( thread );
2714 
2715     if ( kind < kmp_sched_upper_std ) {
2716         if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
            // differentiate static chunked vs. unchunked:
            // the chunk should be invalid to indicate an unchunked schedule (which is the default)
2719             thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2720         } else {
2721             thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2722         }
2723     } else {
2724         //    __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2725         thread->th.th_current_task->td_icvs.sched.r_sched_type =
2726             __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2727     }
2728     if ( kind == kmp_sched_auto ) {
2729         // ignore parameter chunk for schedule auto
2730         thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2731     } else {
2732         thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2733     }
2734 }
2735 
2736 /* Gets def_sched_var ICV values */
2737 void
2738 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2739 {
2740     kmp_info_t     *thread;
2741     enum sched_type th_type;
2742 
2743     KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2744     KMP_DEBUG_ASSERT( __kmp_init_serial );
2745 
2746     thread = __kmp_threads[ gtid ];
2747 
2748     th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
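    // Map the internal scheduling type back to the user-visible schedule kind.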
2749 
2750     switch ( th_type ) {
2751     case kmp_sch_static:
2752     case kmp_sch_static_greedy:
2753     case kmp_sch_static_balanced:
2754         *kind = kmp_sched_static;
2755         *chunk = 0;   // chunk was not set, try to show this fact via zero value
2756         return;
2757     case kmp_sch_static_chunked:
2758         *kind = kmp_sched_static;
2759         break;
2760     case kmp_sch_dynamic_chunked:
2761         *kind = kmp_sched_dynamic;
2762         break;
2763     case kmp_sch_guided_chunked:
2764     case kmp_sch_guided_iterative_chunked:
2765     case kmp_sch_guided_analytical_chunked:
2766         *kind = kmp_sched_guided;
2767         break;
2768     case kmp_sch_auto:
2769         *kind = kmp_sched_auto;
2770         break;
2771     case kmp_sch_trapezoidal:
2772         *kind = kmp_sched_trapezoidal;
2773         break;
2774 #if KMP_STATIC_STEAL_ENABLED
2775     case kmp_sch_static_steal:
2776         *kind = kmp_sched_static_steal;
2777         break;
2778 #endif
2779     default:
2780         KMP_FATAL( UnknownSchedulingType, th_type );
2781     }
2782 
2783     *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2784 }
2785 
2786 int
2787 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2788 
2789     int ii, dd;
2790     kmp_team_t *team;
2791     kmp_info_t *thr;
2792 
2793     KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2794     KMP_DEBUG_ASSERT( __kmp_init_serial );
2795 
2796     // validate level
2797     if( level == 0 ) return 0;
2798     if( level < 0 ) return -1;
2799     thr = __kmp_threads[ gtid ];
2800     team = thr->th.th_team;
2801     ii = team->t.t_level;
2802     if( level > ii ) return -1;
2803 
2804 #if OMP_40_ENABLED
    if( thr->th.th_teams_microtask ) {
        // AC: we are in a teams region where multiple nested teams have the same level
        int tlevel = thr->th.th_teams_level; // the level of the teams construct
        if( level <= tlevel ) { // otherwise the usual algorithm works (it will not touch the teams)
            KMP_DEBUG_ASSERT( ii >= tlevel );
            // AC: since we need to pass through the teams league, we artificially increase ii
            if ( ii == tlevel ) {
                ii += 2; // three teams have the same level
            } else {
                ii ++;   // two teams have the same level
            }
        }
    }
2818 #endif
2819 
2820     if( ii == level ) return __kmp_tid_from_gtid( gtid );
2821 
2822     dd = team->t.t_serialized;
2823     level++;
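    // Walk up the team hierarchy until the requested level is reached. Each serialized
    // nesting level of a team counts as one level here; dd tracks how many serialized
    // levels of the current team are still unconsumed.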
2824     while( ii > level )
2825     {
2826         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2827         {
2828         }
2829         if( ( team->t.t_serialized ) && ( !dd ) ) {
2830             team = team->t.t_parent;
2831             continue;
2832         }
2833         if( ii > level ) {
2834             team = team->t.t_parent;
2835             dd = team->t.t_serialized;
2836             ii--;
2837         }
2838     }
2839 
2840     return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2841 }
2842 
2843 int
2844 __kmp_get_team_size( int gtid, int level ) {
2845 
2846     int ii, dd;
2847     kmp_team_t *team;
2848     kmp_info_t *thr;
2849 
2850     KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2851     KMP_DEBUG_ASSERT( __kmp_init_serial );
2852 
2853     // validate level
2854     if( level == 0 ) return 1;
2855     if( level < 0 ) return -1;
2856     thr = __kmp_threads[ gtid ];
2857     team = thr->th.th_team;
2858     ii = team->t.t_level;
2859     if( level > ii ) return -1;
2860 
2861 #if OMP_40_ENABLED
    if( thr->th.th_teams_microtask ) {
        // AC: we are in a teams region where multiple nested teams have the same level
        int tlevel = thr->th.th_teams_level; // the level of the teams construct
        if( level <= tlevel ) { // otherwise the usual algorithm works (it will not touch the teams)
            KMP_DEBUG_ASSERT( ii >= tlevel );
            // AC: since we need to pass through the teams league, we artificially increase ii
            if ( ii == tlevel ) {
                ii += 2; // three teams have the same level
            } else {
                ii ++;   // two teams have the same level
            }
        }
    }
2875 #endif
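
    // Same hierarchy walk as in __kmp_get_ancestor_thread_num, except that here we only
    // need to locate the team at the requested level and report its size.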
2876 
2877     while( ii > level )
2878     {
2879         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2880         {
2881         }
2882         if( team->t.t_serialized && ( !dd ) ) {
2883             team = team->t.t_parent;
2884             continue;
2885         }
2886         if( ii > level ) {
2887             team = team->t.t_parent;
2888             ii--;
2889         }
2890     }
2891 
2892     return team->t.t_nproc;
2893 }
2894 
2895 kmp_r_sched_t
2896 __kmp_get_schedule_global() {
// This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
// may be changed by kmp_set_defaults independently, so one can obtain the updated schedule here.
2899 
2900     kmp_r_sched_t r_sched;
2901 
    // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
    // __kmp_sched should keep its original value, so that the user can set KMP_SCHEDULE multiple times,
    // and thus have different run-time schedules in different roots (even in OMP 2.5)
2905     if ( __kmp_sched == kmp_sch_static ) {
2906         r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2907     } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2908         r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2909     } else {
2910         r_sched.r_sched_type = __kmp_sched;  // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2911     }
2912 
2913     if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was not ever set)
2914         r_sched.chunk = KMP_DEFAULT_CHUNK;
2915     } else {
2916         r_sched.chunk = __kmp_chunk;
2917     }
2918 
2919     return r_sched;
2920 }
2921 
2922 /* ------------------------------------------------------------------------ */
2923 /* ------------------------------------------------------------------------ */
2924 
2925 
2926 /*
 * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
 * at least argc *t_argv entries for the requested team.
2929  */
2930 static void
2931 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2932 {
2933 
2934     KMP_DEBUG_ASSERT( team );
2935     if( !realloc || argc > team->t.t_max_argc ) {
2936 
2937         KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2938                          team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
        /* if heap space was previously allocated for the args, free it */
2940         if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2941             __kmp_free( (void *) team->t.t_argv );
2942 
2943         if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2944             /* use unused space in the cache line for arguments */
2945             team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2946             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2947                              team->t.t_id, team->t.t_max_argc ));
2948             team->t.t_argv = &team->t.t_inline_argv[0];
2949             if ( __kmp_storage_map ) {
2950                 __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2951                                          &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2952                                          (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2953                                          "team_%d.t_inline_argv",
2954                                          team->t.t_id );
2955             }
2956         } else {
2957             /* allocate space for arguments in the heap */
2958             team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2959                                      KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2960             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2961                              team->t.t_id, team->t.t_max_argc ));
2962             team->t.t_argv     = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2963             if ( __kmp_storage_map ) {
2964                 __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2965                                          sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2966                                          team->t.t_id );
2967             }
2968         }
2969     }
2970 }
2971 
2972 static void
2973 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2974 {
2975     int i;
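    // Teams of at most one thread only ever need two dispatch buffers; larger teams get the full set.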
2976     int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2977     team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2978     team->t.t_disp_buffer = (dispatch_shared_info_t*)
2979         __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2980     team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2981     team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2982     team->t.t_max_nproc = max_nth;
2983 
2984     /* setup dispatch buffers */
2985     for(i = 0 ; i < num_disp_buff; ++i) {
2986         team->t.t_disp_buffer[i].buffer_index = i;
2987 #if OMP_45_ENABLED
2988         team->t.t_disp_buffer[i].doacross_buf_idx = i;
2989 #endif
2990     }
2991 }
2992 
2993 static void
2994 __kmp_free_team_arrays(kmp_team_t *team) {
2995     /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2996     int i;
2997     for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
2998         if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
2999             __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
3000             team->t.t_dispatch[ i ].th_disp_buffer = NULL;
3001         }; // if
3002     }; // for
3003     __kmp_free(team->t.t_threads);
3004     __kmp_free(team->t.t_disp_buffer);
3005     __kmp_free(team->t.t_dispatch);
3006     __kmp_free(team->t.t_implicit_task_taskdata);
3007     team->t.t_threads     = NULL;
3008     team->t.t_disp_buffer = NULL;
3009     team->t.t_dispatch    = NULL;
3010     team->t.t_implicit_task_taskdata = 0;
3011 }
3012 
3013 static void
3014 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
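    // Rebuild the per-team arrays for the new maximum team size. Only the existing thread
    // pointers are preserved; the dispatch buffers and implicit task data are allocated anew.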
3015     kmp_info_t **oldThreads = team->t.t_threads;
3016 
3017     __kmp_free(team->t.t_disp_buffer);
3018     __kmp_free(team->t.t_dispatch);
3019     __kmp_free(team->t.t_implicit_task_taskdata);
3020     __kmp_allocate_team_arrays(team, max_nth);
3021 
3022     KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3023 
3024     __kmp_free(oldThreads);
3025 }
3026 
3027 static kmp_internal_control_t
3028 __kmp_get_global_icvs( void ) {
3029 
3030     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3031 
3032 #if OMP_40_ENABLED
3033     KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3034 #endif /* OMP_40_ENABLED */
3035 
3036     kmp_internal_control_t g_icvs = {
3037       0,                            //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3038       (kmp_int8)__kmp_dflt_nested,            //int nested;               //internal control for nested parallelism (per thread)
3039       (kmp_int8)__kmp_global.g.g_dynamic,                                 //internal control for dynamic adjustment of threads (per thread)
3040       (kmp_int8)__kmp_env_blocktime,          //int bt_set;               //internal control for whether blocktime is explicitly set
3041       __kmp_dflt_blocktime,         //int blocktime;            //internal control for blocktime
3042 #if KMP_USE_MONITOR
3043       __kmp_bt_intervals,           //int bt_intervals;         //internal control for blocktime intervals
3044 #endif
3045       __kmp_dflt_team_nth,          //int nproc;                //internal control for # of threads for next parallel region (per thread)
3046                                     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3047       __kmp_dflt_max_active_levels, //int max_active_levels;    //internal control for max_active_levels
3048       r_sched,                      //kmp_r_sched_t sched;      //internal control for runtime schedule {sched,chunk} pair
3049 #if OMP_40_ENABLED
3050       __kmp_nested_proc_bind.bind_types[0],
3051       __kmp_default_device,
3052 #endif /* OMP_40_ENABLED */
3053       NULL                          //struct kmp_internal_control *next;
3054     };
3055 
3056     return g_icvs;
3057 }
3058 
3059 static kmp_internal_control_t
3060 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3061 
3062     kmp_internal_control_t gx_icvs;
    gx_icvs.serial_nesting_level = 0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3064     copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3065     gx_icvs.next = NULL;
3066 
3067     return gx_icvs;
3068 }
3069 
3070 static void
3071 __kmp_initialize_root( kmp_root_t *root )
3072 {
3073     int           f;
3074     kmp_team_t   *root_team;
3075     kmp_team_t   *hot_team;
3076     int           hot_team_max_nth;
3077     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3078     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3079     KMP_DEBUG_ASSERT( root );
3080     KMP_ASSERT( ! root->r.r_begin );
3081 
3082     /* setup the root state structure */
3083     __kmp_init_lock( &root->r.r_begin_lock );
3084     root->r.r_begin        = FALSE;
3085     root->r.r_active       = FALSE;
3086     root->r.r_in_parallel  = 0;
3087     root->r.r_blocktime    = __kmp_dflt_blocktime;
3088     root->r.r_nested       = __kmp_dflt_nested;
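
    // Each root owns two teams: a size-1 root team in which the root thread executes serial
    // (non-parallel) code, and a hot team that is kept alive and reused for parallel regions
    // forked from this root.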
3089 
3090     /* setup the root team for this task */
3091     /* allocate the root team structure */
3092     KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3093 
3094     root_team =
3095         __kmp_allocate_team(
3096             root,
3097             1,                                                         // new_nproc
3098             1,                                                         // max_nproc
3099 #if OMPT_SUPPORT
3100             0, // root parallel id
3101 #endif
3102 #if OMP_40_ENABLED
3103             __kmp_nested_proc_bind.bind_types[0],
3104 #endif
3105             &r_icvs,
3106             0                                                          // argc
3107             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3108         );
3109 #if USE_DEBUGGER
3110     // Non-NULL value should be assigned to make the debugger display the root team.
3111     TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3112 #endif
3113 
3114     KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3115 
3116     root->r.r_root_team = root_team;
3117     root_team->t.t_control_stack_top = NULL;
3118 
3119     /* initialize root team */
3120     root_team->t.t_threads[0] = NULL;
3121     root_team->t.t_nproc      = 1;
3122     root_team->t.t_serialized = 1;
3123     // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3124     root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3125     root_team->t.t_sched.chunk        = r_sched.chunk;
3126     KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3127                     root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3128 
3129     /* setup the  hot team for this task */
3130     /* allocate the hot team structure */
3131     KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3132 
3133     hot_team =
3134         __kmp_allocate_team(
3135             root,
3136             1,                                                         // new_nproc
3137             __kmp_dflt_team_nth_ub * 2,                                // max_nproc
3138 #if OMPT_SUPPORT
3139             0, // root parallel id
3140 #endif
3141 #if OMP_40_ENABLED
3142             __kmp_nested_proc_bind.bind_types[0],
3143 #endif
3144             &r_icvs,
3145             0                                                          // argc
3146             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3147         );
3148     KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3149 
3150     root->r.r_hot_team = hot_team;
3151     root_team->t.t_control_stack_top = NULL;
3152 
3153     /* first-time initialization */
3154     hot_team->t.t_parent = root_team;
3155 
3156     /* initialize hot team */
3157     hot_team_max_nth = hot_team->t.t_max_nproc;
3158     for ( f = 0; f < hot_team_max_nth; ++ f ) {
3159         hot_team->t.t_threads[ f ] = NULL;
3160     }; // for
3161     hot_team->t.t_nproc = 1;
3162     // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3163     hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3164     hot_team->t.t_sched.chunk        = r_sched.chunk;
3165     hot_team->t.t_size_changed = 0;
3166 }
3167 
3168 #ifdef KMP_DEBUG
3169 
3170 
3171 typedef struct kmp_team_list_item {
3172     kmp_team_p const *           entry;
3173     struct kmp_team_list_item *  next;
3174 } kmp_team_list_item_t;
3175 typedef kmp_team_list_item_t * kmp_team_list_t;
3176 
3177 
3178 static void
3179 __kmp_print_structure_team_accum(    // Add team to list of teams.
3180     kmp_team_list_t     list,        // List of teams.
3181     kmp_team_p const *  team         // Team to add.
3182 ) {
3183 
3184     // List must terminate with item where both entry and next are NULL.
3185     // Team is added to the list only once.
3186     // List is sorted in ascending order by team id.
3187     // Team id is *not* a key.
3188 
3189     kmp_team_list_t l;
3190 
3191     KMP_DEBUG_ASSERT( list != NULL );
3192     if ( team == NULL ) {
3193         return;
3194     }; // if
3195 
3196     __kmp_print_structure_team_accum( list, team->t.t_parent );
3197     __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3198 
3199     // Search list for the team.
3200     l = list;
3201     while ( l->next != NULL && l->entry != team ) {
3202         l = l->next;
3203     }; // while
3204     if ( l->next != NULL ) {
3205         return;  // Team has been added before, exit.
3206     }; // if
3207 
3208     // Team is not found. Search list again for insertion point.
3209     l = list;
3210     while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3211         l = l->next;
3212     }; // while
3213 
3214     // Insert team.
3215     {
3216         kmp_team_list_item_t * item =
3217             (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof(  kmp_team_list_item_t ) );
3218         * item = * l;
3219         l->entry = team;
3220         l->next  = item;
3221     }
3222 
3223 }
3224 
3225 static void
3226 __kmp_print_structure_team(
3227     char const *       title,
3228     kmp_team_p const * team
3229 
3230 ) {
3231     __kmp_printf( "%s", title );
3232     if ( team != NULL ) {
3233         __kmp_printf( "%2x %p\n", team->t.t_id, team );
3234     } else {
3235         __kmp_printf( " - (nil)\n" );
3236     }; // if
3237 }
3238 
3239 static void
3240 __kmp_print_structure_thread(
3241     char const *       title,
3242     kmp_info_p const * thread
3243 
3244 ) {
3245     __kmp_printf( "%s", title );
3246     if ( thread != NULL ) {
3247         __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3248     } else {
3249         __kmp_printf( " - (nil)\n" );
3250     }; // if
3251 }
3252 
3253 void
3254 __kmp_print_structure(
3255     void
3256 ) {
3257 
3258     kmp_team_list_t list;
3259 
3260     // Initialize list of teams.
3261     list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3262     list->entry = NULL;
3263     list->next  = NULL;
3264 
3265     __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3266     {
3267         int gtid;
3268         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3269             __kmp_printf( "%2d", gtid );
3270             if ( __kmp_threads != NULL ) {
3271                 __kmp_printf( " %p", __kmp_threads[ gtid ] );
3272             }; // if
3273             if ( __kmp_root != NULL ) {
3274                 __kmp_printf( " %p", __kmp_root[ gtid ] );
3275             }; // if
3276             __kmp_printf( "\n" );
3277         }; // for gtid
3278     }
3279 
3280     // Print out __kmp_threads array.
3281     __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3282     if ( __kmp_threads != NULL ) {
3283         int gtid;
3284         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3285             kmp_info_t const * thread = __kmp_threads[ gtid ];
3286             if ( thread != NULL ) {
3287                 __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3288                 __kmp_printf(                 "    Our Root:        %p\n", thread->th.th_root );
3289                 __kmp_print_structure_team(   "    Our Team:     ",        thread->th.th_team );
3290                 __kmp_print_structure_team(   "    Serial Team:  ",        thread->th.th_serial_team );
3291                 __kmp_printf(                 "    Threads:      %2d\n",   thread->th.th_team_nproc );
3292                 __kmp_print_structure_thread( "    Master:       ",        thread->th.th_team_master );
3293                 __kmp_printf(                 "    Serialized?:  %2d\n",   thread->th.th_team_serialized );
3294                 __kmp_printf(                 "    Set NProc:    %2d\n",   thread->th.th_set_nproc );
3295 #if OMP_40_ENABLED
3296                 __kmp_printf(                 "    Set Proc Bind: %2d\n",  thread->th.th_set_proc_bind );
3297 #endif
3298                 __kmp_print_structure_thread( "    Next in pool: ",        thread->th.th_next_pool );
3299                 __kmp_printf( "\n" );
3300                 __kmp_print_structure_team_accum( list, thread->th.th_team );
3301                 __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3302             }; // if
3303         }; // for gtid
3304     } else {
3305         __kmp_printf( "Threads array is not allocated.\n" );
3306     }; // if
3307 
3308     // Print out __kmp_root array.
3309     __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3310     if ( __kmp_root != NULL ) {
3311         int gtid;
3312         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3313             kmp_root_t const * root = __kmp_root[ gtid ];
3314             if ( root != NULL ) {
3315                 __kmp_printf( "GTID %2d %p:\n", gtid, root );
3316                 __kmp_print_structure_team(   "    Root Team:    ",      root->r.r_root_team );
3317                 __kmp_print_structure_team(   "    Hot Team:     ",      root->r.r_hot_team );
3318                 __kmp_print_structure_thread( "    Uber Thread:  ",      root->r.r_uber_thread );
3319                 __kmp_printf(                 "    Active?:      %2d\n", root->r.r_active );
3320                 __kmp_printf(                 "    Nested?:      %2d\n", root->r.r_nested );
3321                 __kmp_printf(                 "    In Parallel:  %2d\n", root->r.r_in_parallel );
3322                 __kmp_printf( "\n" );
3323                 __kmp_print_structure_team_accum( list, root->r.r_root_team );
3324                 __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3325             }; // if
3326         }; // for gtid
3327     } else {
3328         __kmp_printf( "Ubers array is not allocated.\n" );
3329     }; // if
3330 
3331     __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3332     while ( list->next != NULL ) {
3333         kmp_team_p const * team = list->entry;
3334         int i;
3335         __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3336         __kmp_print_structure_team( "    Parent Team:      ",      team->t.t_parent );
3337         __kmp_printf(               "    Master TID:       %2d\n", team->t.t_master_tid );
3338         __kmp_printf(               "    Max threads:      %2d\n", team->t.t_max_nproc );
3339         __kmp_printf(               "    Levels of serial: %2d\n", team->t.t_serialized );
3340         __kmp_printf(               "    Number threads:   %2d\n", team->t.t_nproc );
3341         for ( i = 0; i < team->t.t_nproc; ++ i ) {
3342             __kmp_printf(           "    Thread %2d:      ", i );
3343             __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3344         }; // for i
3345         __kmp_print_structure_team( "    Next in pool:     ",      team->t.t_next_pool );
3346         __kmp_printf( "\n" );
3347         list = list->next;
3348     }; // while
3349 
3350     // Print out __kmp_thread_pool and __kmp_team_pool.
3351     __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3352     __kmp_print_structure_thread(   "Thread pool:          ", (kmp_info_t *)__kmp_thread_pool );
3353     __kmp_print_structure_team(     "Team pool:            ", (kmp_team_t *)__kmp_team_pool );
3354     __kmp_printf( "\n" );
3355 
3356     // Free team list.
3357     while ( list != NULL ) {
3358         kmp_team_list_item_t * item = list;
3359         list = list->next;
3360         KMP_INTERNAL_FREE( item );
3361     }; // while
3362 
3363 }
3364 
3365 #endif
3366 
3367 
3368 //---------------------------------------------------------------------------
3369 //  Stuff for per-thread fast random number generator
3370 //  Table of primes
3371 
3372 static const unsigned __kmp_primes[] = {
3373   0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3374   0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3375   0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3376   0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3377   0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3378   0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3379   0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3380   0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3381   0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3382   0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3383   0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3384   0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3385   0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3386   0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3387   0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3388   0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3389 };
3390 
3391 //---------------------------------------------------------------------------
3392 //  __kmp_get_random: Get a random number using a linear congruential method.
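//  The generator is a per-thread linear congruential generator: x_{n+1} = a * x_n + 1 (mod 2^32),
//  where the multiplier a is a per-thread constant taken from __kmp_primes. The value returned
//  is the upper 16 bits of the state before it is advanced.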
3393 
3394 unsigned short
3395 __kmp_get_random( kmp_info_t * thread )
3396 {
3397   unsigned x = thread->th.th_x;
3398   unsigned short r = x>>16;
3399 
3400   thread->th.th_x = x*thread->th.th_a+1;
3401 
3402   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3403          thread->th.th_info.ds.ds_tid, r) );
3404 
3405   return r;
3406 }
3407 //--------------------------------------------------------
3408 // __kmp_init_random: Initialize a random number generator
3409 
3410 void
3411 __kmp_init_random( kmp_info_t * thread )
3412 {
3413   unsigned seed = thread->th.th_info.ds.ds_tid;
3414 
3415   thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3416   thread->th.th_x = (seed+1)*thread->th.th_a+1;
3417   KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3418 }
3419 
3420 
3421 #if KMP_OS_WINDOWS
3422 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3423 static int
3424 __kmp_reclaim_dead_roots(void) {
3425     int i, r = 0;
3426 
3427     for(i = 0; i < __kmp_threads_capacity; ++i) {
3428         if( KMP_UBER_GTID( i ) &&
3429           !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
          !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots that died in a non-active state
3431             r += __kmp_unregister_root_other_thread(i);
3432         }
3433     }
3434     return r;
3435 }
3436 #endif
3437 
3438 /*
3439    This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3440    free entries generated.
3441 
3442    For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3443    already dead.
3444 
   On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
   update to __kmp_threads_capacity.  Array capacity is increased by doubling, clipped to
   __kmp_tp_capacity if a threadprivate cache array has been created.
   Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3449 
3450    After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3451    of a total of nWish free slots, the function does that expansion.  If not, but the clipping value allows
3452    array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3453    Otherwise, nothing is done beyond the possible initial root thread reclamation.  However, if nNeed is zero,
3454    a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3455    as many free slots as possible up to nWish.
3456 
3457    If any argument is negative, the behavior is undefined.
3458 */
3459 static int
3460 __kmp_expand_threads(int nWish, int nNeed) {
3461     int added = 0;
3462     int old_tp_cached;
3463     int __kmp_actual_max_nth;
3464 
3465     if(nNeed > nWish) /* normalize the arguments */
3466         nWish = nNeed;
3467 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3468 /* only for Windows static library */
3469     /* reclaim array entries for root threads that are already dead */
3470     added = __kmp_reclaim_dead_roots();
3471 
3472     if(nNeed) {
3473         nNeed -= added;
3474         if(nNeed < 0)
3475             nNeed = 0;
3476     }
3477     if(nWish) {
3478         nWish -= added;
3479         if(nWish < 0)
3480             nWish = 0;
3481     }
3482 #endif
3483     if(nWish <= 0)
3484         return added;
3485 
3486     while(1) {
3487         int nTarget;
3488         int minimumRequiredCapacity;
3489         int newCapacity;
3490         kmp_info_t **newThreads;
3491         kmp_root_t **newRoot;
3492 
3493         //
3494         // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3495         // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3496         // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3497         // become > __kmp_max_nth in one of two ways:
3498         //
3499         // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
        //    may not be reused by another thread, so we may need to increase
3501         //    __kmp_threads_capacity to __kmp_max_threads + 1.
3502         //
3503         // 2) New foreign root(s) are encountered.  We always register new
3504         //    foreign roots.  This may cause a smaller # of threads to be
3505         //    allocated at subsequent parallel regions, but the worker threads
3506         //    hang around (and eventually go to sleep) and need slots in the
3507         //    __kmp_threads[] array.
3508         //
3509         // Anyway, that is the reason for moving the check to see if
        // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3511         // instead of having it performed here. -BB
3512         //
3513         old_tp_cached = __kmp_tp_cached;
3514         __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3515         KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3516 
3517         /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3518         nTarget = nWish;
3519         if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3520             /* can't fulfil nWish, so try nNeed */
3521             if(nNeed) {
3522                 nTarget = nNeed;
3523                 if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3524                     /* possible expansion too small -- give up */
3525                     break;
3526                 }
3527             } else {
3528                 /* best-effort */
3529                 nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3530                 if(!nTarget) {
                    /* can't expand at all -- give up */
3532                     break;
3533                 }
3534             }
3535         }
3536         minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
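
        /* Grow the capacity by repeated doubling until it covers the required minimum,
           clipping at __kmp_actual_max_nth. */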
3537 
3538         newCapacity = __kmp_threads_capacity;
3539         do{
3540             newCapacity =
3541                 newCapacity <= (__kmp_actual_max_nth >> 1) ?
3542                 (newCapacity << 1) :
3543                 __kmp_actual_max_nth;
3544         } while(newCapacity < minimumRequiredCapacity);
3545         newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3546         newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3547         KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3548         KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3549         memset(newThreads + __kmp_threads_capacity, 0,
3550                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3551         memset(newRoot + __kmp_threads_capacity, 0,
3552                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3553 
3554         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3555             /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3556                while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3557                cache capacity, so we should deallocate the expanded arrays and try again.  This is the first check
3558                of a double-check pair.
3559             */
3560             __kmp_free(newThreads);
3561             continue; /* start over and try again */
3562         }
3563         __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3564         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3565             /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3566             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3567             __kmp_free(newThreads);
3568             continue; /* start over and try again */
3569         } else {
3570             /* success */
            // __kmp_free( __kmp_threads ); // ATT: It leads to a crash. Needs to be investigated.
3572             //
3573             *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3574             *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3575             added += newCapacity - __kmp_threads_capacity;
3576             *(volatile int*)&__kmp_threads_capacity = newCapacity;
3577             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3578             break; /* succeeded, so we can exit the loop */
3579         }
3580     }
3581     return added;
3582 }
3583 
3584 /* register the current thread as a root thread and obtain our gtid */
3585 /* we must have the __kmp_initz_lock held at this point */
3586 /* Argument TRUE only if are the thread that calls from __kmp_do_serial_initialize() */
3587 int
3588 __kmp_register_root( int initial_thread )
3589 {
3590     kmp_info_t *root_thread;
3591     kmp_root_t *root;
3592     int         gtid;
3593     int         capacity;
3594     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3595     KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3596     KMP_MB();
3597 
3598 
3599     /*
3600         2007-03-02:
3601 
        If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an initial one,
        the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as expected -- it may
        return false (which means there is at least one empty slot in the __kmp_threads array), but it
        is possible that the only free slot is #0, which is reserved for the initial thread and so cannot be
        used for this one. The following code works around this bug.

        However, the right solution seems to be not reserving slot #0 for the initial thread, because:
            (1) there is no magic in slot #0,
            (2) we cannot detect the initial thread reliably (the first thread that performs serial
                initialization may not be a real initial thread).
3612     */
3613     capacity = __kmp_threads_capacity;
3614     if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3615         -- capacity;
3616     }; // if
3617 
3618     /* see if there are too many threads */
3619     if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3620         if ( __kmp_tp_cached ) {
3621             __kmp_msg(
3622                 kmp_ms_fatal,
3623                 KMP_MSG( CantRegisterNewThread ),
3624                 KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3625                 KMP_HNT( PossibleSystemLimitOnThreads ),
3626                 __kmp_msg_null
3627             );
3628         }
3629         else {
3630             __kmp_msg(
3631                 kmp_ms_fatal,
3632                 KMP_MSG( CantRegisterNewThread ),
3633                 KMP_HNT( SystemLimitOnThreads ),
3634                 __kmp_msg_null
3635             );
3636         }
3637     }; // if
3638 
3639     /* find an available thread slot */
    /* Don't reassign the zero slot since we need that to be used only by the initial
       thread */
3642     for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3643         ;
3644     KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3645     KMP_ASSERT( gtid < __kmp_threads_capacity );
3646 
3647     /* update global accounting */
3648     __kmp_all_nth ++;
3649     TCW_4(__kmp_nth, __kmp_nth + 1);
3650 
3651     //
3652     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3653     // for low numbers of procs, and method #2 (keyed API call) for higher
3654     // numbers of procs.
3655     //
3656     if ( __kmp_adjust_gtid_mode ) {
3657         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3658             if ( TCR_4(__kmp_gtid_mode) != 2) {
3659                 TCW_4(__kmp_gtid_mode, 2);
3660             }
3661         }
3662         else {
3663             if (TCR_4(__kmp_gtid_mode) != 1 ) {
3664                 TCW_4(__kmp_gtid_mode, 1);
3665             }
3666         }
3667     }
3668 
3669 #ifdef KMP_ADJUST_BLOCKTIME
3670     /* Adjust blocktime to zero if necessary            */
3671     /* Middle initialization might not have occurred yet */
3672     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3673         if ( __kmp_nth > __kmp_avail_proc ) {
3674             __kmp_zero_bt = TRUE;
3675         }
3676     }
3677 #endif /* KMP_ADJUST_BLOCKTIME */
3678 
3679     /* setup this new hierarchy */
3680     if( ! ( root = __kmp_root[gtid] )) {
3681         root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3682         KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3683     }
3684 
3685 #if KMP_STATS_ENABLED
3686     // Initialize stats as soon as possible (right after gtid assignment).
3687     __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3688     KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3689     KMP_SET_THREAD_STATE(SERIAL_REGION);
3690     KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3691 #endif
3692     __kmp_initialize_root( root );
3693 
3694     /* setup new root thread structure */
3695     if( root->r.r_uber_thread ) {
3696         root_thread = root->r.r_uber_thread;
3697     } else {
3698         root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3699         if ( __kmp_storage_map ) {
3700             __kmp_print_thread_storage_map( root_thread, gtid );
3701         }
        root_thread->th.th_info.ds.ds_gtid = gtid;
3703         root_thread->th.th_root =  root;
3704         if( __kmp_env_consistency_check ) {
3705             root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3706         }
3707         #if USE_FAST_MEMORY
3708             __kmp_initialize_fast_memory( root_thread );
3709         #endif /* USE_FAST_MEMORY */
3710 
3711         #if KMP_USE_BGET
3712             KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3713             __kmp_initialize_bget( root_thread );
3714         #endif
3715         __kmp_init_random( root_thread );  // Initialize random number generator
3716     }
3717 
3718     /* setup the serial team held in reserve by the root thread */
3719     if( ! root_thread->th.th_serial_team ) {
3720         kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3721         KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3722 
3723         root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3724 #if OMPT_SUPPORT
3725           0, // root parallel id
3726 #endif
3727 #if OMP_40_ENABLED
3728           proc_bind_default,
3729 #endif
3730           &r_icvs,
3731           0 USE_NESTED_HOT_ARG(NULL) );
3732     }
3733     KMP_ASSERT( root_thread->th.th_serial_team );
3734     KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3735       root_thread->th.th_serial_team ) );
3736 
3737     /* drop root_thread into place */
3738     TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3739 
3740     root->r.r_root_team->t.t_threads[0] = root_thread;
3741     root->r.r_hot_team ->t.t_threads[0] = root_thread;
3742     root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3743     root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3744     root->r.r_uber_thread = root_thread;
3745 
3746     /* initialize the thread, get it ready to go */
3747     __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3748     TCW_4(__kmp_init_gtid, TRUE);
3749 
3750     /* prepare the master thread for get_gtid() */
3751     __kmp_gtid_set_specific( gtid );
3752 
3753 #if USE_ITT_BUILD
3754     __kmp_itt_thread_name( gtid );
3755 #endif /* USE_ITT_BUILD */
3756 
3757     #ifdef KMP_TDATA_GTID
3758         __kmp_gtid = gtid;
3759     #endif
3760     __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3761     KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3762 
3763     KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3764                     gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3765                     root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3766                     KMP_INIT_BARRIER_STATE ) );
3767     { // Initialize barrier data.
3768         int b;
3769         for ( b = 0; b < bs_last_barrier; ++ b ) {
3770             root_thread->th.th_bar[ b ].bb.b_arrived        = KMP_INIT_BARRIER_STATE;
3771 #if USE_DEBUGGER
3772             root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3773 #endif
3774         }; // for
3775     }
3776     KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3777 
3778 #if KMP_AFFINITY_SUPPORTED
3779 # if OMP_40_ENABLED
3780     root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3781     root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3782     root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3783     root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3784 # endif
3785 
3786     if ( TCR_4(__kmp_init_middle) ) {
3787         __kmp_affinity_set_init_mask( gtid, TRUE );
3788     }
3789 #endif /* KMP_AFFINITY_SUPPORTED */
3790 
3791     __kmp_root_counter ++;
3792 
3793     KMP_MB();
3794     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3795 
3796     return gtid;
3797 }
3798 
3799 #if KMP_NESTED_HOT_TEAMS
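// Recursively free the nested hot teams owned by the given thread at this level
// (recursing into deeper levels first); returns the number of threads released.
// The master thread of each freed team is not freed and not counted.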
3800 static int
3801 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3802 {
3803     int i, n, nth;
3804     kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3805     if( !hot_teams || !hot_teams[level].hot_team ) {
3806         return 0;
3807     }
3808     KMP_DEBUG_ASSERT( level < max_level );
3809     kmp_team_t *team = hot_teams[level].hot_team;
3810     nth = hot_teams[level].hot_team_nth;
3811     n = nth - 1;                   // master is not freed
3812     if( level < max_level - 1 ) {
3813         for( i = 0; i < nth; ++i ) {
3814             kmp_info_t *th = team->t.t_threads[i];
3815             n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3816             if( i > 0 && th->th.th_hot_teams ) {
3817                 __kmp_free( th->th.th_hot_teams );
3818                 th->th.th_hot_teams = NULL;
3819             }
3820         }
3821     }
3822     __kmp_free_team( root, team, NULL );
3823     return n;
3824 }
3825 #endif
3826 
/* Resets a root thread and clears its root and hot teams.
3828    Returns the number of __kmp_threads entries directly and indirectly freed.
3829 */
3830 static int
3831 __kmp_reset_root(int gtid, kmp_root_t *root)
3832 {
3833     kmp_team_t * root_team = root->r.r_root_team;
3834     kmp_team_t * hot_team  = root->r.r_hot_team;
3835     int          n         = hot_team->t.t_nproc;
3836     int i;
3837 
3838     KMP_DEBUG_ASSERT( ! root->r.r_active );
3839 
3840     root->r.r_root_team = NULL;
3841     root->r.r_hot_team  = NULL;
        // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before the call
        // to __kmp_free_team().
3844     __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3845 #if KMP_NESTED_HOT_TEAMS
3846     if( __kmp_hot_teams_max_level > 0 ) {  // need to free nested hot teams and their threads if any
3847         for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3848             kmp_info_t *th = hot_team->t.t_threads[i];
3849             if( __kmp_hot_teams_max_level > 1 ) {
3850                 n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3851             }
3852             if( th->th.th_hot_teams ) {
3853                 __kmp_free( th->th.th_hot_teams );
3854                 th->th.th_hot_teams = NULL;
3855             }
3856         }
3857     }
3858 #endif
3859     __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3860 
3861     //
3862     // Before we can reap the thread, we need to make certain that all
3863     // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3864     //
3865     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3866         __kmp_wait_to_unref_task_teams();
3867     }
3868 
3869     #if KMP_OS_WINDOWS
3870         /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3871         KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3872             (LPVOID)&(root->r.r_uber_thread->th),
3873             root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3874         __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3875     #endif /* KMP_OS_WINDOWS */
3876 
3877 #if OMPT_SUPPORT
3878     if (ompt_enabled &&
3879         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3880         int gtid = __kmp_get_gtid();
3881         __ompt_thread_end(ompt_thread_initial, gtid);
3882     }
3883 #endif
3884 
3885     TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3886     __kmp_reap_thread( root->r.r_uber_thread, 1 );
3887 
        // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3889     root->r.r_uber_thread = NULL;
3890     /* mark root as no longer in use */
3891     root->r.r_begin = FALSE;
3892 
3893     return n;
3894 }
3895 
3896 void
3897 __kmp_unregister_root_current_thread( int gtid )
3898 {
3899     KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
    /* this lock should be ok, since unregister_root_current_thread is never called during
     * an abort, only during a normal close.  Furthermore, if you have the
     * forkjoin lock, you should never try to get the initz lock */
3903 
3904     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3905     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3906         KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3907         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3908         return;
3909     }
3910     kmp_root_t *root = __kmp_root[gtid];
3911 
3912     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3913     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3914     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3915     KMP_ASSERT( root->r.r_active == FALSE );
3916 
3917 
3918     KMP_MB();
3919 
3920 #if OMP_45_ENABLED
3921    kmp_info_t * thread = __kmp_threads[gtid];
3922    kmp_team_t * team = thread->th.th_team;
3923    kmp_task_team_t *   task_team = thread->th.th_task_team;
3924 
3925    // we need to wait for the proxy tasks before finishing the thread
3926    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3927 #if OMPT_SUPPORT
3928         // the runtime is shutting down so we won't report any events
3929         thread->th.ompt_thread_info.state = ompt_state_undefined;
3930 #endif
3931         __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3932    }
3933 #endif
3934 
3935     __kmp_reset_root(gtid, root);
3936 
3937     /* free up this thread slot */
3938     __kmp_gtid_set_specific( KMP_GTID_DNE );
3939 #ifdef KMP_TDATA_GTID
3940     __kmp_gtid = KMP_GTID_DNE;
3941 #endif
3942 
3943     KMP_MB();
3944     KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3945 
3946     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3947 }
3948 
3949 #if KMP_OS_WINDOWS
3950 /* __kmp_forkjoin_lock must be already held
3951    Unregisters a root thread that is not the current thread.  Returns the number of
3952    __kmp_threads entries freed as a result.
3953  */
3954 static int
3955 __kmp_unregister_root_other_thread( int gtid )
3956 {
3957     kmp_root_t *root = __kmp_root[gtid];
3958     int r;
3959 
3960     KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3961     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3962     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3963     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3964     KMP_ASSERT( root->r.r_active == FALSE );
3965 
3966     r = __kmp_reset_root(gtid, root);
3967     KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3968     return r;
3969 }
3970 #endif
3971 
3972 #if KMP_DEBUG
3973 void __kmp_task_info() {
3974 
3975     kmp_int32 gtid       = __kmp_entry_gtid();
3976     kmp_int32 tid        = __kmp_tid_from_gtid( gtid );
3977     kmp_info_t *this_thr = __kmp_threads[ gtid ];
3978     kmp_team_t *steam    = this_thr->th.th_serial_team;
3979     kmp_team_t *team     = this_thr->th.th_team;
3980 
3981     __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p ptask=%p\n",
3982         gtid, tid, this_thr, team, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3983 }
3984 #endif // KMP_DEBUG
3985 
3986 /* TODO optimize with one big memclr, take out what isn't needed,
3987  * split responsibility to workers as much as possible, and delay
3988  * initialization of features as much as possible  */
3989 static void
3990 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3991 {
    /* this_thr->th.th_info.ds.ds_gtid is set up in kmp_allocate_thread/create_worker
     * this_thr->th.th_serial_team is set up in __kmp_allocate_thread */
3994     kmp_info_t *master = team->t.t_threads[0];
3995     KMP_DEBUG_ASSERT( this_thr != NULL );
3996     KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
3997     KMP_DEBUG_ASSERT( team );
3998     KMP_DEBUG_ASSERT( team->t.t_threads  );
3999     KMP_DEBUG_ASSERT( team->t.t_dispatch );
4000     KMP_DEBUG_ASSERT( master );
4001     KMP_DEBUG_ASSERT( master->th.th_root );
4002 
4003     KMP_MB();
4004 
4005     TCW_SYNC_PTR(this_thr->th.th_team, team);
4006 
4007     this_thr->th.th_info.ds.ds_tid  = tid;
4008     this_thr->th.th_set_nproc       = 0;
4009     if (__kmp_tasking_mode != tskm_immediate_exec)
4010         // When tasking is possible, threads are not safe to reap until they are
4011         // done tasking; this will be set when tasking code is exited in wait
4012         this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4013     else  // no tasking --> always safe to reap
4014         this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4015 #if OMP_40_ENABLED
4016     this_thr->th.th_set_proc_bind   = proc_bind_default;
4017 # if KMP_AFFINITY_SUPPORTED
4018     this_thr->th.th_new_place       = this_thr->th.th_current_place;
4019 # endif
4020 #endif
4021     this_thr->th.th_root            = master->th.th_root;
4022 
4023     /* setup the thread's cache of the team structure */
4024     this_thr->th.th_team_nproc      = team->t.t_nproc;
4025     this_thr->th.th_team_master     = master;
4026     this_thr->th.th_team_serialized = team->t.t_serialized;
4027     TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4028 
4029     KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4030 
4031     KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4032                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4033 
4034     __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4035 
4036     KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4037                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4038     // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4039 
4040     /* TODO no worksharing in speculative threads */
4041     this_thr->th.th_dispatch      = &team->t.t_dispatch[ tid ];
4042 
4043     this_thr->th.th_local.this_construct = 0;
4044 
4045 #ifdef BUILD_TV
4046     this_thr->th.th_local.tv_data = 0;
4047 #endif
4048 
4049     if ( ! this_thr->th.th_pri_common ) {
4050         this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4051         if ( __kmp_storage_map ) {
4052             __kmp_print_storage_map_gtid(
4053                 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4054                 sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4055             );
4056         }; // if
4057         this_thr->th.th_pri_head = NULL;
4058     }; // if
4059 
4060     /* Initialize dynamic dispatch */
4061     {
4062         volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4063         /*
4064          * Use team max_nproc since this will never change for the team.
4065          */
4066         size_t disp_size = sizeof( dispatch_private_info_t ) *
4067             ( team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers );
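        // A serialized team (t_max_nproc == 1) only needs one private dispatch
        // buffer; otherwise __kmp_dispatch_num_buffers buffers are allocated,
        // presumably so consecutive dynamically scheduled loops can cycle through
        // buffers instead of waiting on a single one.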
4068         KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4069         KMP_ASSERT( dispatch );
4070         KMP_DEBUG_ASSERT( team->t.t_dispatch );
4071         KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4072 
4073         dispatch->th_disp_index = 0;
4074 #if OMP_45_ENABLED
4075         dispatch->th_doacross_buf_idx = 0;
4076 #endif
4077         if( ! dispatch->th_disp_buffer )  {
4078             dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4079 
4080             if ( __kmp_storage_map ) {
4081                 __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4082                                          &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers ],
4083                                          disp_size, "th_%d.th_dispatch.th_disp_buffer "
4084                                          "(team_%d.t_dispatch[%d].th_disp_buffer)",
4085                                          gtid, team->t.t_id, gtid );
4086             }
4087         } else {
4088             memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4089         }
4090 
4091         dispatch->th_dispatch_pr_current = 0;
4092         dispatch->th_dispatch_sh_current = 0;
4093 
4094         dispatch->th_deo_fcn = 0;             /* ORDERED     */
4095         dispatch->th_dxo_fcn = 0;             /* END ORDERED */
4096     }
4097 
4098     this_thr->th.th_next_pool = NULL;
4099 
4100     if (!this_thr->th.th_task_state_memo_stack) {
4101         size_t i;
4102         this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4103         this_thr->th.th_task_state_top = 0;
4104         this_thr->th.th_task_state_stack_sz = 4;
4105         for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4106             this_thr->th.th_task_state_memo_stack[i] = 0;
4107     }
4108 
4109     KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4110     KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4111 
4112     KMP_MB();
4113 }
4114 
4115 
4116 /* allocate a new thread for the requesting team.  this is only called from within a
4117  * forkjoin critical section.  we will first try to get an available thread from the
 * thread pool.  if none is available, we will fork a new one, assuming we are
 * able to create one; this should be assured, as the caller should have checked
 * for that first.
4121  */
4122 kmp_info_t *
4123 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4124 {
4125     kmp_team_t  *serial_team;
4126     kmp_info_t  *new_thr;
4127     int          new_gtid;
4128 
4129     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4130     KMP_DEBUG_ASSERT( root && team );
4131 #if !KMP_NESTED_HOT_TEAMS
4132     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4133 #endif
4134     KMP_MB();
4135 
4136     /* first, try to get one from the thread pool */
4137     if ( __kmp_thread_pool ) {
4138 
4139         new_thr = (kmp_info_t*)__kmp_thread_pool;
4140         __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4141         if ( new_thr == __kmp_thread_pool_insert_pt ) {
4142             __kmp_thread_pool_insert_pt = NULL;
4143         }
4144         TCW_4(new_thr->th.th_in_pool, FALSE);
4145         //
4146         // Don't touch th_active_in_pool or th_active.
4147         // The worker thread adjusts those flags as it sleeps/awakens.
4148         //
4149         __kmp_thread_pool_nth--;
4150 
4151         KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4152                     __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4153         KMP_ASSERT(       ! new_thr->th.th_team );
4154         KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4155         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4156 
4157         /* setup the thread structure */
4158         __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4159         KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4160 
4161         TCW_4(__kmp_nth, __kmp_nth + 1);
4162 
4163         new_thr->th.th_task_state = 0;
4164         new_thr->th.th_task_state_top = 0;
4165         new_thr->th.th_task_state_stack_sz = 4;
4166 
4167 #ifdef KMP_ADJUST_BLOCKTIME
        /* Adjust blocktime back to zero if necessary */
4169         /* Middle initialization might not have occurred yet */
4170         if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4171             if ( __kmp_nth > __kmp_avail_proc ) {
4172                 __kmp_zero_bt = TRUE;
4173             }
4174         }
4175 #endif /* KMP_ADJUST_BLOCKTIME */
4176 
4177 #if KMP_DEBUG
        // If the thread entered the pool via __kmp_free_thread, wait_flag should not equal KMP_BARRIER_PARENT_FLAG.
4179         int b;
4180         kmp_balign_t * balign = new_thr->th.th_bar;
4181         for( b = 0; b < bs_last_barrier; ++ b )
4182             KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4183 #endif
4184 
4185         KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4186                     __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4187 
4188         KMP_MB();
4189         return new_thr;
4190     }
4191 
4192 
    /* no, so we'll fork a new one */
4194     KMP_ASSERT( __kmp_nth    == __kmp_all_nth );
4195     KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4196 
4197 #if KMP_USE_MONITOR
4198     //
4199     // If this is the first worker thread the RTL is creating, then also
4200     // launch the monitor thread.  We try to do this as early as possible.
4201     //
4202     if ( ! TCR_4( __kmp_init_monitor ) ) {
4203         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4204         if ( ! TCR_4( __kmp_init_monitor ) ) {
4205             KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4206             TCW_4( __kmp_init_monitor, 1 );
4207             __kmp_create_monitor( & __kmp_monitor );
4208             KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4209             #if KMP_OS_WINDOWS
4210                 // AC: wait until monitor has started. This is a fix for CQ232808.
                //     The reason is that if the library is loaded/unloaded in a loop with small (parallel)
                //     work in between, then there is a high probability that the monitor thread will not
                //     start until after the library shutdown. At shutdown it is too late to cope with the
                //     problem, because when the master is in DllMain (process detach) the monitor has no
                //     chance to start (it is blocked), and the master has no means to inform the monitor
                //     that the library has gone, because all the memory the monitor can access is going
                //     to be released/reset.
4217                 while ( TCR_4(__kmp_init_monitor) < 2 ) {
4218                     KMP_YIELD( TRUE );
4219                 }
4220                 KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4221             #endif
4222         }
4223         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4224     }
4225 #endif
4226 
4227     KMP_MB();
4228     for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4229         KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4230     }
4231 
4232     /* allocate space for it. */
4233     new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4234 
4235     TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4236 
4237     if ( __kmp_storage_map ) {
4238         __kmp_print_thread_storage_map( new_thr, new_gtid );
4239     }
4240 
4241     /* add the reserve serialized team, initialized from the team's master thread */
4242     {
4243     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4244     KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4245 
4246     new_thr->th.th_serial_team = serial_team =
4247         (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4248 #if OMPT_SUPPORT
4249                                            0, // root parallel id
4250 #endif
4251 #if OMP_40_ENABLED
4252                                            proc_bind_default,
4253 #endif
4254                                            &r_icvs,
4255                                            0 USE_NESTED_HOT_ARG(NULL) );
4256     }
4257     KMP_ASSERT ( serial_team );
    serial_team->t.t_serialized = 0;   // AC: the team is created in reserve, not for execution (it is unused for now).
4259     serial_team->t.t_threads[0] = new_thr;
4260     KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4261       new_thr ) );
4262 
4263     /* setup the thread structures */
4264     __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4265 
4266     #if USE_FAST_MEMORY
4267         __kmp_initialize_fast_memory( new_thr );
4268     #endif /* USE_FAST_MEMORY */
4269 
4270     #if KMP_USE_BGET
4271         KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4272         __kmp_initialize_bget( new_thr );
4273     #endif
4274 
4275     __kmp_init_random( new_thr );  // Initialize random number generator
4276 
4277     /* Initialize these only once when thread is grabbed for a team allocation */
4278     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4279                     __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4280 
4281     int b;
4282     kmp_balign_t * balign = new_thr->th.th_bar;
4283     for(b=0; b<bs_last_barrier; ++b) {
4284         balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4285         balign[b].bb.team = NULL;
4286         balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4287         balign[b].bb.use_oncore_barrier = 0;
4288     }
4289 
4290     new_thr->th.th_spin_here = FALSE;
4291     new_thr->th.th_next_waiting = 0;
4292 
4293 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4294     new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4295     new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4296     new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4297     new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4298 #endif
4299 
4300     TCW_4(new_thr->th.th_in_pool, FALSE);
4301     new_thr->th.th_active_in_pool = FALSE;
4302     TCW_4(new_thr->th.th_active, TRUE);
4303 
4304     /* adjust the global counters */
4305     __kmp_all_nth ++;
4306     __kmp_nth ++;
4307 
4308     //
4309     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4310     // for low numbers of procs, and method #2 (keyed API call) for higher
4311     // numbers of procs.
4312     //
4313     if ( __kmp_adjust_gtid_mode ) {
4314         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4315             if ( TCR_4(__kmp_gtid_mode) != 2) {
4316                 TCW_4(__kmp_gtid_mode, 2);
4317             }
4318         }
4319         else {
4320             if (TCR_4(__kmp_gtid_mode) != 1 ) {
4321                 TCW_4(__kmp_gtid_mode, 1);
4322             }
4323         }
4324     }
4325 
4326 #ifdef KMP_ADJUST_BLOCKTIME
4327     /* Adjust blocktime back to zero if necessary       */
4328     /* Middle initialization might not have occurred yet */
4329     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4330         if ( __kmp_nth > __kmp_avail_proc ) {
4331             __kmp_zero_bt = TRUE;
4332         }
4333     }
4334 #endif /* KMP_ADJUST_BLOCKTIME */
4335 
4336     /* actually fork it and create the new worker thread */
4337     KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4338     __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4339     KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4340 
4341     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4342     KMP_MB();
4343     return new_thr;
4344 }
4345 
4346 /*
4347  * reinitialize team for reuse.
4348  *
 * The hot team code calls this routine at every fork barrier, so EPCC barrier
 * tests are extremely sensitive to changes in it, esp. writes to the team
4351  * struct, which cause a cache invalidation in all threads.
4352  *
4353  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4354  */
4355 static void
4356 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4357     KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4358                     team->t.t_threads[0], team ) );
4359     KMP_DEBUG_ASSERT( team && new_icvs);
4360     KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4361     KMP_CHECK_UPDATE(team->t.t_ident, loc);
4362 
4363     KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4364 
4365     // Copy ICVs to the master thread's implicit taskdata
4366     __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4367     copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4368 
4369     KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4370                     team->t.t_threads[0], team ) );
4371 }
4372 
4373 
4374 /* initialize the team data structure
4375  * this assumes the t_threads and t_max_nproc are already set
4376  * also, we don't touch the arguments */
4377 static void
4378 __kmp_initialize_team(
4379     kmp_team_t * team,
4380     int          new_nproc,
4381     kmp_internal_control_t * new_icvs,
4382     ident_t *                loc
4383 ) {
4384     KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4385 
4386     /* verify */
4387     KMP_DEBUG_ASSERT( team );
4388     KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4389     KMP_DEBUG_ASSERT( team->t.t_threads );
4390     KMP_MB();
4391 
4392     team->t.t_master_tid  = 0;    /* not needed */
4393     /* team->t.t_master_bar;        not needed */
4394     team->t.t_serialized  = new_nproc > 1 ? 0 : 1;
4395     team->t.t_nproc       = new_nproc;
4396 
4397     /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4398     team->t.t_next_pool   = NULL;
4399     /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4400 
4401     TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4402     team->t.t_invoke      = NULL; /* not needed */
4403 
4404     // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4405     team->t.t_sched       = new_icvs->sched;
4406 
4407 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4408     team->t.t_fp_control_saved = FALSE; /* not needed */
4409     team->t.t_x87_fpu_control_word = 0; /* not needed */
4410     team->t.t_mxcsr = 0;                /* not needed */
4411 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4412 
4413     team->t.t_construct   = 0;
4414     __kmp_init_lock( & team->t.t_single_lock );
4415 
    team->t.t_ordered.dt.t_value = 0;
4417     team->t.t_master_active = FALSE;
4418 
4419     memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4420 
4421 #ifdef KMP_DEBUG
4422     team->t.t_copypriv_data = NULL;  /* not necessary, but nice for debugging */
4423 #endif
4424     team->t.t_copyin_counter = 0;    /* for barrier-free copyin implementation */
4425 
4426     team->t.t_control_stack_top = NULL;
4427 
4428     __kmp_reinitialize_team( team, new_icvs, loc );
4429 
4430     KMP_MB();
4431     KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4432 }
4433 
4434 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4435 /* Sets full mask for thread and returns old mask, no changes to structures. */
4436 static void
4437 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4438 {
4439     if ( KMP_AFFINITY_CAPABLE() ) {
4440         int status;
4441         if ( old_mask != NULL ) {
4442             status = __kmp_get_system_affinity( old_mask, TRUE );
4443             int error = errno;
4444             if ( status != 0 ) {
4445                 __kmp_msg(
4446                     kmp_ms_fatal,
4447                     KMP_MSG( ChangeThreadAffMaskError ),
4448                     KMP_ERR( error ),
4449                     __kmp_msg_null
4450                 );
4451             }
4452         }
4453         __kmp_set_system_affinity( __kmp_affin_fullMask, TRUE );
4454     }
4455 }
4456 #endif
4457 
4458 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4459 
4460 //
4461 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master threads' partition based upon the parent
// thread's partition, and binds each worker to a place in its partition.
4464 // The master thread's partition should already include its current binding.
4465 //
4466 static void
4467 __kmp_partition_places( kmp_team_t *team, int update_master_only )
4468 {
4469     //
    // Copy the master thread's place partition to the team struct
4471     //
4472     kmp_info_t *master_th = team->t.t_threads[0];
4473     KMP_DEBUG_ASSERT( master_th != NULL );
4474     kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4475     int first_place = master_th->th.th_first_place;
4476     int last_place = master_th->th.th_last_place;
4477     int masters_place = master_th->th.th_current_place;
4478     team->t.t_first_place = first_place;
4479     team->t.t_last_place = last_place;
4480 
4481     KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4482        proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4483        masters_place, first_place, last_place ) );
4484 
4485     switch ( proc_bind ) {
4486 
4487         case proc_bind_default:
4488         //
4489         // serial teams might have the proc_bind policy set to
4490         // proc_bind_default.  It doesn't matter, as we don't
4491         // rebind the master thread for any proc_bind policy.
4492         //
4493         KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4494         break;
4495 
4496         case proc_bind_master:
4497         {
4498             int f;
4499             int n_th = team->t.t_nproc;
4500             for ( f = 1; f < n_th; f++ ) {
4501                 kmp_info_t *th = team->t.t_threads[f];
4502                 KMP_DEBUG_ASSERT( th != NULL );
4503                 th->th.th_first_place = first_place;
4504                 th->th.th_last_place = last_place;
4505                 th->th.th_new_place = masters_place;
4506 
4507                 KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4508                   __kmp_gtid_from_thread( team->t.t_threads[f] ),
4509                   team->t.t_id, f, masters_place, first_place, last_place ) );
4510             }
4511         }
4512         break;
4513 
4514         case proc_bind_close:
4515         {
4516             int f;
4517             int n_th = team->t.t_nproc;
4518             int n_places;
4519             if ( first_place <= last_place ) {
4520                 n_places = last_place - first_place + 1;
4521             }
4522             else {
4523                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4524             }
4525             if ( n_th <= n_places ) {
4526                 int place = masters_place;
4527                 for ( f = 1; f < n_th; f++ ) {
4528                     kmp_info_t *th = team->t.t_threads[f];
4529                     KMP_DEBUG_ASSERT( th != NULL );
4530 
4531                     if ( place == last_place ) {
4532                         place = first_place;
4533                     }
4534                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4535                         place = 0;
4536                     }
4537                     else {
4538                         place++;
4539                     }
4540                     th->th.th_first_place = first_place;
4541                     th->th.th_last_place = last_place;
4542                     th->th.th_new_place = place;
4543 
4544                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4545                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4546                        team->t.t_id, f, place, first_place, last_place ) );
4547                 }
4548             }
4549             else {
4550                 int S, rem, gap, s_count;
4551                 S = n_th / n_places;
4552                 s_count = 0;
4553                 rem = n_th - ( S * n_places );
4554                 gap = rem > 0 ? n_places/rem : n_places;
4555                 int place = masters_place;
4556                 int gap_ct = gap;
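                // Illustrative sketch (assumed numbers): with n_th = 10, n_places = 4
                // and masters_place == first_place, S = 2, rem = 2, gap = 2, so the
                // loop below lays out 3,2,3,2 threads over the four places and ends
                // with 'place' back at masters_place (checked by the assert below).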
4557                 for ( f = 0; f < n_th; f++ ) {
4558                     kmp_info_t *th = team->t.t_threads[f];
4559                     KMP_DEBUG_ASSERT( th != NULL );
4560 
4561                     th->th.th_first_place = first_place;
4562                     th->th.th_last_place = last_place;
4563                     th->th.th_new_place = place;
4564                     s_count++;
4565 
4566                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4567                         // do nothing, add an extra thread to place on next iteration
4568                     }
4569                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4570                         // we added an extra thread to this place; move to next place
4571                         if ( place == last_place ) {
4572                             place = first_place;
4573                         }
4574                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4575                             place = 0;
4576                         }
4577                         else {
4578                             place++;
4579                         }
4580                         s_count = 0;
4581                         gap_ct = 1;
4582                         rem--;
4583                     }
4584                     else if (s_count == S) { // place full; don't add extra
4585                         if ( place == last_place ) {
4586                             place = first_place;
4587                         }
4588                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4589                             place = 0;
4590                         }
4591                         else {
4592                             place++;
4593                         }
4594                         gap_ct++;
4595                         s_count = 0;
4596                     }
4597 
4598                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4599                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4600                       team->t.t_id, f, th->th.th_new_place, first_place,
4601                       last_place ) );
4602                 }
4603                 KMP_DEBUG_ASSERT( place == masters_place );
4604             }
4605         }
4606         break;
4607 
4608         case proc_bind_spread:
4609         {
4610             int f;
4611             int n_th = team->t.t_nproc;
4612             int n_places;
4613             int thidx;
4614             if ( first_place <= last_place ) {
4615                 n_places = last_place - first_place + 1;
4616             }
4617             else {
4618                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4619             }
4620             if ( n_th <= n_places ) {
4621                 int place = masters_place;
4622                 int S = n_places/n_th;
4623                 int s_count, rem, gap, gap_ct;
4624                 rem = n_places - n_th*S;
4625                 gap = rem ? n_th/rem : 1;
4626                 gap_ct = gap;
4627                 thidx = n_th;
4628                 if (update_master_only == 1)
4629                     thidx = 1;
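                // Illustrative sketch (assumed numbers): with n_th = 3 threads over
                // n_places = 8 places and masters_place == first_place, S = 2,
                // rem = 2, gap = 1, so the loop below carves the place partition
                // into sub-partitions of 3, 3 and 2 places, one per thread.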
4630                 for ( f = 0; f < thidx; f++ ) {
4631                     kmp_info_t *th = team->t.t_threads[f];
4632                     KMP_DEBUG_ASSERT( th != NULL );
4633 
4634                     th->th.th_first_place = place;
4635                     th->th.th_new_place = place;
4636                     s_count = 1;
4637                     while (s_count < S) {
4638                         if ( place == last_place ) {
4639                             place = first_place;
4640                         }
4641                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4642                             place = 0;
4643                         }
4644                         else {
4645                             place++;
4646                         }
4647                         s_count++;
4648                     }
4649                     if (rem && (gap_ct == gap)) {
4650                         if ( place == last_place ) {
4651                             place = first_place;
4652                         }
4653                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4654                             place = 0;
4655                         }
4656                         else {
4657                             place++;
4658                         }
4659                         rem--;
4660                         gap_ct = 0;
4661                     }
4662                     th->th.th_last_place = place;
4663                     gap_ct++;
4664 
4665                     if ( place == last_place ) {
4666                         place = first_place;
4667                     }
4668                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4669                         place = 0;
4670                     }
4671                     else {
4672                         place++;
4673                     }
4674 
4675                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4676                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4677                       team->t.t_id, f, th->th.th_new_place,
4678                       th->th.th_first_place, th->th.th_last_place ) );
4679                 }
4680                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4681             }
4682             else {
4683                 int S, rem, gap, s_count;
4684                 S = n_th / n_places;
4685                 s_count = 0;
4686                 rem = n_th - ( S * n_places );
4687                 gap = rem > 0 ? n_places/rem : n_places;
4688                 int place = masters_place;
4689                 int gap_ct = gap;
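                // Same S/rem/gap distribution as the overfull proc_bind_close case
                // above, except each thread's partition is narrowed to the single
                // place it is assigned (th_first_place == th_last_place == place).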
4690                 thidx = n_th;
4691                 if (update_master_only == 1)
4692                     thidx = 1;
4693                 for ( f = 0; f < thidx; f++ ) {
4694                     kmp_info_t *th = team->t.t_threads[f];
4695                     KMP_DEBUG_ASSERT( th != NULL );
4696 
4697                     th->th.th_first_place = place;
4698                     th->th.th_last_place = place;
4699                     th->th.th_new_place = place;
4700                     s_count++;
4701 
4702                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4703                         // do nothing, add an extra thread to place on next iteration
4704                     }
4705                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4706                         // we added an extra thread to this place; move on to next place
4707                         if ( place == last_place ) {
4708                             place = first_place;
4709                         }
4710                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4711                             place = 0;
4712                         }
4713                         else {
4714                             place++;
4715                         }
4716                         s_count = 0;
4717                         gap_ct = 1;
4718                         rem--;
4719                     }
4720                     else if (s_count == S) { // place is full; don't add extra thread
4721                         if ( place == last_place ) {
4722                             place = first_place;
4723                         }
4724                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4725                             place = 0;
4726                         }
4727                         else {
4728                             place++;
4729                         }
4730                         gap_ct++;
4731                         s_count = 0;
4732                     }
4733 
4734                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4735                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4736                        team->t.t_id, f, th->th.th_new_place,
4737                        th->th.th_first_place, th->th.th_last_place) );
4738                 }
4739                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4740             }
4741         }
4742         break;
4743 
4744         default:
4745         break;
4746     }
4747 
4748     KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4749 }
4750 
4751 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4752 
4753 /* allocate a new team data structure to use.  take one off of the free pool if available */
4754 kmp_team_t *
4755 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4756 #if OMPT_SUPPORT
4757     ompt_parallel_id_t ompt_parallel_id,
4758 #endif
4759 #if OMP_40_ENABLED
4760     kmp_proc_bind_t new_proc_bind,
4761 #endif
4762     kmp_internal_control_t *new_icvs,
4763     int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4764 {
4765     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4766     int f;
4767     kmp_team_t *team;
4768     int use_hot_team = ! root->r.r_active;
4769     int level = 0;
4770 
4771     KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4772     KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4773     KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4774     KMP_MB();
4775 
4776 #if KMP_NESTED_HOT_TEAMS
4777     kmp_hot_team_ptr_t *hot_teams;
4778     if( master ) {
4779         team = master->th.th_team;
4780         level = team->t.t_active_level;
4781         if( master->th.th_teams_microtask ) {                         // in teams construct?
4782             if( master->th.th_teams_size.nteams > 1 && (             // #teams > 1
4783                 team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4784                 master->th.th_teams_level < team->t.t_level ) ) {    // or nested parallel inside the teams
4785                 ++level; // not increment if #teams==1, or for outer fork of the teams; increment otherwise
4786             }
4787         }
4788         hot_teams = master->th.th_hot_teams;
4789         if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4790         {   // hot team has already been allocated for given level
4791             use_hot_team = 1;
4792         } else {
4793             use_hot_team = 0;
4794         }
4795     }
4796 #endif
4797     // Optimization to use a "hot" team
4798     if( use_hot_team && new_nproc > 1 ) {
4799         KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4800 #if KMP_NESTED_HOT_TEAMS
4801         team = hot_teams[level].hot_team;
4802 #else
4803         team =  root->r.r_hot_team;
4804 #endif
4805 #if KMP_DEBUG
4806         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4807             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4808                            team->t.t_task_team[0], team->t.t_task_team[1] ));
4809         }
4810 #endif
4811 
4812         // Has the number of threads changed?
4813         /* Let's assume the most common case is that the number of threads is unchanged, and
4814            put that case first. */
4815         if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4816             KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4817             // This case can mean that omp_set_num_threads() was called and the hot team size
4818             // was already reduced, so we check the special flag
4819             if ( team->t.t_size_changed == -1 ) {
4820                 team->t.t_size_changed = 1;
4821             } else {
4822                 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4823             }
4824 
4825             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4826             kmp_r_sched_t new_sched = new_icvs->sched;
4827             if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4828                 team->t.t_sched.chunk != new_sched.chunk)
4829                 team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
4830 
4831             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4832 
4833             KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4834                            0, team->t.t_threads[0], team ) );
4835             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4836 
4837 #if OMP_40_ENABLED
4838 # if KMP_AFFINITY_SUPPORTED
4839             if ( ( team->t.t_size_changed == 0 )
4840               && ( team->t.t_proc_bind == new_proc_bind ) ) {
4841                 if (new_proc_bind == proc_bind_spread) {
4842                     __kmp_partition_places(team, 1); // add flag to update only master for spread
4843                 }
4844                 KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4845                   team->t.t_id, new_proc_bind, team->t.t_first_place,
4846                   team->t.t_last_place ) );
4847             }
4848             else {
4849                 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4850                 __kmp_partition_places( team );
4851             }
4852 # else
4853             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4854 # endif /* KMP_AFFINITY_SUPPORTED */
4855 #endif /* OMP_40_ENABLED */
4856         }
4857         else if( team->t.t_nproc > new_nproc ) {
4858             KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4859 
4860             team->t.t_size_changed = 1;
4861 #if KMP_NESTED_HOT_TEAMS
4862             if( __kmp_hot_teams_mode == 0 ) {
                // AC: the saved number of threads should correspond to the team's value in this mode;
                // it can be bigger in mode 1, when the hot team has some threads in reserve
4865                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4866                 hot_teams[level].hot_team_nth = new_nproc;
4867 #endif // KMP_NESTED_HOT_TEAMS
4868                 /* release the extra threads we don't need any more */
4869                 for( f = new_nproc  ;  f < team->t.t_nproc  ;  f++ ) {
4870                     KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4871                     if ( __kmp_tasking_mode != tskm_immediate_exec) {
4872                         // When decreasing team size, threads no longer in the team should unref task team.
4873                         team->t.t_threads[f]->th.th_task_team = NULL;
4874                     }
4875                     __kmp_free_thread( team->t.t_threads[ f ] );
4876                     team->t.t_threads[ f ] = NULL;
4877                 }
4878 #if KMP_NESTED_HOT_TEAMS
4879             } // (__kmp_hot_teams_mode == 0)
4880             else {
4881                 // When keeping extra threads in team, switch threads to wait on own b_go flag
4882                 for (f=new_nproc; f<team->t.t_nproc; ++f) {
4883                     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4884                     kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4885                     for (int b=0; b<bs_last_barrier; ++b) {
4886                         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4887                             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4888                         }
4889                         KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4890                     }
4891                 }
4892             }
4893 #endif // KMP_NESTED_HOT_TEAMS
4894             team->t.t_nproc =  new_nproc;
4895             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4896             if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4897                 team->t.t_sched.chunk != new_icvs->sched.chunk)
4898                 team->t.t_sched = new_icvs->sched;
4899             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4900 
4901             /* update the remaining threads */
4902             for(f = 0; f < new_nproc; ++f) {
4903                 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4904             }
4905             // restore the current task state of the master thread: should be the implicit task
4906             KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4907                        0, team->t.t_threads[0], team ) );
4908 
4909             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4910 
4911 #ifdef KMP_DEBUG
4912             for ( f = 0; f < team->t.t_nproc; f++ ) {
4913                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4914                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4915             }
4916 #endif
4917 
4918 #if OMP_40_ENABLED
4919             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4920 # if KMP_AFFINITY_SUPPORTED
4921             __kmp_partition_places( team );
4922 # endif
4923 #endif
4924         }
4925         else { // team->t.t_nproc < new_nproc
4926 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4927             kmp_affin_mask_t *old_mask;
4928             if ( KMP_AFFINITY_CAPABLE() ) {
4929                 KMP_CPU_ALLOC(old_mask);
4930             }
4931 #endif
4932 
4933             KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4934 
4935             team->t.t_size_changed = 1;
4936 
4937 #if KMP_NESTED_HOT_TEAMS
4938             int avail_threads = hot_teams[level].hot_team_nth;
4939             if( new_nproc < avail_threads )
4940                 avail_threads = new_nproc;
4941             kmp_info_t **other_threads = team->t.t_threads;
4942             for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4943                 // Adjust barrier data of reserved threads (if any) of the team
4944                 // Other data will be set in __kmp_initialize_info() below.
4945                 int b;
4946                 kmp_balign_t * balign = other_threads[f]->th.th_bar;
4947                 for ( b = 0; b < bs_last_barrier; ++ b ) {
4948                     balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4949                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4950 #if USE_DEBUGGER
4951                     balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4952 #endif
4953                 }
4954             }
4955             if( hot_teams[level].hot_team_nth >= new_nproc ) {
                // we have all needed threads in reserve, no need to allocate any
                // this is only possible in mode 1; there cannot be reserved threads in mode 0
4958                 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4959                 team->t.t_nproc = new_nproc;                     // just get reserved threads involved
4960             } else {
4961                 // we may have some threads in reserve, but not enough
4962                 team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4963                 hot_teams[level].hot_team_nth = new_nproc;       // adjust hot team max size
4964 #endif // KMP_NESTED_HOT_TEAMS
4965             if(team->t.t_max_nproc < new_nproc) {
4966                 /* reallocate larger arrays */
4967                 __kmp_reallocate_team_arrays(team, new_nproc);
4968                 __kmp_reinitialize_team( team, new_icvs, NULL );
4969             }
4970 
4971 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4972             /* Temporarily set full mask for master thread before
4973                creation of workers. The reason is that workers inherit
               the affinity from the master, so if a lot of workers are
               created on a single core quickly, they don't get
4976                a chance to set their own affinity for a long time.
4977             */
4978             __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4979 #endif
4980 
4981             /* allocate new threads for the hot team */
4982             for( f = team->t.t_nproc  ;  f < new_nproc  ;  f++ ) {
4983                 kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4984                 KMP_DEBUG_ASSERT( new_worker );
4985                 team->t.t_threads[ f ] = new_worker;
4986 
4987                 KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d arrived: join=%llu, plain=%llu\n",
4988                                 team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4989                                 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4990                                 team->t.t_bar[bs_plain_barrier].b_arrived ) );
4991 
4992                 { // Initialize barrier data for new threads.
4993                     int b;
4994                     kmp_balign_t * balign = new_worker->th.th_bar;
4995                     for( b = 0; b < bs_last_barrier; ++ b ) {
4996                         balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
4997                         KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4998 #if USE_DEBUGGER
4999                         balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5000 #endif
5001                     }
5002                 }
5003             }
5004 
5005 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5006             if ( KMP_AFFINITY_CAPABLE() ) {
5007                 /* Restore initial master thread's affinity mask */
5008                 __kmp_set_system_affinity( old_mask, TRUE );
5009                 KMP_CPU_FREE(old_mask);
5010             }
5011 #endif
5012 #if KMP_NESTED_HOT_TEAMS
5013             } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5014 #endif // KMP_NESTED_HOT_TEAMS
            /* make sure everyone is synchronized */
5016             int old_nproc = team->t.t_nproc; // save old value and use to update only new threads below
5017             __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5018 
5019             /* reinitialize the threads */
5020             KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5021             for (f=0;  f < team->t.t_nproc; ++f)
5022                 __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5023             if (level) { // set th_task_state for new threads in nested hot team
5024                 // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
5025                 // th_task_state for the new threads. th_task_state for master thread will not be accurate until
5026                 // after this in __kmp_fork_call(), so we look to the master's memo_stack to get the correct value.
5027                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5028                     team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5029             }
5030             else { // set th_task_state for new threads in non-nested hot team
5031                 int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
5032                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5033                     team->t.t_threads[f]->th.th_task_state = old_state;
5034             }
5035 
5036 #ifdef KMP_DEBUG
5037             for ( f = 0; f < team->t.t_nproc; ++ f ) {
5038                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5039                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5040             }
5041 #endif
5042 
5043 #if OMP_40_ENABLED
5044             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5045 # if KMP_AFFINITY_SUPPORTED
5046             __kmp_partition_places( team );
5047 # endif
5048 #endif
5049         } // Check changes in number of threads
5050 
5051 #if OMP_40_ENABLED
5052         kmp_info_t *master = team->t.t_threads[0];
5053         if( master->th.th_teams_microtask ) {
5054             for( f = 1; f < new_nproc; ++f ) {
5055                 // propagate teams construct specific info to workers
5056                 kmp_info_t *thr = team->t.t_threads[f];
5057                 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5058                 thr->th.th_teams_level     = master->th.th_teams_level;
5059                 thr->th.th_teams_size      = master->th.th_teams_size;
5060             }
5061         }
5062 #endif /* OMP_40_ENABLED */
5063 #if KMP_NESTED_HOT_TEAMS
5064         if( level ) {
5065             // Sync barrier state for nested hot teams, not needed for outermost hot team.
5066             for( f = 1; f < new_nproc; ++f ) {
5067                 kmp_info_t *thr = team->t.t_threads[f];
5068                 int b;
5069                 kmp_balign_t * balign = thr->th.th_bar;
5070                 for( b = 0; b < bs_last_barrier; ++ b ) {
5071                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
5072                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5073 #if USE_DEBUGGER
5074                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5075 #endif
5076                 }
5077             }
5078         }
5079 #endif // KMP_NESTED_HOT_TEAMS
5080 
5081         /* reallocate space for arguments if necessary */
5082         __kmp_alloc_argv_entries( argc, team, TRUE );
5083         KMP_CHECK_UPDATE(team->t.t_argc, argc);
5084         //
5085         // The hot team re-uses the previous task team,
5086         // if untouched during the previous release->gather phase.
5087         //
5088 
5089         KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5090 
5091 #if KMP_DEBUG
5092         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5093             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5094                            team->t.t_task_team[0], team->t.t_task_team[1] ));
5095         }
5096 #endif
5097 
5098 #if OMPT_SUPPORT
5099         __ompt_team_assign_id(team, ompt_parallel_id);
5100 #endif
5101 
5102         KMP_MB();
5103 
5104         return team;
5105     }
5106 
5107     /* next, let's try to take one from the team pool */
5108     KMP_MB();
5109     for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5110     {
5111         /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5112         if ( team->t.t_max_nproc >= max_nproc ) {
5113             /* take this team from the team pool */
5114             __kmp_team_pool = team->t.t_next_pool;
5115 
5116             /* setup the team for fresh use */
5117             __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5118 
5119             KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5120                             &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5121             team->t.t_task_team[0] = NULL;
5122             team->t.t_task_team[1] = NULL;
5123 
5124             /* reallocate space for arguments if necessary */
5125             __kmp_alloc_argv_entries( argc, team, TRUE );
5126             KMP_CHECK_UPDATE(team->t.t_argc, argc);
5127 
5128             KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5129                             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5130             { // Initialize barrier data.
5131                 int b;
5132                 for ( b = 0; b < bs_last_barrier; ++ b) {
5133                     team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5134 #if USE_DEBUGGER
5135                     team->t.t_bar[ b ].b_master_arrived = 0;
5136                     team->t.t_bar[ b ].b_team_arrived   = 0;
5137 #endif
5138                 }
5139             }
5140 
5141 #if OMP_40_ENABLED
5142             team->t.t_proc_bind = new_proc_bind;
5143 #endif
5144 
5145             KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5146 
5147 #if OMPT_SUPPORT
5148             __ompt_team_assign_id(team, ompt_parallel_id);
5149 #endif
5150 
5151             KMP_MB();
5152 
5153             return team;
5154         }
5155 
5156         /* reap team if it is too small, then loop back and check the next one */
        /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5158         /* TODO: Use technique to find the right size hot-team, don't reap them */
5159         team =  __kmp_reap_team( team );
5160         __kmp_team_pool = team;
5161     }
5162 
5163     /* nothing available in the pool, no matter, make a new team! */
5164     KMP_MB();
5165     team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5166 
5167     /* and set it up */
5168     team->t.t_max_nproc   = max_nproc;
5169     /* NOTE well, for some reason allocating one big buffer and dividing it
     * up seems to really hurt performance a lot on the P4, so let's not use
5171      * this... */
5172     __kmp_allocate_team_arrays( team, max_nproc );
5173 
5174     KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5175     __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5176 
5177     KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5178                     &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5179     team->t.t_task_team[0] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5180     team->t.t_task_team[1] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5181 
5182     if ( __kmp_storage_map ) {
5183         __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5184     }
5185 
5186     /* allocate space for arguments */
5187     __kmp_alloc_argv_entries( argc, team, FALSE );
5188     team->t.t_argc        = argc;
5189 
5190     KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5191                     team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5192     { // Initialize barrier data.
5193         int b;
5194         for ( b = 0; b < bs_last_barrier; ++ b ) {
5195             team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5196 #if USE_DEBUGGER
5197             team->t.t_bar[ b ].b_master_arrived = 0;
5198             team->t.t_bar[ b ].b_team_arrived   = 0;
5199 #endif
5200         }
5201     }
5202 
5203 #if OMP_40_ENABLED
5204     team->t.t_proc_bind = new_proc_bind;
5205 #endif
5206 
5207 #if OMPT_SUPPORT
5208     __ompt_team_assign_id(team, ompt_parallel_id);
5209     team->t.ompt_serialized_team_info = NULL;
5210 #endif
5211 
5212     KMP_MB();
5213 
5214     KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5215 
5216     return team;
5217 }
5218 
5219 /* TODO implement hot-teams at all levels */
5220 /* TODO implement lazy thread release on demand (disband request) */
5221 
5222 /* free the team.  return it to the team pool.  release all the threads
5223  * associated with it */
5224 void
5225 __kmp_free_team( kmp_root_t *root, kmp_team_t *team  USE_NESTED_HOT_ARG(kmp_info_t *master) )
5226 {
5227     int f;
5228     KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5229 
5230     /* verify state */
5231     KMP_DEBUG_ASSERT( root );
5232     KMP_DEBUG_ASSERT( team );
5233     KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5234     KMP_DEBUG_ASSERT( team->t.t_threads );
5235 
5236     int use_hot_team = team == root->r.r_hot_team;
5237 #if KMP_NESTED_HOT_TEAMS
5238     int level;
5239     kmp_hot_team_ptr_t *hot_teams;
5240     if( master ) {
5241         level = team->t.t_active_level - 1;
5242         if( master->th.th_teams_microtask ) {                         // in teams construct?
5243             if( master->th.th_teams_size.nteams > 1 ) {
5244                ++level; // level was not increased in teams construct for team_of_masters
5245             }
5246             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5247                 master->th.th_teams_level == team->t.t_level ) {
5248                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
5249             }            // team->t.t_level will be increased inside parallel
5250         }
5251         hot_teams = master->th.th_hot_teams;
5252         if( level < __kmp_hot_teams_max_level ) {
5253             KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5254             use_hot_team = 1;
5255         }
5256     }
5257 #endif // KMP_NESTED_HOT_TEAMS
5258 
5259     /* team is done working */
5260     TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5261     team->t.t_copyin_counter = 0; // init counter for possible reuse
5262     // Do not reset pointer to parent team to NULL for hot teams.
5263 
    /* if this is not a hot team, release its threads */
5265     if( ! use_hot_team ) {
5266         if (__kmp_tasking_mode != tskm_immediate_exec) {
5267             // Wait for threads to reach reapable state
5268             for (f = 1; f < team->t.t_nproc; ++f) {
5269                 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5270                 kmp_info_t *th = team->t.t_threads[f];
5271                 volatile kmp_uint32 *state = &th->th.th_reap_state;
5272                 while (*state != KMP_SAFE_TO_REAP) {
5273 #if KMP_OS_WINDOWS
5274                     // On Windows a thread can be killed at any time, check this
5275                     DWORD ecode;
5276                     if (!__kmp_is_thread_alive(th, &ecode)) {
5277                         *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5278                         break;
5279                     }
5280 #endif
5281                     // first check if thread is sleeping
5282                     kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5283                     if (fl.is_sleeping())
5284                         fl.resume(__kmp_gtid_from_thread(th));
5285                     KMP_CPU_PAUSE();
5286                 }
5287             }
5288 
5289             // Delete task teams
5290             int tt_idx;
5291             for (tt_idx=0; tt_idx<2; ++tt_idx) {
5292                 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5293                 if ( task_team != NULL ) {
5294                     for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5295                         team->t.t_threads[f]->th.th_task_team = NULL;
5296                     }
5297                     KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5298 #if KMP_NESTED_HOT_TEAMS
5299                     __kmp_free_task_team( master, task_team );
5300 #endif
5301                     team->t.t_task_team[tt_idx] = NULL;
5302                 }
5303             }
5304         }
5305 
5306         // Reset pointer to parent team only for non-hot teams.
5307         team->t.t_parent = NULL;
5308         team->t.t_level = 0;
5309         team->t.t_active_level = 0;
5310 
5311         /* free the worker threads */
5312         for ( f = 1; f < team->t.t_nproc; ++ f ) {
5313             KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5314             __kmp_free_thread( team->t.t_threads[ f ] );
5315             team->t.t_threads[ f ] = NULL;
5316         }
5317 
5318         /* put the team back in the team pool */
5319         /* TODO limit size of team pool, call reap_team if pool too large */
5320         team->t.t_next_pool  = (kmp_team_t*) __kmp_team_pool;
5321         __kmp_team_pool        = (volatile kmp_team_t*) team;
5322     }
5323 
5324     KMP_MB();
5325 }
5326 
5327 
5328 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5329 kmp_team_t *
5330 __kmp_reap_team( kmp_team_t *team )
5331 {
5332     kmp_team_t *next_pool = team->t.t_next_pool;
5333 
5334     KMP_DEBUG_ASSERT( team );
5335     KMP_DEBUG_ASSERT( team->t.t_dispatch    );
5336     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5337     KMP_DEBUG_ASSERT( team->t.t_threads     );
5338     KMP_DEBUG_ASSERT( team->t.t_argv        );
5339 
5340     /* TODO clean the threads that are a part of this? */
5341 
5342     /* free stuff */
5343 
5344     __kmp_free_team_arrays( team );
5345     if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5346         __kmp_free( (void*) team->t.t_argv );
5347     __kmp_free( team );
5348 
5349     KMP_MB();
5350     return next_pool;
5351 }
5352 
5353 //
5354 // Free the thread.  Don't reap it, just place it on the pool of available
5355 // threads.
5356 //
5357 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5358 // binding for the affinity mechanism to be useful.
5359 //
5360 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5361 // However, we want to avoid a potential performance problem by always
5362 // scanning through the list to find the correct point at which to insert
5363 // the thread (potential N**2 behavior).  To do this we keep track of the
5364 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5365 // With single-level parallelism, threads will always be added to the tail
5366 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5367 // parallelism, all bets are off and we may need to scan through the entire
5368 // free list.
5369 //
5370 // This change also has a potentially large performance benefit, for some
5371 // applications.  Previously, as threads were freed from the hot team, they
5372 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
5374 // back on the hot team in reverse order.  This could cause bad cache
5375 // locality problems on programs where the size of the hot team regularly
5376 // grew and shrunk.
5377 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
5379 //
5380 void
5381 __kmp_free_thread( kmp_info_t *this_th )
5382 {
5383     int gtid;
5384     kmp_info_t **scan;
5385 
5386     KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5387                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5388 
5389     KMP_DEBUG_ASSERT( this_th );
5390 
    // When moving the thread to the pool, switch it to waiting on its own b_go flag and clear its barrier team pointers (NULL team).
5392     int b;
5393     kmp_balign_t *balign = this_th->th.th_bar;
5394     for (b=0; b<bs_last_barrier; ++b) {
5395         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5396             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5397         balign[b].bb.team = NULL;
5398         balign[b].bb.leaf_kids = 0;
5399     }
5400     this_th->th.th_task_state = 0;
5401 
5402     /* put thread back on the free pool */
5403     TCW_PTR(this_th->th.th_team, NULL);
5404     TCW_PTR(this_th->th.th_root, NULL);
5405     TCW_PTR(this_th->th.th_dispatch, NULL);               /* NOT NEEDED */
5406 
5407     //
5408     // If the __kmp_thread_pool_insert_pt is already past the new insert
5409     // point, then we need to re-scan the entire list.
5410     //
5411     gtid = this_th->th.th_info.ds.ds_gtid;
5412     if ( __kmp_thread_pool_insert_pt != NULL ) {
5413         KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5414         if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5415              __kmp_thread_pool_insert_pt = NULL;
5416         }
5417     }
5418 
5419     //
5420     // Scan down the list to find the place to insert the thread.
5421     // scan is the address of a link in the list, possibly the address of
5422     // __kmp_thread_pool itself.
5423     //
    // In the absence of nested parallelism, the for loop will have 0 iterations.
5425     //
5426     if ( __kmp_thread_pool_insert_pt != NULL ) {
5427         scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5428     }
5429     else {
5430         scan = (kmp_info_t **)&__kmp_thread_pool;
5431     }
5432     for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5433       scan = &( (*scan)->th.th_next_pool ) );
5434 
5435     //
5436     // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5437     // to its address.
5438     //
5439     TCW_PTR(this_th->th.th_next_pool, *scan);
5440     __kmp_thread_pool_insert_pt = *scan = this_th;
5441     KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5442       || ( this_th->th.th_info.ds.ds_gtid
5443       < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5444     TCW_4(this_th->th.th_in_pool, TRUE);
5445     __kmp_thread_pool_nth++;
5446 
5447     TCW_4(__kmp_nth, __kmp_nth - 1);
5448 
5449 #ifdef KMP_ADJUST_BLOCKTIME
5450     /* Adjust blocktime back to user setting or default if necessary */
5451     /* Middle initialization might never have occurred                */
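    /* __kmp_zero_bt appears to force an effective blocktime of zero while the
       process is oversubscribed; once __kmp_nth is back within __kmp_avail_proc,
       spinning before sleeping is allowed again (hence the reset below).        */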
5452     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5453         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5454         if ( __kmp_nth <= __kmp_avail_proc ) {
5455             __kmp_zero_bt = FALSE;
5456         }
5457     }
5458 #endif /* KMP_ADJUST_BLOCKTIME */
5459 
5460     KMP_MB();
5461 }
5462 
5463 
5464 /* ------------------------------------------------------------------------ */
5465 
5466 void *
5467 __kmp_launch_thread( kmp_info_t *this_thr )
5468 {
5469     int                   gtid = this_thr->th.th_info.ds.ds_gtid;
5470 /*    void                 *stack_data;*/
5471     kmp_team_t *(*volatile pteam);
5472 
5473     KMP_MB();
5474     KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5475 
5476     if( __kmp_env_consistency_check ) {
5477         this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid );  // ATT: Memory leak?
5478     }
5479 
5480 #if OMPT_SUPPORT
5481     if (ompt_enabled) {
5482         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5483         this_thr->th.ompt_thread_info.wait_id = 0;
5484         this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5485         if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5486             __ompt_thread_begin(ompt_thread_worker, gtid);
5487         }
5488     }
5489 #endif
5490 
5491     /* This is the place where threads wait for work */
5492     while( ! TCR_4(__kmp_global.g.g_done) ) {
5493         KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5494         KMP_MB();
5495 
5496         /* wait for work to do */
5497         KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5498 
5499 #if OMPT_SUPPORT
5500         if (ompt_enabled) {
5501             this_thr->th.ompt_thread_info.state = ompt_state_idle;
5502         }
5503 #endif
5504 
5505         /* No tid yet since not part of a team */
5506         __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5507 
5508 #if OMPT_SUPPORT
5509         if (ompt_enabled) {
5510             this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5511         }
5512 #endif
5513 
5514         pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
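        // pteam points at this thread's th_team field rather than caching its
        // value, so each TCR_SYNC_PTR(*pteam) below re-reads the field and sees
        // the team most recently assigned to this thread at the fork barrier.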
5515 
5516         /* have we been allocated? */
5517         if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5518 #if OMPT_SUPPORT
5519             ompt_task_info_t *task_info;
5520             ompt_parallel_id_t my_parallel_id;
5521             if (ompt_enabled) {
5522                 task_info = __ompt_get_taskinfo(0);
5523                 my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5524             }
5525 #endif
5526             /* we were just woken up, so run our new task */
5527             if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5528                 int rc;
5529                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5530                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5531 
5532                 updateHWFPControl (*pteam);
5533 
5534 #if OMPT_SUPPORT
5535                 if (ompt_enabled) {
5536                     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5537                     // Initialize OMPT task id for implicit task.
5538                     int tid = __kmp_tid_from_gtid(gtid);
5539                     task_info->task_id = __ompt_task_id_new(tid);
5540                 }
5541 #endif
5542 
5543                 {
5544                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5545                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5546                     rc = (*pteam)->t.t_invoke( gtid );
5547                 }
5548                 KMP_ASSERT( rc );
5549 
5550 #if OMPT_SUPPORT
5551                 if (ompt_enabled) {
5552                     /* no frame set while outside task */
5553                     task_info->frame.exit_runtime_frame = NULL;
5554 
5555                     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5556                 }
5557 #endif
5558                 KMP_MB();
5559                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5560                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5561             }
5562             /* join barrier after parallel region */
5563             __kmp_join_barrier( gtid );
5564 #if OMPT_SUPPORT && OMPT_TRACE
5565             if (ompt_enabled) {
5566                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5567                     // don't access *pteam here: it may have already been freed
5568                     // by the master thread behind the barrier (possible race)
5569                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5570                         my_parallel_id, task_info->task_id);
5571                 }
5572                 task_info->frame.exit_runtime_frame = NULL;
5573                 task_info->task_id = 0;
5574             }
5575 #endif
5576         }
5577     }
5578     TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5579 
5580 #if OMPT_SUPPORT
5581     if (ompt_enabled &&
5582         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5583         __ompt_thread_end(ompt_thread_worker, gtid);
5584     }
5585 #endif
5586 
5587     this_thr->th.th_task_team = NULL;
5588     /* run the destructors for the threadprivate data for this thread */
5589     __kmp_common_destroy_gtid( gtid );
5590 
5591     KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5592     KMP_MB();
5593     return this_thr;
5594 }
5595 
5596 /* ------------------------------------------------------------------------ */
5597 /* ------------------------------------------------------------------------ */
5598 
5599 void
5600 __kmp_internal_end_dest( void *specific_gtid )
5601 {
5602     #if KMP_COMPILER_ICC
5603         #pragma warning( push )
5604         #pragma warning( disable:  810 ) // conversion from "void *" to "int" may lose significant bits
5605     #endif
5606     // Make sure no significant bits are lost
5607     int gtid = (kmp_intptr_t)specific_gtid - 1;
5608     #if KMP_COMPILER_ICC
5609         #pragma warning( pop )
5610     #endif
5611 
5612     KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
    /* NOTE: the gtid is stored as gtid+1 in thread-local storage
     * because 0 is reserved for the nothing-stored case */
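    /* For example, a thread registered with gtid 5 stores (void *)6 in its
       thread-specific slot, hence the subtraction of one above. */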
5615 
5616     /* josh: One reason for setting the gtid specific data even when it is being
5617        destroyed by pthread is to allow gtid lookup through thread specific data
5618        (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5619        that gets executed in the call to __kmp_internal_end_thread, actually
5620        gets the gtid through the thread specific data.  Setting it here seems
5621        rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5622        to run smoothly.
5623        todo: get rid of this after we remove the dependence on
5624        __kmp_gtid_get_specific
5625     */
5626     if(gtid >= 0 && KMP_UBER_GTID(gtid))
5627         __kmp_gtid_set_specific( gtid );
5628     #ifdef KMP_TDATA_GTID
5629         __kmp_gtid = gtid;
5630     #endif
5631     __kmp_internal_end_thread( gtid );
5632 }
5633 
5634 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5635 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors work
// perfectly, but in the real libomp.so I have no evidence it is ever called. However, the -fini linker
// option in makefile.mk works fine.
5639 
5640 __attribute__(( destructor ))
5641 void
5642 __kmp_internal_end_dtor( void )
5643 {
5644     __kmp_internal_end_atexit();
5645 }
5646 
5647 void
5648 __kmp_internal_end_fini( void )
5649 {
5650     __kmp_internal_end_atexit();
5651 }
5652 
5653 #endif
5654 
5655 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5656 void
5657 __kmp_internal_end_atexit( void )
5658 {
5659     KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5660     /* [Windows]
       josh: ideally, we want to completely shut down the library in this atexit handler, but
       stat code that depends on thread-specific data for the gtid fails because that data becomes
       unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
       instead.  We should eventually remove the dependency on __kmp_get_specific_gtid in the
       stat code and use __kmp_internal_end_library to cleanly shut down the library.
5666 
5667 // TODO: Can some of this comment about GVS be removed?
5668        I suspect that the offending stat code is executed when the calling thread tries to
5669        clean up a dead root thread's data structures, resulting in GVS code trying to close
5670        the GVS structures for that thread, but since the stat code uses
5671        __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5672        cleaning up itself instead of another thread, it gets confused.  This happens because
5673        allowing a thread to unregister and cleanup another thread is a recent modification for
5674        addressing an issue with Maxon Cinema4D.  Based on the current design (20050722), a
5675        thread may end up trying to unregister another thread only if thread death does not
5676        trigger the calling of __kmp_internal_end_thread.  For Linux* OS, there is the thread
5677        specific data destructor function to detect thread death.  For Windows dynamic, there
5678        is DllMain(THREAD_DETACH).  For Windows static, there is nothing.  Thus, the
5679        workaround is applicable only for Windows static stat library.
5680     */
5681     __kmp_internal_end_library( -1 );
5682     #if KMP_OS_WINDOWS
5683         __kmp_close_console();
5684     #endif
5685 }
5686 
5687 static void
5688 __kmp_reap_thread(
5689     kmp_info_t * thread,
5690     int is_root
5691 ) {
5692 
5693     // It is assumed __kmp_forkjoin_lock is acquired.
5694 
5695     int gtid;
5696 
5697     KMP_DEBUG_ASSERT( thread != NULL );
5698 
5699     gtid = thread->th.th_info.ds.ds_gtid;
5700 
5701     if ( ! is_root ) {
5702 
5703         if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5704             /* Assume the threads are at the fork barrier here */
5705             KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5706             /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5707             ANNOTATE_HAPPENS_BEFORE(thread);
5708             kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5709             __kmp_release_64(&flag);
5710         }; // if
5711 
5712         // Terminate OS thread.
5713         __kmp_reap_worker( thread );
5714 
5715         //
5716         // The thread was killed asynchronously.  If it was actively
5717         // spinning in the thread pool, decrement the global count.
5718         //
5719         // There is a small timing hole here - if the worker thread was
        // just waking up after sleeping in the pool, had reset its
5721         // th_active_in_pool flag but not decremented the global counter
5722         // __kmp_thread_pool_active_nth yet, then the global counter
5723         // might not get updated.
5724         //
5725         // Currently, this can only happen as the library is unloaded,
5726         // so there are no harmful side effects.
5727         //
5728         if ( thread->th.th_active_in_pool ) {
5729             thread->th.th_active_in_pool = FALSE;
5730             KMP_TEST_THEN_DEC32(
5731               (kmp_int32 *) &__kmp_thread_pool_active_nth );
5732             KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5733         }
5734 
5735         // Decrement # of [worker] threads in the pool.
5736         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5737         --__kmp_thread_pool_nth;
5738     }; // if
5739 
5740     __kmp_free_implicit_task(thread);
5741 
5742     // Free the fast memory for tasking
5743     #if USE_FAST_MEMORY
5744         __kmp_free_fast_memory( thread );
5745     #endif /* USE_FAST_MEMORY */
5746 
5747     __kmp_suspend_uninitialize_thread( thread );
5748 
5749     KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5750     TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5751 
5752     -- __kmp_all_nth;
    // __kmp_nth was decremented when the thread was added to the pool.
5754 
5755 #ifdef KMP_ADJUST_BLOCKTIME
5756     /* Adjust blocktime back to user setting or default if necessary */
5757     /* Middle initialization might never have occurred                */
5758     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5759         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5760         if ( __kmp_nth <= __kmp_avail_proc ) {
5761             __kmp_zero_bt = FALSE;
5762         }
5763     }
5764 #endif /* KMP_ADJUST_BLOCKTIME */
5765 
5766     /* free the memory being used */
5767     if( __kmp_env_consistency_check ) {
5768         if ( thread->th.th_cons ) {
5769             __kmp_free_cons_stack( thread->th.th_cons );
5770             thread->th.th_cons = NULL;
5771         }; // if
5772     }
5773 
5774     if ( thread->th.th_pri_common != NULL ) {
5775         __kmp_free( thread->th.th_pri_common );
5776         thread->th.th_pri_common = NULL;
5777     }; // if
5778 
5779     if (thread->th.th_task_state_memo_stack != NULL) {
5780         __kmp_free(thread->th.th_task_state_memo_stack);
5781         thread->th.th_task_state_memo_stack = NULL;
5782     }
5783 
5784     #if KMP_USE_BGET
5785         if ( thread->th.th_local.bget_data != NULL ) {
5786             __kmp_finalize_bget( thread );
5787         }; // if
5788     #endif
5789 
5790 #if KMP_AFFINITY_SUPPORTED
5791     if ( thread->th.th_affin_mask != NULL ) {
5792         KMP_CPU_FREE( thread->th.th_affin_mask );
5793         thread->th.th_affin_mask = NULL;
5794     }; // if
5795 #endif /* KMP_AFFINITY_SUPPORTED */
5796 
5797     __kmp_reap_team( thread->th.th_serial_team );
5798     thread->th.th_serial_team = NULL;
5799     __kmp_free( thread );
5800 
5801     KMP_MB();
5802 
5803 } // __kmp_reap_thread
5804 
5805 static void
5806 __kmp_internal_end(void)
5807 {
5808     int i;
5809 
5810     /* First, unregister the library */
5811     __kmp_unregister_library();
5812 
5813     #if KMP_OS_WINDOWS
5814         /* In Win static library, we can't tell when a root actually dies, so we
5815            reclaim the data structures for any root threads that have died but not
5816            unregistered themselves, in order to shut down cleanly.
5817            In Win dynamic library we also can't tell when a thread dies.
5818         */
5819         __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5820     #endif
5821 
5822     for( i=0 ; i<__kmp_threads_capacity ; i++ )
5823         if( __kmp_root[i] )
5824             if( __kmp_root[i]->r.r_active )
5825                 break;
5826     KMP_MB();       /* Flush all pending memory write invalidates.  */
5827     TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5828 
5829     if ( i < __kmp_threads_capacity ) {
5830 #if KMP_USE_MONITOR
5831         // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5832         KMP_MB();       /* Flush all pending memory write invalidates.  */
5833 
5834         //
5835         // Need to check that monitor was initialized before reaping it.
        // If we are called from __kmp_atfork_child (which sets
5837         // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5838         // contain valid data, but it is only valid in the parent process,
5839         // not the child.
5840         //
5841         // New behavior (201008): instead of keying off of the flag
5842         // __kmp_init_parallel, the monitor thread creation is keyed off
5843         // of the new flag __kmp_init_monitor.
5844         //
5845         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5846         if ( TCR_4( __kmp_init_monitor ) ) {
5847             __kmp_reap_monitor( & __kmp_monitor );
5848             TCW_4( __kmp_init_monitor, 0 );
5849         }
5850         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5851         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5852 #endif // KMP_USE_MONITOR
5853     } else {
5854         /* TODO move this to cleanup code */
5855         #ifdef KMP_DEBUG
5856             /* make sure that everything has properly ended */
5857             for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5858                 if( __kmp_root[i] ) {
5859 //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC: there can be uber threads alive here
5860                     KMP_ASSERT( ! __kmp_root[i]->r.r_active );  // TODO: can they be active?
5861                 }
5862             }
5863         #endif
5864 
5865         KMP_MB();
5866 
5867         // Reap the worker threads.
5868         // This is valid for now, but be careful if threads are reaped sooner.
        while ( __kmp_thread_pool != NULL ) {    // Loop thru all the threads in the pool.
5870             // Get the next thread from the pool.
5871             kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5872             __kmp_thread_pool = thread->th.th_next_pool;
5873             // Reap it.
5874             KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5875             thread->th.th_next_pool = NULL;
5876             thread->th.th_in_pool = FALSE;
5877             __kmp_reap_thread( thread, 0 );
5878         }; // while
5879         __kmp_thread_pool_insert_pt = NULL;
5880 
5881         // Reap teams.
5882         while ( __kmp_team_pool != NULL ) {     // Loop thru all the teams in the pool.
5883             // Get the next team from the pool.
5884             kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5885             __kmp_team_pool = team->t.t_next_pool;
5886             // Reap it.
5887             team->t.t_next_pool = NULL;
5888             __kmp_reap_team( team );
5889         }; // while
5890 
5891         __kmp_reap_task_teams( );
5892 
5893         for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5894             // TBD: Add some checking...
5895             // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5896         }
5897 
5898         /* Make sure all threadprivate destructors get run by joining with all worker
5899            threads before resetting this flag */
5900         TCW_SYNC_4(__kmp_init_common, FALSE);
5901 
5902         KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5903         KMP_MB();
5904 
5905 #if KMP_USE_MONITOR
5906         //
5907         // See note above: One of the possible fixes for CQ138434 / CQ140126
5908         //
5909         // FIXME: push both code fragments down and CSE them?
5910         // push them into __kmp_cleanup() ?
5911         //
5912         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5913         if ( TCR_4( __kmp_init_monitor ) ) {
5914             __kmp_reap_monitor( & __kmp_monitor );
5915             TCW_4( __kmp_init_monitor, 0 );
5916         }
5917         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5918         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5919 #endif
5920     } /* else !__kmp_global.t_active */
5921     TCW_4(__kmp_init_gtid, FALSE);
5922     KMP_MB();       /* Flush all pending memory write invalidates.  */
5923 
5924     __kmp_cleanup();
5925 #if OMPT_SUPPORT
5926     ompt_fini();
5927 #endif
5928 }
5929 
5930 void
5931 __kmp_internal_end_library( int gtid_req )
5932 {
5933     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5934     /* this shouldn't be a race condition because __kmp_internal_end() is the
     * only place to clear __kmp_init_serial */
5936     /* we'll check this later too, after we get the lock */
    // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5938     // because the next check will work in any case.
5939     if( __kmp_global.g.g_abort ) {
5940         KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5941         /* TODO abort? */
5942         return;
5943     }
5944     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5945         KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5946         return;
5947     }
5948 
5949 
5950     KMP_MB();       /* Flush all pending memory write invalidates.  */
5951 
5952     /* find out who we are and what we should do */
5953     {
5954         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5955         KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req ));
5956         if( gtid == KMP_GTID_SHUTDOWN ) {
5957             KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5958             return;
5959         } else if( gtid == KMP_GTID_MONITOR ) {
5960             KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5961             return;
5962         } else if( gtid == KMP_GTID_DNE ) {
5963             KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
            /* we don't know who we are, but we may still shut down the library */
5965         } else if( KMP_UBER_GTID( gtid )) {
5966             /* unregister ourselves as an uber thread.  gtid is no longer valid */
5967             if( __kmp_root[gtid]->r.r_active ) {
5968                 __kmp_global.g.g_abort = -1;
5969                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5970                 KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5971                 return;
5972             } else {
5973                 KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5974                 __kmp_unregister_root_current_thread( gtid );
5975             }
5976         } else {
5977             /* worker threads may call this function through the atexit handler, if they call exit() */
5978             /* For now, skip the usual subsequent processing and just dump the debug buffer.
5979                TODO: do a thorough shutdown instead
5980             */
5981             #ifdef DUMP_DEBUG_ON_EXIT
5982                 if ( __kmp_debug_buf )
5983                     __kmp_dump_debug_buffer( );
5984             #endif
5985             return;
5986         }
5987     }
5988     /* synchronize the termination process */
5989     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5990 
5991     /* have we already finished */
5992     if( __kmp_global.g.g_abort ) {
5993         KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5994         /* TODO abort? */
5995         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5996         return;
5997     }
5998     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5999         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6000         return;
6001     }
6002 
    /* We need this lock to enforce mutual exclusion between this read of
       __kmp_threads_capacity and the write by __kmp_register_root.
6005        Alternatively, we can use a counter of roots that is
6006        atomically updated by __kmp_get_global_thread_id_reg,
6007        __kmp_do_serial_initialize and __kmp_internal_end_*.
6008     */
6009     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6010 
6011     /* now we can safely conduct the actual termination */
6012     __kmp_internal_end();
6013 
6014     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6015     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6016 
6017     KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
6018 
6019     #ifdef DUMP_DEBUG_ON_EXIT
6020         if ( __kmp_debug_buf )
6021             __kmp_dump_debug_buffer();
6022     #endif
6023 
6024     #if KMP_OS_WINDOWS
6025         __kmp_close_console();
6026     #endif
6027 
6028     __kmp_fini_allocator();
6029 
6030 } // __kmp_internal_end_library
6031 
6032 void
6033 __kmp_internal_end_thread( int gtid_req )
6034 {
6035     int i;
6036 
6037     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6038     /* this shouldn't be a race condition because __kmp_internal_end() is the
     * only place to clear __kmp_init_serial */
6040     /* we'll check this later too, after we get the lock */
6041     // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6042     // because the next check will work in any case.
6043     if( __kmp_global.g.g_abort ) {
6044         KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6045         /* TODO abort? */
6046         return;
6047     }
6048     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6049         KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6050         return;
6051     }
6052 
6053     KMP_MB();       /* Flush all pending memory write invalidates.  */
6054 
6055     /* find out who we are and what we should do */
6056     {
6057         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6058         KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req ));
6059         if( gtid == KMP_GTID_SHUTDOWN ) {
6060             KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6061             return;
6062         } else if( gtid == KMP_GTID_MONITOR ) {
6063             KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6064             return;
6065         } else if( gtid == KMP_GTID_DNE ) {
6066             KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6067             return;
6068             /* we don't know who we are */
6069         } else if( KMP_UBER_GTID( gtid )) {
6070         /* unregister ourselves as an uber thread.  gtid is no longer valid */
6071             if( __kmp_root[gtid]->r.r_active ) {
6072                 __kmp_global.g.g_abort = -1;
6073                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6074                 KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6075                 return;
6076             } else {
6077                 KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6078                 __kmp_unregister_root_current_thread( gtid );
6079             }
6080         } else {
6081             /* just a worker thread, let's leave */
6082             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6083 
6084             if ( gtid >= 0 ) {
6085                 __kmp_threads[gtid]->th.th_task_team = NULL;
6086             }
6087 
6088             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6089             return;
6090         }
6091     }
6092     #if defined KMP_DYNAMIC_LIB
    // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
    //     because it is better to shut down later in the library destructor.
    //     The reason for this change is a performance problem when a non-OpenMP thread
    //     in a loop forks and joins many OpenMP threads. We can save a lot of time by
    //     keeping worker threads alive until the program shutdown.
6098     // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6099     //     Windows(DPD200287443) that occurs when using critical sections from foreign threads.
6100         KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6101         return;
6102     #endif
6103     /* synchronize the termination process */
6104     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6105 
6106     /* have we already finished */
6107     if( __kmp_global.g.g_abort ) {
6108         KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6109         /* TODO abort? */
6110         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6111         return;
6112     }
6113     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6114         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6115         return;
6116     }
6117 
    /* We need this lock to enforce mutual exclusion between this read of
       __kmp_threads_capacity and the write by __kmp_register_root.
6120        Alternatively, we can use a counter of roots that is
6121        atomically updated by __kmp_get_global_thread_id_reg,
6122        __kmp_do_serial_initialize and __kmp_internal_end_*.
6123     */
6124 
6125     /* should we finish the run-time?  are all siblings done? */
6126     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6127 
6128     for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6129         if ( KMP_UBER_GTID( i ) ) {
6130             KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6131             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6132             __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6133             return;
6134         };
6135     }
6136 
6137     /* now we can safely conduct the actual termination */
6138 
6139     __kmp_internal_end();
6140 
6141     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6142     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6143 
6144     KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6145 
6146     #ifdef DUMP_DEBUG_ON_EXIT
6147         if ( __kmp_debug_buf )
6148             __kmp_dump_debug_buffer();
6149     #endif
6150 } // __kmp_internal_end_thread
6151 
6152 // -------------------------------------------------------------------------------------------------
6153 // Library registration stuff.
6154 
6155 static long   __kmp_registration_flag = 0;
6156     // Random value used to indicate library initialization.
6157 static char * __kmp_registration_str  = NULL;
6158     // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6159 
6160 
6161 static inline
6162 char *
6163 __kmp_reg_status_name() {
6164     /*
6165         On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
6166         If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
6167         the name of registered_lib_env env var can not be found, because the name will contain different pid.
6168     */
6169     return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
} // __kmp_reg_status_name
6171 
6172 
6173 void
6174 __kmp_register_library_startup(
6175     void
6176 ) {
6177 
6178     char * name   = __kmp_reg_status_name();  // Name of the environment variable.
6179     int    done   = 0;
6180     union {
6181         double dtime;
6182         long   ltime;
6183     } time;
6184     #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6185         __kmp_initialize_system_tick();
6186     #endif
6187     __kmp_read_system_time( & time.dtime );
6188     __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6189     __kmp_registration_str =
6190         __kmp_str_format(
6191             "%p-%lx-%s",
6192             & __kmp_registration_flag,
6193             __kmp_registration_flag,
6194             KMP_LIBRARY_FILE
6195         );
6196 
6197     KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6198 
6199     while ( ! done ) {
6200 
6201         char * value  = NULL; // Actual value of the environment variable.
6202 
        // Set the environment variable, but do not overwrite it if it already exists.
6204         __kmp_env_set( name, __kmp_registration_str, 0 );
        // Check that the variable was written.
6206         value = __kmp_env_get( name );
6207         if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6208 
6209             done = 1;    // Ok, environment variable set successfully, exit the loop.
6210 
6211         } else {
6212 
6213             // Oops. Write failed. Another copy of OpenMP RTL is in memory.
            // Check whether it is alive or dead.
6215             int    neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6216             char * tail          = value;
6217             char * flag_addr_str = NULL;
6218             char * flag_val_str  = NULL;
6219             char const * file_name     = NULL;
6220             __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6221             __kmp_str_split( tail, '-', & flag_val_str,  & tail );
6222             file_name = tail;
6223             if ( tail != NULL ) {
6224                 long * flag_addr = 0;
6225                 long   flag_val  = 0;
6226                 KMP_SSCANF( flag_addr_str, "%p",  & flag_addr );
6227                 KMP_SSCANF( flag_val_str,  "%lx", & flag_val  );
6228                 if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
                    // First, check whether the environment-encoded address is mapped into the address space.
6230                     // If so, dereference it to see if it still has the right value.
6231 
6232                     if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6233                         neighbor = 1;
6234                     } else {
6235                         // If not, then we know the other copy of the library is no longer running.
6236                         neighbor = 2;
6237                     }; // if
6238                 }; // if
6239             }; // if
6240             switch ( neighbor ) {
6241                 case 0 :      // Cannot parse environment variable -- neighbor status unknown.
                    // Assume it is the incompatible format of a future version of the library.
6243                     // Assume the other library is alive.
6244                     // WARN( ... ); // TODO: Issue a warning.
6245                     file_name = "unknown library";
                    // Attention! Falling through to the next case. That's intentional.
6247                 case 1 : {    // Neighbor is alive.
6248                     // Check it is allowed.
6249                     char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6250                     if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6251                         // That's not allowed. Issue fatal error.
6252                         __kmp_msg(
6253                             kmp_ms_fatal,
6254                             KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6255                             KMP_HNT( DuplicateLibrary ),
6256                             __kmp_msg_null
6257                         );
6258                     }; // if
6259                     KMP_INTERNAL_FREE( duplicate_ok );
6260                     __kmp_duplicate_library_ok = 1;
6261                     done = 1;    // Exit the loop.
6262                 } break;
6263                 case 2 : {    // Neighbor is dead.
6264                     // Clear the variable and try to register library again.
6265                     __kmp_env_unset( name );
6266                 }  break;
6267                 default : {
6268                     KMP_DEBUG_ASSERT( 0 );
6269                 } break;
6270             }; // switch
6271 
6272         }; // if
6273         KMP_INTERNAL_FREE( (void *) value );
6274 
6275     }; // while
6276     KMP_INTERNAL_FREE( (void *) name );
6277 
6278 } // func __kmp_register_library_startup
6279 
6280 
6281 void
6282 __kmp_unregister_library( void ) {
6283 
6284     char * name  = __kmp_reg_status_name();
6285     char * value = __kmp_env_get( name );
6286 
6287     KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6288     KMP_DEBUG_ASSERT( __kmp_registration_str  != NULL );
6289     if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6290         // Ok, this is our variable. Delete it.
6291         __kmp_env_unset( name );
6292     }; // if
6293 
6294     KMP_INTERNAL_FREE( __kmp_registration_str );
6295     KMP_INTERNAL_FREE( value );
6296     KMP_INTERNAL_FREE( name );
6297 
6298     __kmp_registration_flag = 0;
6299     __kmp_registration_str  = NULL;
6300 
6301 } // __kmp_unregister_library
6302 
6303 
6304 // End of Library registration stuff.
6305 // -------------------------------------------------------------------------------------------------
6306 
6307 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6308 
6309 static void __kmp_check_mic_type()
6310 {
6311     kmp_cpuid_t cpuid_state = {0};
6312     kmp_cpuid_t * cs_p = &cpuid_state;
6313     __kmp_x86_cpuid(1, 0, cs_p);
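    // CPUID leaf 1 reports family/model in EAX: bits 11:8 hold the family, bits
    // 7:4 the model and bits 19:16 the extended model.  The masks below select
    // those fields: (eax & 0xff0) == 0xB10 matches family 0x0B, model 1 (KNC),
    // while (eax & 0xf0ff0) == 0x50670 matches family 6, extended model 5,
    // model 7 (i.e. display model 0x57, presumably KNL).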
6314     // We don't support mic1 at the moment
6315     if( (cs_p->eax & 0xff0) == 0xB10 ) {
6316         __kmp_mic_type = mic2;
6317     } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6318         __kmp_mic_type = mic3;
6319     } else {
6320         __kmp_mic_type = non_mic;
6321     }
6322 }
6323 
6324 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
6325 
6326 static void
6327 __kmp_do_serial_initialize( void )
6328 {
6329     int i, gtid;
6330     int size;
6331 
6332     KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6333 
6334     KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6335     KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6336     KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6337     KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6338     KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6339 
6340 #if OMPT_SUPPORT
6341     ompt_pre_init();
6342 #endif
6343 
6344     __kmp_validate_locks();
6345 
6346     /* Initialize internal memory allocator */
6347     __kmp_init_allocator();
6348 
6349     /* Register the library startup via an environment variable
6350        and check to see whether another copy of the library is already
6351        registered. */
6352 
6353     __kmp_register_library_startup( );
6354 
6355     /* TODO reinitialization of library */
6356     if( TCR_4(__kmp_global.g.g_done) ) {
6357        KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6358     }
6359 
6360     __kmp_global.g.g_abort = 0;
6361     TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6362 
6363     /* initialize the locks */
6364 #if KMP_USE_ADAPTIVE_LOCKS
6365 #if KMP_DEBUG_ADAPTIVE_LOCKS
6366     __kmp_init_speculative_stats();
6367 #endif
6368 #endif
6369 #if KMP_STATS_ENABLED
6370     __kmp_stats_init();
6371 #endif
6372     __kmp_init_lock( & __kmp_global_lock     );
6373     __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6374     __kmp_init_lock( & __kmp_debug_lock      );
6375     __kmp_init_atomic_lock( & __kmp_atomic_lock     );
6376     __kmp_init_atomic_lock( & __kmp_atomic_lock_1i  );
6377     __kmp_init_atomic_lock( & __kmp_atomic_lock_2i  );
6378     __kmp_init_atomic_lock( & __kmp_atomic_lock_4i  );
6379     __kmp_init_atomic_lock( & __kmp_atomic_lock_4r  );
6380     __kmp_init_atomic_lock( & __kmp_atomic_lock_8i  );
6381     __kmp_init_atomic_lock( & __kmp_atomic_lock_8r  );
6382     __kmp_init_atomic_lock( & __kmp_atomic_lock_8c  );
6383     __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6384     __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6385     __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6386     __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6387     __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6388     __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock  );
6389     __kmp_init_bootstrap_lock( & __kmp_exit_lock      );
6390 #if KMP_USE_MONITOR
6391     __kmp_init_bootstrap_lock( & __kmp_monitor_lock   );
6392 #endif
6393     __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6394 
6395     /* conduct initialization and initial setup of configuration */
6396 
6397     __kmp_runtime_initialize();
6398 
6399 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6400     __kmp_check_mic_type();
6401 #endif
6402 
6403     // Some global variable initialization moved here from kmp_env_initialize()
6404 #ifdef KMP_DEBUG
6405     kmp_diag = 0;
6406 #endif
6407     __kmp_abort_delay = 0;
6408 
6409     // From __kmp_init_dflt_team_nth()
6410     /* assume the entire machine will be used */
6411     __kmp_dflt_team_nth_ub = __kmp_xproc;
6412     if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6413         __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6414     }
6415     if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6416         __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6417     }
6418     __kmp_max_nth = __kmp_sys_max_nth;
6419 
6420     // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6421     __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6422 #if KMP_USE_MONITOR
6423     __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6424     __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6425 #endif
6426     // From "KMP_LIBRARY" part of __kmp_env_initialize()
6427     __kmp_library = library_throughput;
6428     // From KMP_SCHEDULE initialization
6429     __kmp_static = kmp_sch_static_balanced;
    // AC: do not use analytical here, because it is non-monotonic
6431     //__kmp_guided = kmp_sch_guided_iterative_chunked;
    //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
    // Barrier initialization. Moved here from the barrier branch bit control and barrier method control
    // parts of __kmp_env_initialize().
6435     #if KMP_FAST_REDUCTION_BARRIER
6436         #define kmp_reduction_barrier_gather_bb ((int)1)
6437         #define kmp_reduction_barrier_release_bb ((int)1)
6438         #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6439         #define kmp_reduction_barrier_release_pat bp_hyper_bar
6440     #endif // KMP_FAST_REDUCTION_BARRIER
6441     for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6442         __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6443         __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6444         __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6445         __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6446         #if KMP_FAST_REDUCTION_BARRIER
6447         if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6448             __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6449             __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6450             __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6451             __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6452         }
6453         #endif // KMP_FAST_REDUCTION_BARRIER
6454     }
6455     #if KMP_FAST_REDUCTION_BARRIER
6456         #undef kmp_reduction_barrier_release_pat
6457         #undef kmp_reduction_barrier_gather_pat
6458         #undef kmp_reduction_barrier_release_bb
6459         #undef kmp_reduction_barrier_gather_bb
6460     #endif // KMP_FAST_REDUCTION_BARRIER
6461 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6462     if (__kmp_mic_type == mic2) { // KNC
        // AC: plain=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6464         __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3;  // plain gather
6465         __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1;  // forkjoin release
6466         __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6467         __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6468     }
6469 #if KMP_FAST_REDUCTION_BARRIER
6470     if (__kmp_mic_type == mic2) { // KNC
6471         __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6472         __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6473     }
6474 #endif
6475 #endif
6476 
6477     // From KMP_CHECKS initialization
6478 #ifdef KMP_DEBUG
6479     __kmp_env_checks = TRUE;   /* development versions have the extra checks */
6480 #else
6481     __kmp_env_checks = FALSE;  /* port versions do not have the extra checks */
6482 #endif
6483 
6484     // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6485     __kmp_foreign_tp = TRUE;
6486 
6487     __kmp_global.g.g_dynamic = FALSE;
6488     __kmp_global.g.g_dynamic_mode = dynamic_default;
6489 
6490     __kmp_env_initialize( NULL );
6491 
6492     // Print all messages in message catalog for testing purposes.
6493     #ifdef KMP_DEBUG
6494         char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6495         if ( __kmp_str_match_true( val ) ) {
6496             kmp_str_buf_t buffer;
6497             __kmp_str_buf_init( & buffer );
6498             __kmp_i18n_dump_catalog( & buffer );
6499             __kmp_printf( "%s", buffer.str );
6500             __kmp_str_buf_free( & buffer );
6501         }; // if
6502         __kmp_env_free( & val );
6503     #endif
6504 
6505     __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6506     // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6507     __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6508 
6509     // If the library is shut down properly, both pools must be NULL. Just in case, set them
6510     // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6511     KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6512     KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6513     KMP_DEBUG_ASSERT( __kmp_team_pool   == NULL );
6514     __kmp_thread_pool = NULL;
6515     __kmp_thread_pool_insert_pt = NULL;
6516     __kmp_team_pool   = NULL;
6517 
6518     /* Allocate all of the variable sized records */
6519     /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6520     /* Since allocation is cache-aligned, just add extra padding at the end */
6521     size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6522     __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6523     __kmp_root    = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
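    // NOTE: __kmp_root is not a separate allocation; it points just past the __kmp_threads
    // entries inside the same cache-aligned block, which is why __kmp_cleanup() frees only
    // __kmp_threads.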
6524 
6525     /* init thread counts */
6526     KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6527     KMP_DEBUG_ASSERT( __kmp_nth == 0 );     // something was wrong in termination.
6528     __kmp_all_nth = 0;
6529     __kmp_nth     = 0;
6530 
6531     /* setup the uber master thread and hierarchy */
6532     gtid = __kmp_register_root( TRUE );
6533     KA_TRACE( 10, ("__kmp_do_serial_initialize  T#%d\n", gtid ));
6534     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6535     KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6536 
6537     KMP_MB();       /* Flush all pending memory write invalidates.  */
6538 
6539     __kmp_common_initialize();
6540 
6541     #if KMP_OS_UNIX
6542         /* register the fork (atfork) handlers */
6543         __kmp_register_atfork();
6544     #endif
6545 
6546     #if ! defined KMP_DYNAMIC_LIB
6547         {
6548             /* Install the exit handler to run when the program finishes; this is needed only
6549                for the static library. For the dynamic library, we already have _fini and DllMain.
6550              */
6551             int rc = atexit( __kmp_internal_end_atexit );
6552             if ( rc != 0 ) {
6553                 __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6554             }; // if
6555         }
6556     #endif
6557 
6558     #if KMP_HANDLE_SIGNALS
6559         #if KMP_OS_UNIX
6560             /* NOTE: make sure that this is called before the user installs
6561              *          their own signal handlers so that the user handlers
6562              *          are called first. This way they can return false,
6563              *          skip calling our handler, avoid terminating the library,
6564              *          and continue execution where they left off. */
6565             __kmp_install_signals( FALSE );
6566         #endif /* KMP_OS_UNIX */
6567         #if KMP_OS_WINDOWS
6568             __kmp_install_signals( TRUE );
6569         #endif /* KMP_OS_WINDOWS */
6570     #endif
6571 
6572     /* we have finished the serial initialization */
6573     __kmp_init_counter ++;
6574 
6575     __kmp_init_serial = TRUE;
6576 
6577     if (__kmp_settings) {
6578         __kmp_env_print();
6579     }
6580 
6581 #if OMP_40_ENABLED
6582     if (__kmp_display_env || __kmp_display_env_verbose) {
6583         __kmp_env_print_2();
6584     }
6585 #endif // OMP_40_ENABLED
6586 
6587 #if OMPT_SUPPORT
6588     ompt_post_init();
6589 #endif
6590 
6591     KMP_MB();
6592 
6593     KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6594 }
6595 
6596 void
6597 __kmp_serial_initialize( void )
6598 {
6599     if ( __kmp_init_serial ) {
6600         return;
6601     }
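    // Double-checked locking: __kmp_init_serial is re-tested under __kmp_initz_lock below,
    // since another thread may have completed serial initialization between the unsynchronized
    // check above and acquiring the lock.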
6602     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6603     if ( __kmp_init_serial ) {
6604         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6605         return;
6606     }
6607     __kmp_do_serial_initialize();
6608     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6609 }
6610 
6611 static void
6612 __kmp_do_middle_initialize( void )
6613 {
6614     int i, j;
6615     int prev_dflt_team_nth;
6616 
6617     if( !__kmp_init_serial ) {
6618         __kmp_do_serial_initialize();
6619     }
6620 
6621     KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6622 
6623     //
6624     // Save the previous value for the __kmp_dflt_team_nth so that
6625     // we can avoid some reinitialization if it hasn't changed.
6626     //
6627     prev_dflt_team_nth = __kmp_dflt_team_nth;
6628 
6629 #if KMP_AFFINITY_SUPPORTED
6630     //
6631     // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6632     // number of cores on the machine.
6633     //
6634     __kmp_affinity_initialize();
6635 
6636     //
6637     // Run through the __kmp_threads array and set the affinity mask
6638     // for each root thread that is currently registered with the RTL.
6639     //
6640     for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6641         if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6642             __kmp_affinity_set_init_mask( i, TRUE );
6643         }
6644     }
6645 #endif /* KMP_AFFINITY_SUPPORTED */
6646 
6647     KMP_ASSERT( __kmp_xproc > 0 );
6648     if ( __kmp_avail_proc == 0 ) {
6649         __kmp_avail_proc = __kmp_xproc;
6650     }
6651 
6652     // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6653     j = 0;
6654     while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6655         __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6656         j++;
6657     }
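    // For example, OMP_NUM_THREADS=,,2,3 leaves the first two slots zero; the loop above fills
    // both with __kmp_avail_proc, which also becomes the default team size and its upper bound.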
6658 
6659     if ( __kmp_dflt_team_nth == 0 ) {
6660 #ifdef KMP_DFLT_NTH_CORES
6661         //
6662         // Default #threads = #cores
6663         //
6664         __kmp_dflt_team_nth = __kmp_ncores;
6665         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6666           __kmp_dflt_team_nth ) );
6667 #else
6668         //
6669         // Default #threads = #available OS procs
6670         //
6671         __kmp_dflt_team_nth = __kmp_avail_proc;
6672         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6673           __kmp_dflt_team_nth ) );
6674 #endif /* KMP_DFLT_NTH_CORES */
6675     }
6676 
6677     if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6678         __kmp_dflt_team_nth = KMP_MIN_NTH;
6679     }
6680     if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6681         __kmp_dflt_team_nth = __kmp_sys_max_nth;
6682     }
6683 
6684     //
6685     // There's no harm in continuing if the following check fails,
6686     // but it indicates an error in the previous logic.
6687     //
6688     KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6689 
6690     if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6691         //
6692         // Run through the __kmp_threads array and set the num threads icv
6693         // for each root thread that is currently registered with the RTL
6694         // (which has not already explicitly set its nthreads-var with a
6695         // call to omp_set_num_threads()).
6696         //
6697         for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6698             kmp_info_t *thread = __kmp_threads[ i ];
6699             if ( thread == NULL ) continue;
6700             if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6701 
6702             set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6703         }
6704     }
6705     KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6706       __kmp_dflt_team_nth) );
6707 
6708 #ifdef KMP_ADJUST_BLOCKTIME
6709     /* Adjust blocktime to zero if necessary */
6710     /* now that __kmp_avail_proc is set      */
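    /* Blocktime is forced to zero only when the user did not set it explicitly and the number
       of registered threads already exceeds the available processors -- presumably because
       spinning for the full blocktime under oversubscription would only waste cycles. */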
6711     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6712         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6713         if ( __kmp_nth > __kmp_avail_proc ) {
6714             __kmp_zero_bt = TRUE;
6715         }
6716     }
6717 #endif /* KMP_ADJUST_BLOCKTIME */
6718 
6719     /* we have finished middle initialization */
6720     TCW_SYNC_4(__kmp_init_middle, TRUE);
6721 
6722     KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6723 }
6724 
6725 void
6726 __kmp_middle_initialize( void )
6727 {
6728     if ( __kmp_init_middle ) {
6729         return;
6730     }
6731     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6732     if ( __kmp_init_middle ) {
6733         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6734         return;
6735     }
6736     __kmp_do_middle_initialize();
6737     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6738 }
6739 
6740 void
6741 __kmp_parallel_initialize( void )
6742 {
6743     int gtid = __kmp_entry_gtid();      // this might be a new root
6744 
6745     /* synchronize parallel initialization (for sibling) */
6746     if( TCR_4(__kmp_init_parallel) ) return;
6747     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6748     if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6749 
6750     /* TODO reinitialization after we have already shut down */
6751     if( TCR_4(__kmp_global.g.g_done) ) {
6752         KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6753         __kmp_infinite_loop();
6754     }
6755 
6756     /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize or
6757            __kmp_middle_initialize would cause a deadlock. So we call __kmp_do_middle_initialize
6758            directly (it calls __kmp_do_serial_initialize itself if serial init has not run yet). */
6759     if( !__kmp_init_middle ) {
6760         __kmp_do_middle_initialize();
6761     }
6762 
6763     /* begin initialization */
6764     KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6765     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6766 
6767 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6768     //
6769     // Save the FP control regs.
6770     // Worker threads will set theirs to these values at thread startup.
6771     //
6772     __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6773     __kmp_store_mxcsr( &__kmp_init_mxcsr );
6774     __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6775 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6776 
6777 #if KMP_OS_UNIX
6778 # if KMP_HANDLE_SIGNALS
6779     /*  must be after __kmp_serial_initialize  */
6780     __kmp_install_signals( TRUE );
6781 # endif
6782 #endif
6783 
6784     __kmp_suspend_initialize();
6785 
6786 #if defined(USE_LOAD_BALANCE)
6787     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6788         __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6789     }
6790 #else
6791     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6792         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6793     }
6794 #endif
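    // If no dynamic adjustment mode was chosen explicitly, default to load balancing (which
    // consults the measured system load; see __kmp_load_balance_nproc() below) when
    // USE_LOAD_BALANCE is available, and to the thread-limit algorithm otherwise.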
6795 
6796     if ( __kmp_version ) {
6797         __kmp_print_version_2();
6798     }
6799 
6800     /* we have finished parallel initialization */
6801     TCW_SYNC_4(__kmp_init_parallel, TRUE);
6802 
6803     KMP_MB();
6804     KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6805 
6806     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6807 }
6808 
6809 
6810 /* ------------------------------------------------------------------------ */
6811 
6812 void
6813 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6814   kmp_team_t *team )
6815 {
6816     kmp_disp_t *dispatch;
6817 
6818     KMP_MB();
6819 
6820     /* none of the threads have encountered any constructs, yet. */
6821     this_thr->th.th_local.this_construct = 0;
6822 #if KMP_CACHE_MANAGE
6823     KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6824 #endif /* KMP_CACHE_MANAGE */
6825     dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6826     KMP_DEBUG_ASSERT( dispatch );
6827     KMP_DEBUG_ASSERT( team->t.t_dispatch );
6828     //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6829 
6830     dispatch->th_disp_index = 0;    /* reset the dispatch buffer counter */
6831 #if OMP_45_ENABLED
6832     dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
6833 #endif
6834     if( __kmp_env_consistency_check )
6835         __kmp_push_parallel( gtid, team->t.t_ident );
6836 
6837     KMP_MB();       /* Flush all pending memory write invalidates.  */
6838 }
6839 
6840 void
6841 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6842   kmp_team_t *team )
6843 {
6844     if( __kmp_env_consistency_check )
6845         __kmp_pop_parallel( gtid, team->t.t_ident );
6846 
6847     __kmp_finish_implicit_task(this_thr);
6848 }
6849 
6850 int
6851 __kmp_invoke_task_func( int gtid )
6852 {
6853     int          rc;
6854     int          tid      = __kmp_tid_from_gtid( gtid );
6855     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6856     kmp_team_t  *team     = this_thr->th.th_team;
6857 
6858     __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6859 #if USE_ITT_BUILD
6860     if ( __itt_stack_caller_create_ptr ) {
6861         __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6862     }
6863 #endif /* USE_ITT_BUILD */
6864 #if INCLUDE_SSC_MARKS
6865     SSC_MARK_INVOKING();
6866 #endif
6867 
6868 #if OMPT_SUPPORT
6869     void *dummy;
6870     void **exit_runtime_p;
6871     ompt_task_id_t my_task_id;
6872     ompt_parallel_id_t my_parallel_id;
6873 
6874     if (ompt_enabled) {
6875         exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6876             ompt_task_info.frame.exit_runtime_frame);
6877     } else {
6878         exit_runtime_p = &dummy;
6879     }
6880 
6881 #if OMPT_TRACE
6882     my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6883     my_parallel_id = team->t.ompt_team_info.parallel_id;
6884     if (ompt_enabled &&
6885         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6886         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6887             my_parallel_id, my_task_id);
6888     }
6889 #endif
6890 #endif
6891 
6892     {
6893         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6894         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6895         rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6896                                      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6897 #if OMPT_SUPPORT
6898                                      , exit_runtime_p
6899 #endif
6900                                      );
6901 #if OMPT_SUPPORT
6902         *exit_runtime_p = NULL;
6903 #endif
6904     }
6905 
6906 #if USE_ITT_BUILD
6907     if ( __itt_stack_caller_create_ptr ) {
6908         __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6909     }
6910 #endif /* USE_ITT_BUILD */
6911     __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6912 
6913     return rc;
6914 }
6915 
6916 #if OMP_40_ENABLED
6917 void
6918 __kmp_teams_master( int gtid )
6919 {
6920     // This routine is called by all master threads in the teams construct
6921     kmp_info_t *thr = __kmp_threads[ gtid ];
6922     kmp_team_t *team = thr->th.th_team;
6923     ident_t     *loc =  team->t.t_ident;
6924     thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6925     KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6926     KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6927     KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6928                    gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6929     // Launch the league of teams now, but do not let workers execute
6930     // (they wait at the fork barrier until the next parallel region)
6931 #if INCLUDE_SSC_MARKS
6932     SSC_MARK_FORKING();
6933 #endif
6934     __kmp_fork_call( loc, gtid, fork_context_intel,
6935             team->t.t_argc,
6936 #if OMPT_SUPPORT
6937             (void *)thr->th.th_teams_microtask,      // "unwrapped" task
6938 #endif
6939             (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6940             VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6941             NULL );
6942 #if INCLUDE_SSC_MARKS
6943     SSC_MARK_JOINING();
6944 #endif
6945 
6946     // AC: last parameter "1" eliminates the join barrier, which would not work because
6947     // worker threads are still in the fork barrier waiting for more parallel regions
6948     __kmp_join_call( loc, gtid
6949 #if OMPT_SUPPORT
6950         , fork_context_intel
6951 #endif
6952         , 1 );
6953 }
6954 
6955 int
6956 __kmp_invoke_teams_master( int gtid )
6957 {
6958     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6959     kmp_team_t  *team     = this_thr->th.th_team;
6960     #if KMP_DEBUG
6961     if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6962         KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6963     #endif
6964     __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6965     __kmp_teams_master( gtid );
6966     __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6967     return 1;
6968 }
6969 #endif /* OMP_40_ENABLED */
6970 
6971 /* This sets the requested number of threads for the next parallel region
6972  * encountered by this team. */
6973 /* Since this should be enclosed in the fork/join critical section, it
6974  * should avoid race conditions with asymmetrical nested parallelism. */
6975 
6976 void
6977 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6978 {
6979     kmp_info_t *thr = __kmp_threads[gtid];
6980 
6981     if( num_threads > 0 )
6982         thr->th.th_set_nproc = num_threads;
6983 }
6984 
6985 #if OMP_40_ENABLED
6986 
6987 /* this sets the requested number of teams for the teams region and/or
6988  * the number of threads for the next parallel region encountered  */
6989 void
6990 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6991 {
6992     kmp_info_t *thr = __kmp_threads[gtid];
6993     KMP_DEBUG_ASSERT(num_teams >= 0);
6994     KMP_DEBUG_ASSERT(num_threads >= 0);
6995 
6996     if( num_teams == 0 )
6997         num_teams = 1;    // default number of teams is 1.
6998     if( num_teams > __kmp_max_nth ) { // too many teams requested
6999         if ( !__kmp_reserve_warn ) {
7000             __kmp_reserve_warn = 1;
7001             __kmp_msg(
7002                 kmp_ms_warning,
7003                 KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
7004                 KMP_HNT( Unset_ALL_THREADS ),
7005                 __kmp_msg_null
7006             );
7007         }
7008         num_teams = __kmp_max_nth;
7009     }
7010     // Set number of teams (number of threads in the outer "parallel" of the teams)
7011     thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7012 
7013     // Remember the number of threads for inner parallel regions
7014     if( num_threads == 0 ) {
7015         if( !TCR_4(__kmp_init_middle) )
7016             __kmp_middle_initialize();  // get __kmp_avail_proc calculated
7017         num_threads = __kmp_avail_proc / num_teams;
7018         if( num_teams * num_threads > __kmp_max_nth ) {
7019             // adjust num_threads w/o warning as it is not a user setting
7020             num_threads = __kmp_max_nth / num_teams;
7021         }
7022     } else {
7023         if( num_teams * num_threads > __kmp_max_nth ) {
7024             int new_threads = __kmp_max_nth / num_teams;
7025             if ( !__kmp_reserve_warn ) { // user asked for too many threads
7026                 __kmp_reserve_warn = 1;  // that conflicts with OMP_THREAD_LIMIT
7027                 __kmp_msg(
7028                     kmp_ms_warning,
7029                     KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
7030                     KMP_HNT( Unset_ALL_THREADS ),
7031                     __kmp_msg_null
7032                 );
7033             }
7034             num_threads = new_threads;
7035         }
7036     }
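    // For example, with __kmp_avail_proc == 16, __kmp_max_nth == 32, and num_teams == 4: an
    // unspecified num_threads defaults to 16 / 4 == 4, while an explicit num_threads of 16 is
    // reduced (with a one-time warning) to 32 / 4 == 8 so that num_teams * num_threads stays
    // within __kmp_max_nth.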
7037     thr->th.th_teams_size.nth = num_threads;
7038 }
7039 
7040 
7041 //
7042 // Set the proc_bind var to use in the following parallel region.
7043 //
7044 void
7045 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7046 {
7047     kmp_info_t *thr = __kmp_threads[gtid];
7048     thr->th.th_set_proc_bind = proc_bind;
7049 }
7050 
7051 #endif /* OMP_40_ENABLED */
7052 
7053 /* Launch the worker threads into the microtask. */
7054 
7055 void
7056 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7057 {
7058     kmp_info_t *this_thr = __kmp_threads[gtid];
7059 
7060 #ifdef KMP_DEBUG
7061     int f;
7062 #endif /* KMP_DEBUG */
7063 
7064     KMP_DEBUG_ASSERT( team );
7065     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7066     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7067     KMP_MB();       /* Flush all pending memory write invalidates.  */
7068 
7069     team->t.t_construct = 0;          /* no single directives seen yet */
7070     team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7071 
7072     /* Reset the identifiers on the dispatch buffer */
7073     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7074     if ( team->t.t_max_nproc > 1 ) {
7075         int i;
7076         for (i = 0; i <  __kmp_dispatch_num_buffers; ++i) {
7077             team->t.t_disp_buffer[ i ].buffer_index = i;
7078 #if OMP_45_ENABLED
7079             team->t.t_disp_buffer[i].doacross_buf_idx = i;
7080 #endif
7081         }
7082     } else {
7083         team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7084 #if OMP_45_ENABLED
7085         team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7086 #endif
7087     }
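    // Presumably the distinct starting indices let successive dynamically scheduled loops
    // rotate through the __kmp_dispatch_num_buffers buffers, with buffer_index identifying
    // which in-flight loop a buffer currently belongs to.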
7088 
7089     KMP_MB();       /* Flush all pending memory write invalidates.  */
7090     KMP_ASSERT( this_thr->th.th_team  ==  team );
7091 
7092 #ifdef KMP_DEBUG
7093     for( f=0 ; f<team->t.t_nproc ; f++ ) {
7094         KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7095                           team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7096     }
7097 #endif /* KMP_DEBUG */
7098 
7099     /* release the worker threads so they may begin working */
7100     __kmp_fork_barrier( gtid, 0 );
7101 }
7102 
7103 
7104 void
7105 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7106 {
7107     kmp_info_t *this_thr = __kmp_threads[gtid];
7108 
7109     KMP_DEBUG_ASSERT( team );
7110     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7111     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7112     KMP_MB();       /* Flush all pending memory write invalidates.  */
7113 
7114     /* Join barrier after fork */
7115 
7116 #ifdef KMP_DEBUG
7117     if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7118         __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7119         __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7120                      gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7121         __kmp_print_structure();
7122     }
7123     KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7124                      __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7125 #endif /* KMP_DEBUG */
7126 
7127     __kmp_join_barrier( gtid );  /* wait for everyone */
7128 
7129     KMP_MB();       /* Flush all pending memory write invalidates.  */
7130     KMP_ASSERT( this_thr->th.th_team  ==  team );
7131 }
7132 
7133 
7134 /* ------------------------------------------------------------------------ */
7135 /* ------------------------------------------------------------------------ */
7136 
7137 #ifdef USE_LOAD_BALANCE
7138 
7139 //
7140 // Return the number of worker threads actively spinning in the hot team, if we
7141 // are at the outermost level of parallelism.  Otherwise, return 0.
7142 //
7143 static int
7144 __kmp_active_hot_team_nproc( kmp_root_t *root )
7145 {
7146     int i;
7147     int retval;
7148     kmp_team_t *hot_team;
7149 
7150     if ( root->r.r_active ) {
7151         return 0;
7152     }
7153     hot_team = root->r.r_hot_team;
7154     if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7155         return hot_team->t.t_nproc - 1;  // Don't count master thread
7156     }
7157 
7158     //
7159     // Skip the master thread - it is accounted for elsewhere.
7160     //
7161     retval = 0;
7162     for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7163         if ( hot_team->t.t_threads[i]->th.th_active ) {
7164             retval++;
7165         }
7166     }
7167     return retval;
7168 }
7169 
7170 //
7171 // Perform an automatic adjustment to the number of
7172 // threads used by the next parallel region.
7173 //
7174 static int
7175 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7176 {
7177     int retval;
7178     int pool_active;
7179     int hot_team_active;
7180     int team_curr_active;
7181     int system_active;
7182 
7183     KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7184                 root, set_nproc ) );
7185     KMP_DEBUG_ASSERT( root );
7186     KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7187     KMP_DEBUG_ASSERT( set_nproc > 1 );
7188 
7189     if ( set_nproc == 1) {
7190         KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7191         return 1;
7192     }
7193 
7194     //
7195     // Threads that are active in the thread pool, active in the hot team
7196     // for this particular root (if we are at the outer par level), and
7197     // the currently executing thread (to become the master) are available
7198     // to add to the new team, but are currently contributing to the system
7199     // load, and must be accounted for.
7200     //
7201     pool_active = TCR_4(__kmp_thread_pool_active_nth);
7202     hot_team_active = __kmp_active_hot_team_nproc( root );
7203     team_curr_active = pool_active + hot_team_active + 1;
7204 
7205     //
7206     // Check the system load.
7207     //
7208     system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7209     KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7210       system_active, pool_active, hot_team_active ) );
7211 
7212     if ( system_active < 0 ) {
7213         //
7214         // There was an error reading the necessary info from /proc,
7215         // so use the thread limit algorithm instead.  Once we set
7216         // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7217         // we shouldn't wind up getting back here.
7218         //
7219         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7220         KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7221 
7222         //
7223         // Make this call behave like the thread limit algorithm.
7224         //
7225         retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7226           : root->r.r_hot_team->t.t_nproc);
7227         if ( retval > set_nproc ) {
7228             retval = set_nproc;
7229         }
7230         if ( retval < KMP_MIN_NTH ) {
7231             retval = KMP_MIN_NTH;
7232         }
7233 
7234         KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7235         return retval;
7236     }
7237 
7238     //
7239     // There is a slight delay in the load balance algorithm in detecting
7240     // new running procs.  The real system load at this instant should be
7241     // at least as large as the number of active OMP threads that are available to
7242     // add to the team.
7243     //
7244     if ( system_active < team_curr_active ) {
7245         system_active = team_curr_active;
7246     }
7247     retval = __kmp_avail_proc - system_active + team_curr_active;
7248     if ( retval > set_nproc ) {
7249         retval = set_nproc;
7250     }
7251     if ( retval < KMP_MIN_NTH ) {
7252         retval = KMP_MIN_NTH;
7253     }
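    // For example, with __kmp_avail_proc == 8, system_active == 5, and team_curr_active == 3
    // (pool + hot team + the current thread), retval starts at 8 - 5 + 3 == 6 and is then
    // clipped into the [KMP_MIN_NTH, set_nproc] range above.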
7254 
7255     KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7256     return retval;
7257 } // __kmp_load_balance_nproc()
7258 
7259 #endif /* USE_LOAD_BALANCE */
7260 
7261 /* ------------------------------------------------------------------------ */
7262 /* ------------------------------------------------------------------------ */
7263 
7264 /* NOTE: this is called with the __kmp_init_lock held */
7265 void
7266 __kmp_cleanup( void )
7267 {
7268     int f;
7269 
7270     KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7271 
7272     if (TCR_4(__kmp_init_parallel)) {
7273 #if KMP_HANDLE_SIGNALS
7274         __kmp_remove_signals();
7275 #endif
7276         TCW_4(__kmp_init_parallel, FALSE);
7277     }
7278 
7279     if (TCR_4(__kmp_init_middle)) {
7280 #if KMP_AFFINITY_SUPPORTED
7281         __kmp_affinity_uninitialize();
7282 #endif /* KMP_AFFINITY_SUPPORTED */
7283         __kmp_cleanup_hierarchy();
7284         TCW_4(__kmp_init_middle, FALSE);
7285     }
7286 
7287     KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7288 
7289     if (__kmp_init_serial) {
7290         __kmp_runtime_destroy();
7291         __kmp_init_serial = FALSE;
7292     }
7293 
7294     for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7295         if ( __kmp_root[ f ] != NULL ) {
7296             __kmp_free( __kmp_root[ f ] );
7297             __kmp_root[ f ] = NULL;
7298         }
7299     }
7300     __kmp_free( __kmp_threads );
7301     // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
7302     // to free __kmp_root separately.
7303     __kmp_threads = NULL;
7304     __kmp_root    = NULL;
7305     __kmp_threads_capacity = 0;
7306 
7307 #if KMP_USE_DYNAMIC_LOCK
7308     __kmp_cleanup_indirect_user_locks();
7309 #else
7310     __kmp_cleanup_user_locks();
7311 #endif
7312 
7313     #if KMP_AFFINITY_SUPPORTED
7314         KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7315         __kmp_cpuinfo_file = NULL;
7316     #endif /* KMP_AFFINITY_SUPPORTED */
7317 
7318     #if KMP_USE_ADAPTIVE_LOCKS
7319     #if KMP_DEBUG_ADAPTIVE_LOCKS
7320         __kmp_print_speculative_stats();
7321     #endif
7322     #endif
7323     KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7324     __kmp_nested_nth.nth = NULL;
7325     __kmp_nested_nth.size = 0;
7326     __kmp_nested_nth.used = 0;
7327     KMP_INTERNAL_FREE( __kmp_nested_proc_bind.bind_types );
7328     __kmp_nested_proc_bind.bind_types = NULL;
7329     __kmp_nested_proc_bind.size = 0;
7330     __kmp_nested_proc_bind.used = 0;
7331 
7332     __kmp_i18n_catclose();
7333 
7334 #if KMP_STATS_ENABLED
7335     __kmp_stats_fini();
7336 #endif
7337 
7338     KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7339 }
7340 
7341 /* ------------------------------------------------------------------------ */
7342 /* ------------------------------------------------------------------------ */
7343 
7344 int
7345 __kmp_ignore_mppbeg( void )
7346 {
7347     char *env;
7348 
7349     if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7350         if (__kmp_str_match_false( env ))
7351             return FALSE;
7352     }
7353     // By default __kmpc_begin() is no-op.
7354     return TRUE;
7355 }
7356 
7357 int
7358 __kmp_ignore_mppend( void )
7359 {
7360     char *env;
7361 
7362     if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7363         if (__kmp_str_match_false( env ))
7364             return FALSE;
7365     }
7366     // By default __kmpc_end() is no-op.
7367     return TRUE;
7368 }
7369 
7370 void
7371 __kmp_internal_begin( void )
7372 {
7373     int gtid;
7374     kmp_root_t *root;
7375 
7376     /* this is a very important step as it will register new sibling threads
7377      * and assign each of these new uber threads its own gtid */
7378     gtid = __kmp_entry_gtid();
7379     root = __kmp_threads[ gtid ]->th.th_root;
7380     KMP_ASSERT( KMP_UBER_GTID( gtid ));
7381 
7382     if( root->r.r_begin ) return;
7383     __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7384     if( root->r.r_begin ) {
7385         __kmp_release_lock( & root->r.r_begin_lock, gtid );
7386         return;
7387     }
7388 
7389     root->r.r_begin = TRUE;
7390 
7391     __kmp_release_lock( & root->r.r_begin_lock, gtid );
7392 }
7393 
7394 
7395 /* ------------------------------------------------------------------------ */
7396 /* ------------------------------------------------------------------------ */
7397 
7398 void
7399 __kmp_user_set_library (enum library_type arg)
7400 {
7401     int gtid;
7402     kmp_root_t *root;
7403     kmp_info_t *thread;
7404 
7405     /* first, make sure we are initialized so we can get our gtid */
7406 
7407     gtid = __kmp_entry_gtid();
7408     thread = __kmp_threads[ gtid ];
7409 
7410     root = thread->th.th_root;
7411 
7412     KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7413     if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7414         KMP_WARNING( SetLibraryIncorrectCall );
7415         return;
7416     }
7417 
7418     switch ( arg ) {
7419     case library_serial :
7420         thread->th.th_set_nproc = 0;
7421         set__nproc( thread, 1 );
7422         break;
7423     case library_turnaround :
7424         thread->th.th_set_nproc = 0;
7425         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7426         break;
7427     case library_throughput :
7428         thread->th.th_set_nproc = 0;
7429         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7430         break;
7431     default:
7432         KMP_FATAL( UnknownLibraryType, arg );
7433     }
7434 
7435     __kmp_aux_set_library ( arg );
7436 }
7437 
7438 void
7439 __kmp_aux_set_stacksize( size_t arg )
7440 {
7441     if (! __kmp_init_serial)
7442         __kmp_serial_initialize();
7443 
7444 #if KMP_OS_DARWIN
7445     if (arg & (0x1000 - 1)) {
7446         arg &= ~(0x1000 - 1);
7447         if(arg + 0x1000) /* check that rounding up does not overflow to zero */
7448             arg += 0x1000;
7449     }
7450 #endif
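    /* For example, on Darwin a request of 0x2345 bytes is rounded up above to 0x3000, the next
       0x1000-byte page boundary, before being range-checked below. */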
7451     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7452 
7453     /* only change the default stacksize before the first parallel region */
7454     if (! TCR_4(__kmp_init_parallel)) {
7455         size_t value = arg;       /* argument is in bytes */
7456 
7457         if (value < __kmp_sys_min_stksize )
7458             value = __kmp_sys_min_stksize ;
7459         else if (value > KMP_MAX_STKSIZE)
7460             value = KMP_MAX_STKSIZE;
7461 
7462         __kmp_stksize = value;
7463 
7464         __kmp_env_stksize = TRUE;    /* was KMP_STACKSIZE specified? */
7465     }
7466 
7467     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7468 }
7469 
7470 /* set the behaviour of the runtime library */
7471 /* TODO this can cause some odd behaviour with sibling parallelism... */
7472 void
7473 __kmp_aux_set_library (enum library_type arg)
7474 {
7475     __kmp_library = arg;
7476 
7477     switch ( __kmp_library ) {
7478     case library_serial :
7479         {
7480             KMP_INFORM( LibraryIsSerial );
7481             (void) __kmp_change_library( TRUE );
7482         }
7483         break;
7484     case library_turnaround :
7485         (void) __kmp_change_library( TRUE );
7486         break;
7487     case library_throughput :
7488         (void) __kmp_change_library( FALSE );
7489         break;
7490     default:
7491         KMP_FATAL( UnknownLibraryType, arg );
7492     }
7493 }
7494 
7495 /* ------------------------------------------------------------------------ */
7496 /* ------------------------------------------------------------------------ */
7497 
7498 void
7499 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7500 {
7501     int blocktime = arg;        /* argument is in milliseconds */
7502 #if KMP_USE_MONITOR
7503     int bt_intervals;
7504 #endif
7505     int bt_set;
7506 
7507     __kmp_save_internal_controls( thread );
7508 
7509     /* Normalize and set blocktime for the teams */
7510     if (blocktime < KMP_MIN_BLOCKTIME)
7511         blocktime = KMP_MIN_BLOCKTIME;
7512     else if (blocktime > KMP_MAX_BLOCKTIME)
7513         blocktime = KMP_MAX_BLOCKTIME;
7514 
7515     set__blocktime_team( thread->th.th_team, tid, blocktime );
7516     set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7517 
7518 #if KMP_USE_MONITOR
7519     /* Calculate and set blocktime intervals for the teams */
7520     bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7521 
7522     set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7523     set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7524 #endif
7525 
7526     /* Record that blocktime has been explicitly set */
7527     bt_set = TRUE;
7528 
7529     set__bt_set_team( thread->th.th_team, tid, bt_set );
7530     set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7531 #if KMP_USE_MONITOR
7532     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7533                   "bt_intervals=%d, monitor_updates=%d\n",
7534                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7535                   thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7536                   __kmp_monitor_wakeups));
7537 #else
7538     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7539                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7540                   thread->th.th_team->t.t_id, tid, blocktime));
7541 #endif
7542 }
7543 
7544 void
7545 __kmp_aux_set_defaults(
7546     char const * str,
7547     int          len
7548 ) {
7549     if ( ! __kmp_init_serial ) {
7550         __kmp_serial_initialize();
7551     };
7552     __kmp_env_initialize( str );
7553 
7554     if (__kmp_settings
7555 #if OMP_40_ENABLED
7556         || __kmp_display_env || __kmp_display_env_verbose
7557 #endif // OMP_40_ENABLED
7558         ) {
7559         __kmp_env_print();
7560     }
7561 } // __kmp_aux_set_defaults
7562 
7563 /* ------------------------------------------------------------------------ */
7564 
7565 /*
7566  * internal fast reduction routines
7567  */
7568 
7569 PACKED_REDUCTION_METHOD_T
7570 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7571         kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7572         kmp_critical_name *lck )
7573 {
7574 
7575     // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7576     // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7577     // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7578     // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7579 
7580     PACKED_REDUCTION_METHOD_T retval;
7581 
7582     int team_size;
7583 
7584     KMP_DEBUG_ASSERT( loc );    // it would be nice to test ( loc != 0 )
7585     KMP_DEBUG_ASSERT( lck );    // it would be nice to test ( lck != 0 )
7586 
7587     #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7588     #define FAST_REDUCTION_TREE_METHOD_GENERATED   ( ( reduce_data ) && ( reduce_func ) )
7589 
7590     retval = critical_reduce_block;
7591 
7592     team_size = __kmp_get_team_num_threads( global_tid ); // another choice of getting a team size ( with 1 dynamic dereference ) is slower
7593 
7594     if( team_size == 1 ) {
7595 
7596         retval = empty_reduce_block;
7597 
7598     } else {
7599 
7600         int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7601         int tree_available   = FAST_REDUCTION_TREE_METHOD_GENERATED;
7602 
7603         #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7604 
7605             #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7606 
7607                 int teamsize_cutoff = 4;
7608 
7609 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7610                 if( __kmp_mic_type != non_mic ) {
7611                     teamsize_cutoff = 8;
7612                 }
7613 #endif
7614                 if( tree_available ) {
7615                     if( team_size <= teamsize_cutoff ) {
7616                         if ( atomic_available ) {
7617                             retval = atomic_reduce_block;
7618                         }
7619                     } else {
7620                         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7621                     }
7622                 } else if ( atomic_available ) {
7623                     retval = atomic_reduce_block;
7624                 }
7625             #else
7626                 #error "Unknown or unsupported OS"
7627             #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7628 
7629         #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7630 
7631             #if KMP_OS_LINUX || KMP_OS_WINDOWS
7632 
7633                 // basic tuning
7634 
7635                 if( atomic_available ) {
7636                     if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7637                         retval = atomic_reduce_block;
7638                     }
7639                 } // otherwise: use critical section
7640 
7641             #elif KMP_OS_DARWIN
7642 
7643                 if( atomic_available && ( num_vars <= 3 ) ) {
7644                         retval = atomic_reduce_block;
7645                 } else if( tree_available ) {
7646                     if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7647                         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7648                     }
7649                 } // otherwise: use critical section
7650 
7651             #else
7652                 #error "Unknown or unsupported OS"
7653             #endif
7654 
7655         #else
7656             #error "Unknown or unsupported architecture"
7657         #endif
7658 
7659     }
7660 
7661     // KMP_FORCE_REDUCTION
7662 
7663     // If the team is serialized (team_size == 1), ignore the forced reduction
7664     // method and stay with the unsynchronized method (empty_reduce_block)
7665     if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7666 
7667         PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7668 
7669         int atomic_available, tree_available;
7670 
7671         switch( ( forced_retval = __kmp_force_reduction_method ) )
7672         {
7673             case critical_reduce_block:
7674                 KMP_ASSERT( lck );              // lck should be != 0
7675                 break;
7676 
7677             case atomic_reduce_block:
7678                 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7679                 if( ! atomic_available ) {
7680                     KMP_WARNING(RedMethodNotSupported, "atomic");
7681                     forced_retval = critical_reduce_block;
7682                 }
7683                 break;
7684 
7685             case tree_reduce_block:
7686                 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7687                 if( ! tree_available ) {
7688                     KMP_WARNING(RedMethodNotSupported, "tree");
7689                     forced_retval = critical_reduce_block;
7690                 } else {
7691                     #if KMP_FAST_REDUCTION_BARRIER
7692                     forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7693                     #endif
7694                 }
7695                 break;
7696 
7697             default:
7698                 KMP_ASSERT( 0 ); // "unsupported method specified"
7699         }
7700 
7701         retval = forced_retval;
7702     }
7703 
7704     KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7705 
7706     #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7707     #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7708 
7709     return ( retval );
7710 }
7711 
7712 // this function is for testing set/get/determine reduce method
7713 kmp_int32
7714 __kmp_get_reduce_method( void ) {
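    // packed_reduction_method presumably stores the chosen method in the upper bits and the
    // associated barrier type in the low byte, so shifting right by 8 returns just the method.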
7715     return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7716 }
7717 
7718 /* ------------------------------------------------------------------------ */
7719