1 /*
2  * kmp_runtime.cpp -- KPTS runtime support library
3  */
4 
5 
6 //===----------------------------------------------------------------------===//
7 //
8 //                     The LLVM Compiler Infrastructure
9 //
10 // This file is dual licensed under the MIT and the University of Illinois Open
11 // Source Licenses. See LICENSE.txt for details.
12 //
13 //===----------------------------------------------------------------------===//
14 
15 
16 #include "kmp.h"
17 #include "kmp_atomic.h"
18 #include "kmp_wrapper_getpid.h"
19 #include "kmp_environment.h"
20 #include "kmp_itt.h"
21 #include "kmp_str.h"
22 #include "kmp_settings.h"
23 #include "kmp_i18n.h"
24 #include "kmp_io.h"
25 #include "kmp_error.h"
26 #include "kmp_stats.h"
27 #include "kmp_wait_release.h"
28 #include "kmp_affinity.h"
29 
30 #if OMPT_SUPPORT
31 #include "ompt-specific.h"
32 #endif
33 
34 /* these are temporary issues to be dealt with */
35 #define KMP_USE_PRCTL 0
36 
37 #if KMP_OS_WINDOWS
38 #include <process.h>
39 #endif
40 
41 #include "tsan_annotations.h"
42 
43 #if defined(KMP_GOMP_COMPAT)
44 char const __kmp_version_alt_comp[] = KMP_VERSION_PREFIX "alternative compiler support: yes";
45 #endif /* defined(KMP_GOMP_COMPAT) */
46 
47 char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
48 #if OMP_50_ENABLED
49     "5.0 (201611)";
50 #elif OMP_45_ENABLED
51     "4.5 (201511)";
52 #elif OMP_40_ENABLED
53     "4.0 (201307)";
54 #else
55     "3.1 (201107)";
56 #endif
57 
58 #ifdef KMP_DEBUG
59 char const __kmp_version_lock[] = KMP_VERSION_PREFIX "lock type: run time selectable";
60 #endif /* KMP_DEBUG */
61 
62 #define KMP_MIN( x, y ) ( (x) < (y) ? (x) : (y) )
63 
64 /* ------------------------------------------------------------------------ */
65 /* ------------------------------------------------------------------------ */
66 
67 kmp_info_t __kmp_monitor;
68 
69 /* ------------------------------------------------------------------------ */
70 /* ------------------------------------------------------------------------ */
71 
72 /* Forward declarations */
73 
74 void __kmp_cleanup( void );
75 
76 static void __kmp_initialize_info( kmp_info_t *, kmp_team_t *, int tid, int gtid );
77 static void __kmp_initialize_team( kmp_team_t * team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t * loc );
78 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
79 static void __kmp_partition_places( kmp_team_t *team, int update_master_only=0 );
80 #endif
81 static void __kmp_do_serial_initialize( void );
82 void __kmp_fork_barrier( int gtid, int tid );
83 void __kmp_join_barrier( int gtid );
84 void __kmp_setup_icv_copy( kmp_team_t *team, int new_nproc, kmp_internal_control_t * new_icvs, ident_t *loc );
85 
86 #ifdef USE_LOAD_BALANCE
87 static int __kmp_load_balance_nproc( kmp_root_t * root, int set_nproc );
88 #endif
89 
90 static int __kmp_expand_threads(int nWish, int nNeed);
91 #if KMP_OS_WINDOWS
92 static int __kmp_unregister_root_other_thread( int gtid );
93 #endif
94 static void __kmp_unregister_library( void ); // called by __kmp_internal_end()
95 static void __kmp_reap_thread( kmp_info_t * thread, int is_root );
96 static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
97 
98 /* ------------------------------------------------------------------------ */
99 /* ------------------------------------------------------------------------ */
100 
/* Calculate the identifier of the current thread: a fast (and somewhat
   portable) way to get a unique identifier for the executing thread.
   Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
105 
106 int
107 __kmp_get_global_thread_id( )
108 {
109     int i;
110     kmp_info_t   **other_threads;
111     size_t         stack_data;
112     char          *stack_addr;
113     size_t         stack_size;
114     char          *stack_base;
115 
116     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: entering, nproc=%d  all_nproc=%d\n",
117                       __kmp_nth, __kmp_all_nth ));
118 
    /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to a
             parallel region, this returns KMP_GTID_DNE to force serial_initialize by the
             caller.  We had to handle KMP_GTID_DNE at all call sites, or else guarantee
             __kmp_init_gtid for this to work.  */
123 
124     if ( !TCR_4(__kmp_init_gtid) ) return KMP_GTID_DNE;
125 
126 #ifdef KMP_TDATA_GTID
127     if ( TCR_4(__kmp_gtid_mode) >= 3) {
128         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using TDATA\n" ));
129         return __kmp_gtid;
130     }
131 #endif
132     if ( TCR_4(__kmp_gtid_mode) >= 2) {
133         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using keyed TLS\n" ));
134         return __kmp_gtid_get_specific();
135     }
136     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: using internal alg.\n" ));
137 
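    /* use the address of a local variable as an approximation of the current stack position */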
138     stack_addr    = (char*) & stack_data;
139     other_threads = __kmp_threads;
140 
141     /*
142         ATT: The code below is a source of potential bugs due to unsynchronized access to
143         __kmp_threads array. For example:
144             1. Current thread loads other_threads[i] to thr and checks it, it is non-NULL.
145             2. Current thread is suspended by OS.
146             3. Another thread unregisters and finishes (debug versions of free() may fill memory
147                with something like 0xEF).
148             4. Current thread is resumed.
149             5. Current thread reads junk from *thr.
150         TODO: Fix it.
151         --ln
152     */
153 
154     for( i = 0 ; i < __kmp_threads_capacity ; i++ ) {
155 
156         kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
157         if( !thr ) continue;
158 
159         stack_size =  (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
160         stack_base =  (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);
161 
162         /* stack grows down -- search through all of the active threads */
163 
164         if( stack_addr <= stack_base ) {
165             size_t stack_diff = stack_base - stack_addr;
166 
167             if( stack_diff <= stack_size ) {
168                 /* The only way we can be closer than the allocated */
169                 /* stack size is if we are running on this thread. */
170                 KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == i );
171                 return i;
172             }
173         }
174     }
175 
    /* fall back to thread-specific storage to try to determine our gtid */
177     KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id: internal alg. failed to find "
178                       "thread, using TLS\n" ));
179     i = __kmp_gtid_get_specific();
180 
181     /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */
182 
    /* if we haven't been assigned a gtid, return the error code */
184     if( i<0 ) return i;
185 
186     /* dynamically updated stack window for uber threads to avoid get_specific call */
187     if( ! TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow) ) {
188         KMP_FATAL( StackOverflow, i );
189     }
190 
191     stack_base = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
192     if( stack_addr > stack_base ) {
193         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
194         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
195           other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - stack_base);
196     } else {
197         TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, stack_base - stack_addr);
198     }
199 
200     /* Reprint stack bounds for ubermaster since they have been refined */
201     if ( __kmp_storage_map ) {
202         char *stack_end = (char *) other_threads[i]->th.th_info.ds.ds_stackbase;
203         char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
204         __kmp_print_storage_map_gtid( i, stack_beg, stack_end,
205                                       other_threads[i]->th.th_info.ds.ds_stacksize,
206                                       "th_%d stack (refinement)", i );
207     }
208     return i;
209 }
210 
211 int
212 __kmp_get_global_thread_id_reg( )
213 {
214     int gtid;
215 
216     if ( !__kmp_init_serial ) {
217         gtid = KMP_GTID_DNE;
218     } else
219 #ifdef KMP_TDATA_GTID
220     if ( TCR_4(__kmp_gtid_mode) >= 3 ) {
221         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using TDATA\n" ));
222         gtid = __kmp_gtid;
223     } else
224 #endif
225     if ( TCR_4(__kmp_gtid_mode) >= 2 ) {
226         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using keyed TLS\n" ));
227         gtid = __kmp_gtid_get_specific();
228     } else {
229         KA_TRACE( 1000, ( "*** __kmp_get_global_thread_id_reg: using internal alg.\n" ));
230         gtid = __kmp_get_global_thread_id();
231     }
232 
233     /* we must be a new uber master sibling thread */
234     if( gtid == KMP_GTID_DNE ) {
235         KA_TRACE( 10, ( "__kmp_get_global_thread_id_reg: Encountered new root thread. "
236                         "Registering a new gtid.\n" ));
237         __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
238         if( !__kmp_init_serial ) {
239             __kmp_do_serial_initialize();
240             gtid = __kmp_gtid_get_specific();
241         } else {
242             gtid = __kmp_register_root(FALSE);
243         }
244         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
245         /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
246     }
247 
248     KMP_DEBUG_ASSERT( gtid >=0 );
249 
250     return gtid;
251 }
252 
253 /* caller must hold forkjoin_lock */
254 void
255 __kmp_check_stack_overlap( kmp_info_t *th )
256 {
257     int f;
258     char *stack_beg = NULL;
259     char *stack_end = NULL;
260     int gtid;
261 
262     KA_TRACE(10,("__kmp_check_stack_overlap: called\n"));
263     if ( __kmp_storage_map ) {
264         stack_end = (char *) th->th.th_info.ds.ds_stackbase;
265         stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
266 
267         gtid = __kmp_gtid_from_thread( th );
268 
269         if (gtid == KMP_GTID_MONITOR) {
270             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
271                                      "th_%s stack (%s)", "mon",
272                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
273         } else {
274             __kmp_print_storage_map_gtid( gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
275                                      "th_%d stack (%s)", gtid,
276                                      ( th->th.th_info.ds.ds_stackgrow ) ? "initial" : "actual" );
277         }
278     }
279 
280     /* No point in checking ubermaster threads since they use refinement and cannot overlap */
281     gtid = __kmp_gtid_from_thread( th );
282     if ( __kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid))
283     {
284         KA_TRACE(10,("__kmp_check_stack_overlap: performing extensive checking\n"));
285         if ( stack_beg == NULL ) {
286             stack_end = (char *) th->th.th_info.ds.ds_stackbase;
287             stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;
288         }
289 
290         for( f=0 ; f < __kmp_threads_capacity ; f++ ) {
291             kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]);
292 
293             if( f_th && f_th != th ) {
294                 char *other_stack_end = (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase);
295                 char *other_stack_beg = other_stack_end -
296                                         (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize);
297                 if((stack_beg > other_stack_beg && stack_beg < other_stack_end) ||
298                    (stack_end > other_stack_beg && stack_end < other_stack_end)) {
299 
300                     /* Print the other stack values before the abort */
301                     if ( __kmp_storage_map )
302                         __kmp_print_storage_map_gtid( -1, other_stack_beg, other_stack_end,
303                             (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize),
304                             "th_%d stack (overlapped)",
305                                                  __kmp_gtid_from_thread( f_th ) );
306 
307                     __kmp_msg( kmp_ms_fatal, KMP_MSG( StackOverlap ), KMP_HNT( ChangeStackLimit ), __kmp_msg_null );
308                 }
309             }
310         }
311     }
312     KA_TRACE(10,("__kmp_check_stack_overlap: returning\n"));
313 }
314 
315 
316 /* ------------------------------------------------------------------------ */
317 
318 /* ------------------------------------------------------------------------ */
319 
320 void
321 __kmp_infinite_loop( void )
322 {
323     static int done = FALSE;
324 
325     while (! done) {
326         KMP_YIELD( 1 );
327     }
328 }
329 
330 #define MAX_MESSAGE     512
331 
332 void
333 __kmp_print_storage_map_gtid( int gtid, void *p1, void *p2, size_t size, char const *format, ...) {
334     char buffer[MAX_MESSAGE];
335     va_list ap;
336 
337     va_start( ap, format);
338     KMP_SNPRINTF( buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, p2, (unsigned long) size, format );
339     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
340     __kmp_vprintf( kmp_err, buffer, ap );
341 #if KMP_PRINT_DATA_PLACEMENT
342     int node;
343     if(gtid >= 0) {
344         if(p1 <= p2 && (char*)p2 - (char*)p1 == size) {
345             if( __kmp_storage_map_verbose ) {
346                 node = __kmp_get_host_node(p1);
347                 if(node < 0)  /* doesn't work, so don't try this next time */
348                     __kmp_storage_map_verbose = FALSE;
349                 else {
350                     char *last;
351                     int lastNode;
352                     int localProc = __kmp_get_cpu_from_gtid(gtid);
353 
354                     const int page_size = KMP_GET_PAGE_SIZE();
355 
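                    /* round p1 down to the start of its page, and p2 down to the start
                       of the page holding its last byte */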
356                     p1 = (void *)( (size_t)p1 & ~((size_t)page_size - 1) );
357                     p2 = (void *)( ((size_t) p2 - 1) & ~((size_t)page_size - 1) );
358                     if(localProc >= 0)
359                         __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid, localProc>>1);
360                     else
361                         __kmp_printf_no_lock("  GTID %d\n", gtid);
362 # if KMP_USE_PRCTL
363 /* The more elaborate format is disabled for now because of the prctl hanging bug. */
                    do {
                        last = (char *)p1;
                        lastNode = node;
                        /* This loop collates adjacent pages with the same host node. */
                        do {
                            p1 = (char *)p1 + page_size;
                        } while(p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
371                         __kmp_printf_no_lock("    %p-%p memNode %d\n", last,
372                                              (char*)p1 - 1, lastNode);
373                     } while(p1 <= p2);
374 # else
375                     __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
376                                          (char*)p1 + (page_size - 1), __kmp_get_host_node(p1));
377                     if(p1 < p2)  {
378                         __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
379                                              (char*)p2 + (page_size - 1), __kmp_get_host_node(p2));
380                     }
381 # endif
382                 }
383             }
384         } else
385             __kmp_printf_no_lock("  %s\n", KMP_I18N_STR( StorageMapWarning ) );
386     }
387 #endif /* KMP_PRINT_DATA_PLACEMENT */
388     __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
389 }
390 
391 void
392 __kmp_warn( char const * format, ... )
393 {
394     char buffer[MAX_MESSAGE];
395     va_list ap;
396 
397     if ( __kmp_generate_warnings == kmp_warnings_off ) {
398         return;
399     }
400 
401     va_start( ap, format );
402 
403     KMP_SNPRINTF( buffer, sizeof(buffer) , "OMP warning: %s\n", format );
404     __kmp_acquire_bootstrap_lock( & __kmp_stdio_lock );
405     __kmp_vprintf( kmp_err, buffer, ap );
406     __kmp_release_bootstrap_lock( & __kmp_stdio_lock );
407 
408     va_end( ap );
409 }
410 
411 void
412 __kmp_abort_process()
413 {
414 
415     // Later threads may stall here, but that's ok because abort() will kill them.
416     __kmp_acquire_bootstrap_lock( & __kmp_exit_lock );
417 
418     if ( __kmp_debug_buf ) {
419         __kmp_dump_debug_buffer();
420     }; // if
421 
422     if ( KMP_OS_WINDOWS ) {
423         // Let other threads know of abnormal termination and prevent deadlock
424         // if abort happened during library initialization or shutdown
425         __kmp_global.g.g_abort = SIGABRT;
426 
427         /*
            On Windows* OS, abort() by default causes a pop-up error box, which stalls nightly
            testing. Unfortunately, we cannot reliably suppress pop-up error boxes.
            _set_abort_behavior() works well, but this function is not available in VS7 (not a
            problem for the DLL, but it is a problem for the static OpenMP RTL). SetErrorMode
            (and thus the timelimit utility) does not help, at least in some versions of the
            MS C RTL.

            It seems the following sequence is the only way to simulate abort() and avoid the
            pop-up error box.
436         */
437         raise( SIGABRT );
438         _exit( 3 );    // Just in case, if signal ignored, exit anyway.
439     } else {
440         abort();
441     }; // if
442 
443     __kmp_infinite_loop();
444     __kmp_release_bootstrap_lock( & __kmp_exit_lock );
445 
446 } // __kmp_abort_process
447 
448 void
449 __kmp_abort_thread( void )
450 {
451     // TODO: Eliminate g_abort global variable and this function.
452     // In case of abort just call abort(), it will kill all the threads.
453     __kmp_infinite_loop();
454 } // __kmp_abort_thread
455 
456 /* ------------------------------------------------------------------------ */
457 
458 /*
459  * Print out the storage map for the major kmp_info_t thread data structures
460  * that are allocated together.
461  */
462 
463 static void
464 __kmp_print_thread_storage_map( kmp_info_t *thr, int gtid )
465 {
466     __kmp_print_storage_map_gtid( gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", gtid );
467 
468     __kmp_print_storage_map_gtid( gtid, &thr->th.th_info, &thr->th.th_team, sizeof(kmp_desc_t),
469                              "th_%d.th_info", gtid );
470 
471     __kmp_print_storage_map_gtid( gtid, &thr->th.th_local, &thr->th.th_pri_head, sizeof(kmp_local_t),
472                              "th_%d.th_local", gtid );
473 
474     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
475                              sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid );
476 
477     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_plain_barrier],
478                              &thr->th.th_bar[bs_plain_barrier+1],
479                              sizeof(kmp_balign_t), "th_%d.th_bar[plain]", gtid);
480 
481     __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_forkjoin_barrier],
482                              &thr->th.th_bar[bs_forkjoin_barrier+1],
483                              sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", gtid);
484 
485     #if KMP_FAST_REDUCTION_BARRIER
486         __kmp_print_storage_map_gtid( gtid, &thr->th.th_bar[bs_reduction_barrier],
487                              &thr->th.th_bar[bs_reduction_barrier+1],
488                              sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", gtid);
489     #endif // KMP_FAST_REDUCTION_BARRIER
490 }
491 
492 /*
493  * Print out the storage map for the major kmp_team_t team data structures
494  * that are allocated together.
495  */
496 
497 static void
498 __kmp_print_team_storage_map( const char *header, kmp_team_t *team, int team_id, int num_thr )
499 {
500     int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
501     __kmp_print_storage_map_gtid( -1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
502                              header, team_id );
503 
504     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[0], &team->t.t_bar[bs_last_barrier],
505                              sizeof(kmp_balign_team_t) * bs_last_barrier, "%s_%d.t_bar", header, team_id );
506 
507 
508     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_plain_barrier], &team->t.t_bar[bs_plain_barrier+1],
509                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", header, team_id );
510 
511     __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_forkjoin_barrier], &team->t.t_bar[bs_forkjoin_barrier+1],
512                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[forkjoin]", header, team_id );
513 
514     #if KMP_FAST_REDUCTION_BARRIER
515         __kmp_print_storage_map_gtid( -1, &team->t.t_bar[bs_reduction_barrier], &team->t.t_bar[bs_reduction_barrier+1],
516                              sizeof(kmp_balign_team_t), "%s_%d.t_bar[reduction]", header, team_id );
517     #endif // KMP_FAST_REDUCTION_BARRIER
518 
519     __kmp_print_storage_map_gtid( -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
520                              sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id );
521 
522     __kmp_print_storage_map_gtid( -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
523                              sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id );
524 
525     __kmp_print_storage_map_gtid( -1, &team->t.t_disp_buffer[0], &team->t.t_disp_buffer[num_disp_buff],
526                              sizeof(dispatch_shared_info_t) * num_disp_buff, "%s_%d.t_disp_buffer",
527                              header, team_id );
528 
529 
530     __kmp_print_storage_map_gtid( -1, &team->t.t_taskq, &team->t.t_copypriv_data,
531                              sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, team_id );
532 }
533 
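// Allocator initialization/teardown hooks; they are currently no-ops.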
534 static void __kmp_init_allocator() {}
535 static void __kmp_fini_allocator() {}
536 
537 /* ------------------------------------------------------------------------ */
538 
539 #ifdef KMP_DYNAMIC_LIB
540 # if KMP_OS_WINDOWS
541 
542 static void
543 __kmp_reset_lock( kmp_bootstrap_lock_t* lck ) {
544     // TODO: Change to __kmp_break_bootstrap_lock().
    __kmp_init_bootstrap_lock( lck ); // reinitialize, leaving the lock released
546 }
547 
548 static void
549 __kmp_reset_locks_on_process_detach( int gtid_req ) {
550     int i;
551     int thread_count;
552 
553     // PROCESS_DETACH is expected to be called by a thread
554     // that executes ProcessExit() or FreeLibrary().
    // The OS terminates the other threads (except the one calling ProcessExit or FreeLibrary),
    // so it might seem safe to access __kmp_threads[] without taking the forkjoin_lock.
    // In practice, however, some threads can still be alive here, although they are about to be terminated.
    // The entries in the array with ds_thread==0 are the most suspect.
    // So accessing __kmp_threads[] may actually be unsafe.
560 
561     // TODO: does it make sense to check __kmp_roots[] ?
562 
    // Check that no other live threads are registered with the OpenMP library.
564     while( 1 ) {
565         thread_count = 0;
566         for( i = 0; i < __kmp_threads_capacity; ++i ) {
567             if( !__kmp_threads ) continue;
568             kmp_info_t* th = __kmp_threads[ i ];
569             if( th == NULL ) continue;
570             int gtid = th->th.th_info.ds.ds_gtid;
571             if( gtid == gtid_req ) continue;
572             if( gtid < 0 ) continue;
573             DWORD exit_val;
574             int alive = __kmp_is_thread_alive( th, &exit_val );
            if( alive ) {
                ++thread_count;
            }
578         }
579         if( thread_count == 0 ) break; // success
580     }
581 
582     // Assume that I'm alone.
583 
    // Now it is probably safe to check and reset the locks.
585     // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
586     __kmp_reset_lock( &__kmp_forkjoin_lock );
587     #ifdef KMP_DEBUG
588     __kmp_reset_lock( &__kmp_stdio_lock );
589     #endif // KMP_DEBUG
590 }
591 
592 BOOL WINAPI
593 DllMain( HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved ) {
594     //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
595 
596     switch( fdwReason ) {
597 
598         case DLL_PROCESS_ATTACH:
599             KA_TRACE( 10, ("DllMain: PROCESS_ATTACH\n" ));
600 
601             return TRUE;
602 
603         case DLL_PROCESS_DETACH:
604             KA_TRACE( 10, ("DllMain: PROCESS_DETACH T#%d\n",
605                         __kmp_gtid_get_specific() ));
606 
607             if( lpReserved != NULL )
608             {
609                 // lpReserved is used for telling the difference:
610                 //  lpReserved == NULL when FreeLibrary() was called,
611                 //  lpReserved != NULL when the process terminates.
612                 // When FreeLibrary() is called, worker threads remain alive.
613                 // So they will release the forkjoin lock by themselves.
                // When the process terminates, worker threads disappear, triggering
                // the problem of an unreleased forkjoin lock as described below.
616 
617                 // A worker thread can take the forkjoin lock.
618                 // The problem comes up if that worker thread becomes dead
619                 // before it releases the forkjoin lock.
620                 // The forkjoin lock remains taken, while the thread
621                 // executing DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below
622                 // will try to take the forkjoin lock and will always fail,
623                 // so that the application will never finish [normally].
624                 // This scenario is possible if __kmpc_end() has not been executed.
                // These are not corner cases but rather common situations:
626                 // - the main function was compiled by an alternative compiler;
627                 // - the main function was compiled by icl but without /Qopenmp (application with plugins);
628                 // - application terminates by calling C exit(), Fortran CALL EXIT() or Fortran STOP.
                // - a live foreign thread prevented __kmpc_end() from doing cleanup.
630 
631                 // This is a hack to work around the problem.
632                 // TODO: !!! to figure out something better.
633                 __kmp_reset_locks_on_process_detach( __kmp_gtid_get_specific() );
634             }
635 
636             __kmp_internal_end_library( __kmp_gtid_get_specific() );
637 
638             return TRUE;
639 
640         case DLL_THREAD_ATTACH:
641             KA_TRACE( 10, ("DllMain: THREAD_ATTACH\n" ));
642 
643             /* if we wanted to register new siblings all the time here call
644              * __kmp_get_gtid(); */
645             return TRUE;
646 
647         case DLL_THREAD_DETACH:
648             KA_TRACE( 10, ("DllMain: THREAD_DETACH T#%d\n",
649                         __kmp_gtid_get_specific() ));
650 
651             __kmp_internal_end_thread( __kmp_gtid_get_specific() );
652             return TRUE;
653     }
654 
655     return TRUE;
656 }
657 
658 # endif /* KMP_OS_WINDOWS */
659 #endif /* KMP_DYNAMIC_LIB */
660 
661 
662 /* ------------------------------------------------------------------------ */
663 
664 /* Change the library type to "status" and return the old type */
665 /* called from within initialization routines where __kmp_initz_lock is held */
666 int
667 __kmp_change_library( int status )
668 {
669     int old_status;
670 
671     old_status = __kmp_yield_init & 1;  // check whether KMP_LIBRARY=throughput (even init count)
672 
673     if (status) {
674         __kmp_yield_init |= 1;  // throughput => turnaround (odd init count)
675     }
676     else {
677         __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
678     }
679 
680     return old_status;  // return previous setting of whether KMP_LIBRARY=throughput
681 }
682 
683 /* ------------------------------------------------------------------------ */
684 /* ------------------------------------------------------------------------ */
685 
686 /* __kmp_parallel_deo --
687  * Wait until it's our turn.
688  */
689 void
690 __kmp_parallel_deo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
691 {
692     int gtid = *gtid_ref;
693 #ifdef BUILD_PARALLEL_ORDERED
694     kmp_team_t *team = __kmp_team_from_gtid( gtid );
695 #endif /* BUILD_PARALLEL_ORDERED */
696 
697     if( __kmp_env_consistency_check ) {
698         if( __kmp_threads[gtid]->th.th_root->r.r_active )
699 #if KMP_USE_DYNAMIC_LOCK
700             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL, 0 );
701 #else
702             __kmp_push_sync( gtid, ct_ordered_in_parallel, loc_ref, NULL );
703 #endif
704     }
705 #ifdef BUILD_PARALLEL_ORDERED
706     if( !team->t.t_serialized ) {
707         KMP_MB();
708         KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid( gtid ), KMP_EQ, NULL);
709         KMP_MB();
710     }
711 #endif /* BUILD_PARALLEL_ORDERED */
712 }
713 
714 /* __kmp_parallel_dxo --
715  * Signal the next task.
716  */
717 
718 void
719 __kmp_parallel_dxo( int *gtid_ref, int *cid_ref, ident_t *loc_ref )
720 {
721     int gtid = *gtid_ref;
722 #ifdef BUILD_PARALLEL_ORDERED
723     int tid =  __kmp_tid_from_gtid( gtid );
724     kmp_team_t *team = __kmp_team_from_gtid( gtid );
725 #endif /* BUILD_PARALLEL_ORDERED */
726 
727     if( __kmp_env_consistency_check ) {
728         if( __kmp_threads[gtid]->th.th_root->r.r_active )
729             __kmp_pop_sync( gtid, ct_ordered_in_parallel, loc_ref );
730     }
731 #ifdef BUILD_PARALLEL_ORDERED
732     if ( ! team->t.t_serialized ) {
733         KMP_MB();       /* Flush all pending memory write invalidates.  */
734 
735         /* use the tid of the next thread in this team */
        /* TODO: replace with a general release procedure */
737         team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc );
738 
739 #if OMPT_SUPPORT && OMPT_BLAME
740         if (ompt_enabled &&
741             ompt_callbacks.ompt_callback(ompt_event_release_ordered)) {
742             /* accept blame for "ordered" waiting */
743             kmp_info_t *this_thread = __kmp_threads[gtid];
744             ompt_callbacks.ompt_callback(ompt_event_release_ordered)(
745                 this_thread->th.ompt_thread_info.wait_id);
746         }
747 #endif
748 
749         KMP_MB();       /* Flush all pending memory write invalidates.  */
750     }
751 #endif /* BUILD_PARALLEL_ORDERED */
752 }
753 
754 /* ------------------------------------------------------------------------ */
755 /* ------------------------------------------------------------------------ */
756 
757 /* ------------------------------------------------------------------------ */
758 /* ------------------------------------------------------------------------ */
759 
760 /* The BARRIER for a SINGLE process section is always explicit   */
761 
762 int
763 __kmp_enter_single( int gtid, ident_t *id_ref, int push_ws )
764 {
765     int status;
766     kmp_info_t *th;
767     kmp_team_t *team;
768 
769     if( ! TCR_4(__kmp_init_parallel) )
770         __kmp_parallel_initialize();
771 
772     th   = __kmp_threads[ gtid ];
773     team = th->th.th_team;
774     status = 0;
775 
776     th->th.th_ident = id_ref;
777 
778     if ( team->t.t_serialized ) {
779         status = 1;
780     } else {
781         kmp_int32 old_this = th->th.th_local.this_construct;
782 
783         ++th->th.th_local.this_construct;
784         /* try to set team count to thread count--success means thread got the
785            single block
786         */
787         /* TODO: Should this be acquire or release? */
788         if (team->t.t_construct == old_this) {
789             status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
790                                                  th->th.th_local.this_construct);
791         }
792 #if USE_ITT_BUILD
793         if ( __itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && KMP_MASTER_GTID(gtid) &&
794 #if OMP_40_ENABLED
795             th->th.th_teams_microtask == NULL &&
796 #endif
797             team->t.t_active_level == 1 )
798         {   // Only report metadata by master of active team at level 1
799             __kmp_itt_metadata_single( id_ref );
800         }
801 #endif /* USE_ITT_BUILD */
802     }
803 
804     if( __kmp_env_consistency_check ) {
805         if (status && push_ws) {
806             __kmp_push_workshare( gtid, ct_psingle, id_ref );
807         } else {
808             __kmp_check_workshare( gtid, ct_psingle, id_ref );
809         }
810     }
811 #if USE_ITT_BUILD
812     if ( status ) {
813         __kmp_itt_single_start( gtid );
814     }
815 #endif /* USE_ITT_BUILD */
816     return status;
817 }
818 
819 void
820 __kmp_exit_single( int gtid )
821 {
822 #if USE_ITT_BUILD
823     __kmp_itt_single_end( gtid );
824 #endif /* USE_ITT_BUILD */
825     if( __kmp_env_consistency_check )
826         __kmp_pop_workshare( gtid, ct_psingle, NULL );
827 }
828 
829 
830 /*
 * Determine if we can go parallel or must use a serialized parallel region,
 * and how many threads we can use.
 * set_nthreads is the number of threads requested for the team.
 * Returns 1 if we should serialize or use only one thread,
 * otherwise the number of threads to use.
836  * The forkjoin lock is held by the caller.
837  */
838 static int
839 __kmp_reserve_threads( kmp_root_t *root, kmp_team_t *parent_team,
840    int master_tid, int set_nthreads
841 #if OMP_40_ENABLED
842   , int enter_teams
843 #endif /* OMP_40_ENABLED */
844 )
845 {
846     int capacity;
847     int new_nthreads;
848     KMP_DEBUG_ASSERT( __kmp_init_serial );
849     KMP_DEBUG_ASSERT( root && parent_team );
850 
851     //
852     // If dyn-var is set, dynamically adjust the number of desired threads,
853     // according to the method specified by dynamic_mode.
854     //
855     new_nthreads = set_nthreads;
856     if ( ! get__dynamic_2( parent_team, master_tid ) ) {
857         ;
858     }
859 #ifdef USE_LOAD_BALANCE
860     else if ( __kmp_global.g.g_dynamic_mode == dynamic_load_balance ) {
861         new_nthreads = __kmp_load_balance_nproc( root, set_nthreads );
862         if ( new_nthreads == 1 ) {
863             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to 1 thread\n",
864               master_tid ));
865             return 1;
866         }
867         if ( new_nthreads < set_nthreads ) {
868             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d load balance reduced reservation to %d threads\n",
869               master_tid, new_nthreads ));
870         }
871     }
872 #endif /* USE_LOAD_BALANCE */
873     else if ( __kmp_global.g.g_dynamic_mode == dynamic_thread_limit ) {
874         new_nthreads = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
875           : root->r.r_hot_team->t.t_nproc);
876         if ( new_nthreads <= 1 ) {
877             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to 1 thread\n",
878               master_tid ));
879             return 1;
880         }
881         if ( new_nthreads < set_nthreads ) {
882             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d thread limit reduced reservation to %d threads\n",
883               master_tid, new_nthreads ));
884         }
885         else {
886             new_nthreads = set_nthreads;
887         }
888     }
889     else if ( __kmp_global.g.g_dynamic_mode == dynamic_random ) {
890         if ( set_nthreads > 2 ) {
891             new_nthreads = __kmp_get_random( parent_team->t.t_threads[master_tid] );
892             new_nthreads = ( new_nthreads % set_nthreads ) + 1;
893             if ( new_nthreads == 1 ) {
894                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to 1 thread\n",
895                   master_tid ));
896                 return 1;
897             }
898             if ( new_nthreads < set_nthreads ) {
899                 KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d dynamic random reduced reservation to %d threads\n",
900                   master_tid, new_nthreads ));
901             }
902         }
903     }
904     else {
905         KMP_ASSERT( 0 );
906     }
907 
908     //
909     // Respect KMP_ALL_THREADS, KMP_MAX_THREADS, OMP_THREAD_LIMIT.
910     //
911     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
912       root->r.r_hot_team->t.t_nproc ) > __kmp_max_nth ) {
913         int tl_nthreads = __kmp_max_nth - __kmp_nth + ( root->r.r_active ? 1 :
914           root->r.r_hot_team->t.t_nproc );
915         if ( tl_nthreads <= 0 ) {
916             tl_nthreads = 1;
917         }
918 
919         //
920         // If dyn-var is false, emit a 1-time warning.
921         //
922         if ( ! get__dynamic_2( parent_team, master_tid )
923           && ( ! __kmp_reserve_warn ) ) {
924             __kmp_reserve_warn = 1;
925             __kmp_msg(
926                 kmp_ms_warning,
927                 KMP_MSG( CantFormThrTeam, set_nthreads, tl_nthreads ),
928                 KMP_HNT( Unset_ALL_THREADS ),
929                 __kmp_msg_null
930             );
931         }
932         if ( tl_nthreads == 1 ) {
933             KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to 1 thread\n",
934               master_tid ));
935             return 1;
936         }
937         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d KMP_ALL_THREADS reduced reservation to %d threads\n",
938           master_tid, tl_nthreads ));
939         new_nthreads = tl_nthreads;
940     }
941 
942     //
943     // Check if the threads array is large enough, or needs expanding.
944     //
945     // See comment in __kmp_register_root() about the adjustment if
946     // __kmp_threads[0] == NULL.
947     //
948     capacity = __kmp_threads_capacity;
949     if ( TCR_PTR(__kmp_threads[0]) == NULL ) {
950         --capacity;
951     }
952     if ( __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
953       root->r.r_hot_team->t.t_nproc ) > capacity ) {
954         //
955         // Expand the threads array.
956         //
957         int slotsRequired = __kmp_nth + new_nthreads - ( root->r.r_active ? 1 :
958           root->r.r_hot_team->t.t_nproc ) - capacity;
959         int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
960         if ( slotsAdded < slotsRequired ) {
961             //
962             // The threads array was not expanded enough.
963             //
964             new_nthreads -= ( slotsRequired - slotsAdded );
965             KMP_ASSERT( new_nthreads >= 1 );
966 
967             //
968             // If dyn-var is false, emit a 1-time warning.
969             //
970             if ( ! get__dynamic_2( parent_team, master_tid )
971               && ( ! __kmp_reserve_warn ) ) {
972                 __kmp_reserve_warn = 1;
973                 if ( __kmp_tp_cached ) {
974                     __kmp_msg(
975                         kmp_ms_warning,
976                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
977                         KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
978                         KMP_HNT( PossibleSystemLimitOnThreads ),
979                         __kmp_msg_null
980                     );
981                 }
982                 else {
983                     __kmp_msg(
984                         kmp_ms_warning,
985                         KMP_MSG( CantFormThrTeam, set_nthreads, new_nthreads ),
986                         KMP_HNT( SystemLimitOnThreads ),
987                         __kmp_msg_null
988                     );
989                 }
990             }
991         }
992     }
993 
994     if ( new_nthreads == 1 ) {
995         KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d serializing team after reclaiming dead roots and rechecking; requested %d threads\n",
996                         __kmp_get_gtid(), set_nthreads ) );
997         return 1;
998     }
999 
1000     KC_TRACE( 10, ( "__kmp_reserve_threads: T#%d allocating %d threads; requested %d threads\n",
1001                     __kmp_get_gtid(), new_nthreads, set_nthreads ));
1002     return new_nthreads;
1003 }
1004 
1005 /* ------------------------------------------------------------------------ */
1006 /* ------------------------------------------------------------------------ */
1007 
1008 /* allocate threads from the thread pool and assign them to the new team */
1009 /* we are assured that there are enough threads available, because we
 * checked that earlier while holding the forkjoin lock */
1011 
1012 static void
1013 __kmp_fork_team_threads( kmp_root_t *root, kmp_team_t *team,
1014                          kmp_info_t *master_th, int master_gtid )
1015 {
1016     int         i;
1017     int use_hot_team;
1018 
1019     KA_TRACE( 10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc ) );
1020     KMP_DEBUG_ASSERT( master_gtid == __kmp_get_gtid() );
1021     KMP_MB();
1022 
1023     /* first, let's setup the master thread */
1024     master_th->th.th_info.ds.ds_tid  = 0;
1025     master_th->th.th_team            = team;
1026     master_th->th.th_team_nproc      = team->t.t_nproc;
1027     master_th->th.th_team_master     = master_th;
1028     master_th->th.th_team_serialized = FALSE;
1029     master_th->th.th_dispatch        = & team->t.t_dispatch[ 0 ];
1030 
1031     /* make sure we are not the optimized hot team */
1032 #if KMP_NESTED_HOT_TEAMS
1033     use_hot_team = 0;
1034     kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
1035     if( hot_teams ) {  // hot teams array is not allocated if KMP_HOT_TEAMS_MAX_LEVEL=0
1036         int level = team->t.t_active_level - 1;    // index in array of hot teams
1037         if( master_th->th.th_teams_microtask ) {    // are we inside the teams?
1038             if( master_th->th.th_teams_size.nteams > 1 ) {
1039                 ++level; // level was not increased in teams construct for team_of_masters
1040             }
1041             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
1042                 master_th->th.th_teams_level == team->t.t_level ) {
1043                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
1044             }            // team->t.t_level will be increased inside parallel
1045         }
1046         if( level < __kmp_hot_teams_max_level ) {
1047             if( hot_teams[level].hot_team ) {
1048                 // hot team has already been allocated for given level
1049                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
1050                 use_hot_team = 1; // the team is ready to use
1051             } else {
1052                 use_hot_team = 0; // AC: threads are not allocated yet
1053                 hot_teams[level].hot_team = team; // remember new hot team
1054                 hot_teams[level].hot_team_nth = team->t.t_nproc;
1055             }
1056         } else {
1057             use_hot_team = 0;
1058         }
1059     }
1060 #else
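    // Without nested hot teams, only the root's single hot team can be reused.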
1061     use_hot_team = team == root->r.r_hot_team;
1062 #endif
1063     if ( !use_hot_team ) {
1064 
1065         /* install the master thread */
1066         team->t.t_threads[ 0 ]    = master_th;
1067         __kmp_initialize_info( master_th, team, 0, master_gtid );
1068 
1069         /* now, install the worker threads */
1070         for ( i=1 ;  i < team->t.t_nproc ; i++ ) {
1071 
1072             /* fork or reallocate a new thread and install it in team */
1073             kmp_info_t *thr = __kmp_allocate_thread( root, team, i );
1074             team->t.t_threads[ i ] = thr;
1075             KMP_DEBUG_ASSERT( thr );
1076             KMP_DEBUG_ASSERT( thr->th.th_team == team );
1077             /* align team and thread arrived states */
1078             KA_TRACE( 20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived T#%d(%d:%d) join =%llu, plain=%llu\n",
1079                             __kmp_gtid_from_tid( 0, team ), team->t.t_id, 0,
1080                             __kmp_gtid_from_tid( i, team ), team->t.t_id, i,
1081                             team->t.t_bar[ bs_forkjoin_barrier ].b_arrived,
1082                             team->t.t_bar[ bs_plain_barrier ].b_arrived ) );
1083 #if OMP_40_ENABLED
1084             thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
1085             thr->th.th_teams_level     = master_th->th.th_teams_level;
1086             thr->th.th_teams_size      = master_th->th.th_teams_size;
1087 #endif
1088             { // Initialize threads' barrier data.
1089                 int b;
1090                 kmp_balign_t * balign = team->t.t_threads[ i ]->th.th_bar;
1091                 for ( b = 0; b < bs_last_barrier; ++ b ) {
1092                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
1093                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
1094 #if USE_DEBUGGER
1095                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
1096 #endif
1097                 }; // for b
1098             }
1099         }
1100 
1101 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
1102         __kmp_partition_places( team );
1103 #endif
1104 
1105     }
1106 
1107     KMP_MB();
1108 }
1109 
1110 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
1111 //
// Propagate any changes to the floating point control registers out to the team.
1113 // We try to avoid unnecessary writes to the relevant cache line in the team structure,
1114 // so we don't make changes unless they are needed.
1115 //
1116 inline static void
1117 propagateFPControl(kmp_team_t * team)
1118 {
1119     if ( __kmp_inherit_fp_control ) {
1120         kmp_int16 x87_fpu_control_word;
1121         kmp_uint32 mxcsr;
1122 
1123         // Get master values of FPU control flags (both X87 and vector)
1124         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1125         __kmp_store_mxcsr( &mxcsr );
1126         mxcsr &= KMP_X86_MXCSR_MASK;
1127 
1128         // There is no point looking at t_fp_control_saved here.
1129         // If it is TRUE, we still have to update the values if they are different from those we now have.
1130         // If it is FALSE we didn't save anything yet, but our objective is the same. We have to ensure
1131         // that the values in the team are the same as those we have.
1132         // So, this code achieves what we need whether or not t_fp_control_saved is true.
1133         // By checking whether the value needs updating we avoid unnecessary writes that would put the
1134         // cache-line into a written state, causing all threads in the team to have to read it again.
1135         KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
1136         KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
1137         // Although we don't use this value, other code in the runtime wants to know whether it should restore them.
1138         // So we must ensure it is correct.
1139         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
1140     }
1141     else {
1142         // Similarly here. Don't write to this cache-line in the team structure unless we have to.
1143         KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
1144     }
1145 }
1146 
1147 // Do the opposite, setting the hardware registers to the updated values from the team.
1148 inline static void
1149 updateHWFPControl(kmp_team_t * team)
1150 {
1151     if ( __kmp_inherit_fp_control && team->t.t_fp_control_saved ) {
1152         //
        // Only reset the fp control regs if they have been changed in the team,
        // i.e., during the parallel region that we are exiting.
1155         //
1156         kmp_int16 x87_fpu_control_word;
1157         kmp_uint32 mxcsr;
1158         __kmp_store_x87_fpu_control_word( &x87_fpu_control_word );
1159         __kmp_store_mxcsr( &mxcsr );
1160         mxcsr &= KMP_X86_MXCSR_MASK;
1161 
1162         if ( team->t.t_x87_fpu_control_word != x87_fpu_control_word ) {
1163             __kmp_clear_x87_fpu_status_word();
1164             __kmp_load_x87_fpu_control_word( &team->t.t_x87_fpu_control_word );
1165         }
1166 
1167         if ( team->t.t_mxcsr != mxcsr ) {
1168             __kmp_load_mxcsr( &team->t.t_mxcsr );
1169         }
1170     }
1171 }
1172 #else
1173 # define propagateFPControl(x) ((void)0)
1174 # define updateHWFPControl(x)  ((void)0)
1175 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
1176 
1177 static void
1178 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc ); // forward declaration
1179 
1180 /*
 * Run a parallel region that has been serialized, so it runs in a team of only the single master thread.
1182  */
1183 void
1184 __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid)
1185 {
1186     kmp_info_t *this_thr;
1187     kmp_team_t *serial_team;
1188 
1189     KC_TRACE( 10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid ) );
1190 
1191     /* Skip all this code for autopar serialized loops since it results in
1192        unacceptable overhead */
1193     if( loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR ) )
1194         return;
1195 
1196     if( ! TCR_4( __kmp_init_parallel ) )
1197         __kmp_parallel_initialize();
1198 
1199     this_thr     = __kmp_threads[ global_tid ];
1200     serial_team  = this_thr->th.th_serial_team;
1201 
1202     /* utilize the serialized team held by this thread */
1203     KMP_DEBUG_ASSERT( serial_team );
1204     KMP_MB();
1205 
1206     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1207         KMP_DEBUG_ASSERT(this_thr->th.th_task_team == this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
1208         KMP_DEBUG_ASSERT( serial_team->t.t_task_team[this_thr->th.th_task_state] == NULL );
1209         KA_TRACE( 20, ( "__kmpc_serialized_parallel: T#%d pushing task_team %p / team %p, new task_team = NULL\n",
1210                         global_tid, this_thr->th.th_task_team, this_thr->th.th_team ) );
1211         this_thr->th.th_task_team = NULL;
1212     }
1213 
1214 #if OMP_40_ENABLED
1215     kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
1216     if ( this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1217         proc_bind = proc_bind_false;
1218     }
1219     else if ( proc_bind == proc_bind_default ) {
1220         //
1221         // No proc_bind clause was specified, so use the current value
1222         // of proc-bind-var for this parallel region.
1223         //
1224         proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
1225     }
1226     //
1227     // Reset for next parallel region
1228     //
1229     this_thr->th.th_set_proc_bind = proc_bind_default;
1230 #endif /* OMP_40_ENABLED */
1231 
1232     if( this_thr->th.th_team != serial_team ) {
1233         // Nested level will be an index in the nested nthreads array
1234         int level = this_thr->th.th_team->t.t_level;
1235 
1236         if( serial_team->t.t_serialized ) {
1237             /* this serial team was already used
             * TODO: increase performance by making these locks more specific */
1239             kmp_team_t *new_team;
1240 
1241             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1242 
1243 #if OMPT_SUPPORT
1244             ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1245 #endif
1246 
1247             new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
1248 #if OMPT_SUPPORT
1249                                            ompt_parallel_id,
1250 #endif
1251 #if OMP_40_ENABLED
1252                                            proc_bind,
1253 #endif
1254                                            & this_thr->th.th_current_task->td_icvs,
1255                                            0 USE_NESTED_HOT_ARG(NULL) );
1256             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1257             KMP_ASSERT( new_team );
1258 
1259             /* setup new serialized team and install it */
1260             new_team->t.t_threads[0] = this_thr;
1261             new_team->t.t_parent = this_thr->th.th_team;
1262             serial_team = new_team;
1263             this_thr->th.th_serial_team = serial_team;
1264 
1265             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
1266                             global_tid, serial_team ) );
1267 
1268 
1269             /* TODO the above breaks the requirement that if we run out of
1270              * resources, then we can still guarantee that serialized teams
1271              * are ok, since we may need to allocate a new one */
1272         } else {
1273             KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
1274                             global_tid, serial_team ) );
1275         }
1276 
1277         /* we have to initialize this serial team */
1278         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1279         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1280         KMP_DEBUG_ASSERT( this_thr->th.th_team != serial_team );
1281         serial_team->t.t_ident         = loc;
1282         serial_team->t.t_serialized    = 1;
1283         serial_team->t.t_nproc         = 1;
1284         serial_team->t.t_parent        = this_thr->th.th_team;
1285         serial_team->t.t_sched         = this_thr->th.th_team->t.t_sched;
1286         this_thr->th.th_team           = serial_team;
1287         serial_team->t.t_master_tid    = this_thr->th.th_info.ds.ds_tid;
1288 
1289         KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#d curtask=%p\n",
1290                         global_tid, this_thr->th.th_current_task ) );
1291         KMP_ASSERT( this_thr->th.th_current_task->td_flags.executing == 1 );
1292         this_thr->th.th_current_task->td_flags.executing = 0;
1293 
1294         __kmp_push_current_task_to_thread( this_thr, serial_team, 0 );
1295 
1296         /* TODO: GEH: do the ICVs work for nested serialized teams?  Don't we need an implicit task for
1297            each serialized task represented by team->t.t_serialized? */
1298         copy_icvs(
1299                   & this_thr->th.th_current_task->td_icvs,
1300                   & this_thr->th.th_current_task->td_parent->td_icvs );
1301 
1302         // Thread value exists in the nested nthreads array for the next nested level
1303         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1304             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1305         }
1306 
1307 #if OMP_40_ENABLED
1308         if ( __kmp_nested_proc_bind.used && ( level + 1 < __kmp_nested_proc_bind.used ) ) {
1309             this_thr->th.th_current_task->td_icvs.proc_bind
1310                 = __kmp_nested_proc_bind.bind_types[ level + 1 ];
1311         }
1312 #endif /* OMP_40_ENABLED */
1313 
1314 #if USE_DEBUGGER
1315         serial_team->t.t_pkfn = (microtask_t)( ~0 ); // For the debugger.
1316 #endif
1317         this_thr->th.th_info.ds.ds_tid = 0;
1318 
1319         /* set thread cache values */
1320         this_thr->th.th_team_nproc     = 1;
1321         this_thr->th.th_team_master    = this_thr;
1322         this_thr->th.th_team_serialized = 1;
1323 
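        // A serialized region increases the nesting level; the active level is
        // inherited unchanged from the parent.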
1324         serial_team->t.t_level        = serial_team->t.t_parent->t.t_level + 1;
1325         serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
1326 
1327         propagateFPControl (serial_team);
1328 
1329         /* check if we need to allocate dispatch buffers stack */
1330         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1331         if ( !serial_team->t.t_dispatch->th_disp_buffer ) {
1332             serial_team->t.t_dispatch->th_disp_buffer = (dispatch_private_info_t *)
1333                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1334         }
1335         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1336 
1337 #if OMPT_SUPPORT
1338         ompt_parallel_id_t ompt_parallel_id = __ompt_parallel_id_new(global_tid);
1339         __ompt_team_assign_id(serial_team, ompt_parallel_id);
1340 #endif
1341 
1342         KMP_MB();
1343 
1344     } else {
1345         /* this serialized team is already being used,
1346          * that's fine, just add another nested level */
1347         KMP_DEBUG_ASSERT( this_thr->th.th_team == serial_team );
1348         KMP_DEBUG_ASSERT( serial_team->t.t_threads );
1349         KMP_DEBUG_ASSERT( serial_team->t.t_threads[0] == this_thr );
1350         ++ serial_team->t.t_serialized;
1351         this_thr->th.th_team_serialized = serial_team->t.t_serialized;
1352 
1353         // Nested level will be an index in the nested nthreads array
1354         int level = this_thr->th.th_team->t.t_level;
1355         // Thread value exists in the nested nthreads array for the next nested level
1356         if ( __kmp_nested_nth.used && ( level + 1 < __kmp_nested_nth.used ) ) {
1357             this_thr->th.th_current_task->td_icvs.nproc = __kmp_nested_nth.nth[ level + 1 ];
1358         }
1359         serial_team->t.t_level++;
1360         KF_TRACE( 10, ( "__kmpc_serialized_parallel: T#%d increasing nesting level of serial team %p to %d\n",
1361                         global_tid, serial_team, serial_team->t.t_level ) );
1362 
1363         /* allocate/push dispatch buffers stack */
1364         KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
1365         {
1366             dispatch_private_info_t * disp_buffer = (dispatch_private_info_t *)
1367                 __kmp_allocate( sizeof( dispatch_private_info_t ) );
1368             disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
1369             serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
1370         }
1371         this_thr->th.th_dispatch = serial_team->t.t_dispatch;
1372 
1373         KMP_MB();
1374     }
1375 #if OMP_40_ENABLED
1376     KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
1377 #endif
1378 
1379     if ( __kmp_env_consistency_check )
1380         __kmp_push_parallel( global_tid, NULL );
1381 
1382 }
1383 
1384 /* most of the work for a fork */
1385 /* return true if we really went parallel, false if serialized */
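/* A hedged sketch of how control typically reaches __kmp_fork_call(): the compiler
   outlines the body of a "#pragma omp parallel" into a microtask and emits a call to
   the __kmpc_fork_call() entry point, which forwards here with fork_context_intel and
   the variadic shared arguments. Simplified, assumed lowering:

       void outlined(kmp_int32 *gtid, kmp_int32 *btid, int *shared_x) {
           // ... body of the parallel region; *shared_x is a captured variable ...
       }
       // compiler-generated call site:
       //   __kmpc_fork_call(&loc, 1, (kmpc_micro)outlined, &x);
*/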
1386 int
1387 __kmp_fork_call(
1388     ident_t   * loc,
1389     int         gtid,
1390     enum fork_context_e  call_context, // Intel, GNU, ...
1391     kmp_int32   argc,
1392 #if OMPT_SUPPORT
1393     void       *unwrapped_task,
1394 #endif
1395     microtask_t microtask,
1396     launch_t    invoker,
1397 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1398 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1399     va_list   * ap
1400 #else
1401     va_list     ap
1402 #endif
1403     )
1404 {
1405     void          **argv;
1406     int             i;
1407     int             master_tid;
1408     int             master_this_cons;
1409     kmp_team_t     *team;
1410     kmp_team_t     *parent_team;
1411     kmp_info_t     *master_th;
1412     kmp_root_t     *root;
1413     int             nthreads;
1414     int             master_active;
1415     int             master_set_numthreads;
1416     int             level;
1417 #if OMP_40_ENABLED
1418     int             active_level;
1419     int             teams_level;
1420 #endif
1421 #if KMP_NESTED_HOT_TEAMS
1422     kmp_hot_team_ptr_t **p_hot_teams;
1423 #endif
1424     { // KMP_TIME_BLOCK
1425     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
1426     KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);
1427 
1428     KA_TRACE( 20, ("__kmp_fork_call: enter T#%d\n", gtid ));
1429     if ( __kmp_stkpadding > 0 &&  __kmp_root[gtid] != NULL ) {
1430         /* Some systems prefer the stack for the root thread(s) to start with */
1431         /* some gap from the parent stack to prevent false sharing. */
1432         void *dummy = KMP_ALLOCA(__kmp_stkpadding);
1433         /* These 2 lines below are so this does not get optimized out */
1434         if ( __kmp_stkpadding > KMP_MAX_STKPADDING )
1435             __kmp_stkpadding += (short)((kmp_int64)dummy);
1436     }
1437 
1438     /* initialize if needed */
1439     KMP_DEBUG_ASSERT( __kmp_init_serial ); // AC: potentially unsafe, not in sync with shutdown
1440     if( ! TCR_4(__kmp_init_parallel) )
1441         __kmp_parallel_initialize();
1442 
1443     /* setup current data */
1444     master_th     = __kmp_threads[ gtid ]; // AC: potentially unsafe, not in sync with shutdown
1445     parent_team   = master_th->th.th_team;
1446     master_tid    = master_th->th.th_info.ds.ds_tid;
1447     master_this_cons = master_th->th.th_local.this_construct;
1448     root          = master_th->th.th_root;
1449     master_active = root->r.r_active;
1450     master_set_numthreads = master_th->th.th_set_nproc;
1451 
1452 #if OMPT_SUPPORT
1453     ompt_parallel_id_t ompt_parallel_id;
1454     ompt_task_id_t ompt_task_id;
1455     ompt_frame_t *ompt_frame;
1456     ompt_task_id_t my_task_id;
1457     ompt_parallel_id_t my_parallel_id;
1458 
1459     if (ompt_enabled) {
1460         ompt_parallel_id = __ompt_parallel_id_new(gtid);
1461         ompt_task_id = __ompt_get_task_id_internal(0);
1462         ompt_frame = __ompt_get_task_frame_internal(0);
1463     }
1464 #endif
1465 
1466     // Nested level will be an index in the nested nthreads array
1467     level         = parent_team->t.t_level;
    active_level  = parent_team->t.t_active_level; // used to launch non-serialized teams even if nesting is not allowed
1469 #if OMP_40_ENABLED
1470     teams_level    = master_th->th.th_teams_level; // needed to check nesting inside the teams
1471 #endif
1472 #if KMP_NESTED_HOT_TEAMS
1473     p_hot_teams   = &master_th->th.th_hot_teams;
1474     if( *p_hot_teams == NULL && __kmp_hot_teams_max_level > 0 ) {
1475         *p_hot_teams = (kmp_hot_team_ptr_t*)__kmp_allocate(
1476                 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
1477         (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
        (*p_hot_teams)[0].hot_team_nth = 1; // it is either the actual value or not needed (when active_level > 0)
1479     }
1480 #endif
1481 
1482 #if OMPT_SUPPORT
1483     if (ompt_enabled &&
1484         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)) {
1485         int team_size = master_set_numthreads;
1486 
1487         ompt_callbacks.ompt_callback(ompt_event_parallel_begin)(
1488             ompt_task_id, ompt_frame, ompt_parallel_id,
1489             team_size, unwrapped_task, OMPT_INVOKER(call_context));
1490     }
1491 #endif
1492 
1493     master_th->th.th_ident = loc;
1494 
1495 #if OMP_40_ENABLED
1496     if ( master_th->th.th_teams_microtask &&
1497          ap && microtask != (microtask_t)__kmp_teams_master && level == teams_level ) {
1498         // AC: This is start of parallel that is nested inside teams construct.
1499         //     The team is actual (hot), all workers are ready at the fork barrier.
1500         //     No lock needed to initialize the team a bit, then free workers.
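        // Hedged illustration of the user-code shape that takes this branch
        // (a parallel region closely nested inside a teams construct):
        //
        //     #pragma omp teams num_teams(2) thread_limit(8)
        //     {
        //         #pragma omp parallel num_threads(4)
        //         { /* ... */ }
        //     }
        //
        // Each team master arrives here with level == teams_level and ap != NULL.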
1501         parent_team->t.t_ident = loc;
1502         __kmp_alloc_argv_entries( argc, parent_team, TRUE );
1503         parent_team->t.t_argc  = argc;
1504         argv = (void**)parent_team->t.t_argv;
1505         for( i=argc-1; i >= 0; --i )
1506 /* TODO: revert workaround for Intel(R) 64 tracker #96 */
1507 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1508             *argv++ = va_arg( *ap, void * );
1509 #else
1510             *argv++ = va_arg( ap, void * );
1511 #endif
        /* Increment our nested depth level, but do not increase the serialization count */
1513         if ( parent_team == master_th->th.th_serial_team ) {
1514             // AC: we are in serialized parallel
1515             __kmpc_serialized_parallel(loc, gtid);
1516             KMP_DEBUG_ASSERT( parent_team->t.t_serialized > 1 );
            parent_team->t.t_serialized--; // AC: need this so that enquiry functions
                                           //     work correctly; will be restored at join time
1519 
1520 #if OMPT_SUPPORT
1521             void *dummy;
1522             void **exit_runtime_p;
1523 
1524             ompt_lw_taskteam_t lw_taskteam;
1525 
1526             if (ompt_enabled) {
1527                 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1528                     unwrapped_task, ompt_parallel_id);
1529                 lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1530                 exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1531 
1532                 __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1533 
1534 #if OMPT_TRACE
1535                 /* OMPT implicit task begin */
1536                 my_task_id = lw_taskteam.ompt_task_info.task_id;
1537                 my_parallel_id = parent_team->t.ompt_team_info.parallel_id;
1538                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1539                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1540                         my_parallel_id, my_task_id);
1541                 }
1542 #endif
1543 
1544                 /* OMPT state */
1545                 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1546             } else {
1547                 exit_runtime_p = &dummy;
1548             }
1549 #endif
1550 
1551             {
1552                 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1553                 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1554                 __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1555 #if OMPT_SUPPORT
1556                                         , exit_runtime_p
1557 #endif
1558                                         );
1559             }
1560 
1561 #if OMPT_SUPPORT
1562             *exit_runtime_p = NULL;
1563             if (ompt_enabled) {
1564 #if OMPT_TRACE
1565                 lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1566 
1567                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1568                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1569                         ompt_parallel_id, ompt_task_id);
1570                 }
1571 
1572                 __ompt_lw_taskteam_unlink(master_th);
                // reset/clear the task id only after unlinking the task
1574                 lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1575 #endif
1576 
1577                 if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1578                     ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1579                         ompt_parallel_id, ompt_task_id,
1580                         OMPT_INVOKER(call_context));
1581                 }
1582                 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1583             }
1584 #endif
1585             return TRUE;
1586         }
1587 
1588         parent_team->t.t_pkfn  = microtask;
1589 #if OMPT_SUPPORT
1590         parent_team->t.ompt_team_info.microtask = unwrapped_task;
1591 #endif
1592         parent_team->t.t_invoke = invoker;
1593         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1594         parent_team->t.t_active_level ++;
1595         parent_team->t.t_level ++;
1596 
1597         /* Change number of threads in the team if requested */
        if ( master_set_numthreads ) {   // The parallel region has a num_threads clause
            if ( master_set_numthreads < master_th->th.th_teams_size.nth ) {
                // AC: the number of threads can only be reduced dynamically, not increased
1601                 kmp_info_t **other_threads = parent_team->t.t_threads;
1602                 parent_team->t.t_nproc = master_set_numthreads;
1603                 for ( i = 0; i < master_set_numthreads; ++i ) {
1604                     other_threads[i]->th.th_team_nproc = master_set_numthreads;
1605                 }
1606                 // Keep extra threads hot in the team for possible next parallels
1607             }
1608             master_th->th.th_set_nproc = 0;
1609         }
1610 
#if USE_DEBUGGER
        if ( __kmp_debugging ) {    // Let debugger override number of threads.
            int nth = __kmp_omp_num_threads( loc );
            if ( nth > 0 ) {        // 0 means debugger does not want to change number of threads.
                master_set_numthreads = nth;
            } // if
        } // if
#endif
1619 
1620         KF_TRACE( 10, ( "__kmp_fork_call: before internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1621         __kmp_internal_fork( loc, gtid, parent_team );
1622         KF_TRACE( 10, ( "__kmp_fork_call: after internal fork: root=%p, team=%p, master_th=%p, gtid=%d\n", root, parent_team, master_th, gtid ) );
1623 
1624         /* Invoke microtask for MASTER thread */
1625         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
1626                     gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1627 
1628         {
1629             KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1630             KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1631             if (! parent_team->t.t_invoke( gtid )) {
1632                 KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
1633             }
1634         }
1635         KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
1636             gtid, parent_team->t.t_id, parent_team->t.t_pkfn ) );
1637         KMP_MB();       /* Flush all pending memory write invalidates.  */
1638 
1639         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
1640 
1641         return TRUE;
1642     } // Parallel closely nested in teams construct
1643 #endif /* OMP_40_ENABLED */
1644 
1645 #if KMP_DEBUG
1646     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
1647         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
1648     }
1649 #endif
1650 
1651     if ( parent_team->t.t_active_level >= master_th->th.th_current_task->td_icvs.max_active_levels ) {
1652         nthreads = 1;
1653     } else {
1654 #if OMP_40_ENABLED
1655         int enter_teams = ((ap==NULL && active_level==0)||(ap && teams_level>0 && teams_level==level));
1656 #endif
1657         nthreads = master_set_numthreads ?
1658             master_set_numthreads : get__nproc_2( parent_team, master_tid ); // TODO: get nproc directly from current task
1659 
        // Check if we need to take the forkjoin lock (no need for a serialized parallel region outside of a teams construct).
        // This code was moved here from __kmp_reserve_threads() to speed up nested serialized parallel regions.
1662         if (nthreads > 1) {
1663             if ( ( !get__nested(master_th) && (root->r.r_in_parallel
1664 #if OMP_40_ENABLED
1665                 && !enter_teams
1666 #endif /* OMP_40_ENABLED */
1667             ) ) || ( __kmp_library == library_serial ) ) {
1668                 KC_TRACE( 10, ( "__kmp_fork_call: T#%d serializing team; requested %d threads\n",
1669                                 gtid, nthreads ));
1670                 nthreads = 1;
1671             }
1672         }
1673         if ( nthreads > 1 ) {
1674             /* determine how many new threads we can use */
1675             __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
1676 
1677             nthreads = __kmp_reserve_threads(root, parent_team, master_tid, nthreads
1678 #if OMP_40_ENABLED
/* AC: If we execute teams from a parallel region (on the host), then the teams should be created,
   but each can have only 1 thread if nesting is disabled. If teams is called from a serial region,
   then the teams and their threads should be created regardless of the nesting setting. */
1682                                          , enter_teams
1683 #endif /* OMP_40_ENABLED */
1684                                          );
1685             if ( nthreads == 1 ) {
                // Free the lock for single-threaded execution here;
                // for multi-threaded execution it will be freed later,
                // after the team of threads has been created and initialized
1689                 __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
1690             }
1691         }
1692     }
1693     KMP_DEBUG_ASSERT( nthreads > 0 );
1694 
1695     /* If we temporarily changed the set number of threads then restore it now */
1696     master_th->th.th_set_nproc = 0;
1697 
1698     /* create a serialized parallel region? */
1699     if ( nthreads == 1 ) {
1700         /* josh todo: hypothetical question: what do we do for OS X*? */
1701 #if KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64)
1702         void *   args[ argc ];
1703 #else
1704         void * * args = (void**) KMP_ALLOCA( argc * sizeof( void * ) );
1705 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) */
1706 
1707         KA_TRACE( 20, ("__kmp_fork_call: T#%d serializing parallel region\n", gtid ));
1708 
1709         __kmpc_serialized_parallel(loc, gtid);
1710 
1711         if ( call_context == fork_context_intel ) {
1712             /* TODO this sucks, use the compiler itself to pass args! :) */
1713             master_th->th.th_serial_team->t.t_ident = loc;
1714 #if OMP_40_ENABLED
1715             if ( !ap ) {
1716                 // revert change made in __kmpc_serialized_parallel()
1717                 master_th->th.th_serial_team->t.t_level--;
1718                 // Get args from parent team for teams construct
1719 
1720 #if OMPT_SUPPORT
1721                 void *dummy;
1722                 void **exit_runtime_p;
1723 
1724                 ompt_lw_taskteam_t lw_taskteam;
1725 
1726                 if (ompt_enabled) {
1727                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1728                         unwrapped_task, ompt_parallel_id);
1729                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1730                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1731 
1732                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1733 
1734 #if OMPT_TRACE
1735                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1736                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1737                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1738                             ompt_parallel_id, my_task_id);
1739                     }
1740 #endif
1741 
1742                     /* OMPT state */
1743                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1744                 } else {
1745                     exit_runtime_p = &dummy;
1746                 }
1747 #endif
1748 
1749                 {
1750                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1751                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1752                     __kmp_invoke_microtask( microtask, gtid, 0, argc, parent_team->t.t_argv
1753 #if OMPT_SUPPORT
1754                         , exit_runtime_p
1755 #endif
1756                     );
1757                 }
1758 
1759 #if OMPT_SUPPORT
1760                 *exit_runtime_p = NULL;
1761                 if (ompt_enabled) {
1762                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1763 
1764 #if OMPT_TRACE
1765                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1766                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1767                             ompt_parallel_id, ompt_task_id);
1768                     }
1769 #endif
1770 
1771                     __ompt_lw_taskteam_unlink(master_th);
                    // reset/clear the task id only after unlinking the task
1773                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1774 
1775                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1776                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1777                             ompt_parallel_id, ompt_task_id,
1778                             OMPT_INVOKER(call_context));
1779                     }
1780                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1781                 }
1782 #endif
1783             } else if ( microtask == (microtask_t)__kmp_teams_master ) {
1784                 KMP_DEBUG_ASSERT( master_th->th.th_team == master_th->th.th_serial_team );
1785                 team = master_th->th.th_team;
1786                 //team->t.t_pkfn = microtask;
1787                 team->t.t_invoke = invoker;
1788                 __kmp_alloc_argv_entries( argc, team, TRUE );
1789                 team->t.t_argc = argc;
1790                 argv = (void**) team->t.t_argv;
1791                 if ( ap ) {
1792                     for( i=argc-1; i >= 0; --i )
1793 // TODO: revert workaround for Intel(R) 64 tracker #96
1794 # if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1795                         *argv++ = va_arg( *ap, void * );
1796 # else
1797                         *argv++ = va_arg( ap, void * );
1798 # endif
1799                 } else {
1800                     for( i=0; i < argc; ++i )
1801                         // Get args from parent team for teams construct
1802                         argv[i] = parent_team->t.t_argv[i];
1803                 }
1804                 // AC: revert change made in __kmpc_serialized_parallel()
1805                 //     because initial code in teams should have level=0
1806                 team->t.t_level--;
1807                 // AC: call special invoker for outer "parallel" of the teams construct
1808                 {
1809                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1810                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1811                     invoker(gtid);
1812                 }
1813             } else {
1814 #endif /* OMP_40_ENABLED */
1815                 argv = args;
1816                 for( i=argc-1; i >= 0; --i )
1817 // TODO: revert workaround for Intel(R) 64 tracker #96
1818 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
1819                     *argv++ = va_arg( *ap, void * );
1820 #else
1821                     *argv++ = va_arg( ap, void * );
1822 #endif
1823                 KMP_MB();
1824 
1825 #if OMPT_SUPPORT
1826                 void *dummy;
1827                 void **exit_runtime_p;
1828 
1829                 ompt_lw_taskteam_t lw_taskteam;
1830 
1831                 if (ompt_enabled) {
1832                     __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1833                         unwrapped_task, ompt_parallel_id);
1834                     lw_taskteam.ompt_task_info.task_id = __ompt_task_id_new(gtid);
1835                     exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);
1836 
1837                     __ompt_lw_taskteam_link(&lw_taskteam, master_th);
1838 
1839 #if OMPT_TRACE
1840                     /* OMPT implicit task begin */
1841                     my_task_id = lw_taskteam.ompt_task_info.task_id;
1842                     my_parallel_id = ompt_parallel_id;
1843                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
1844                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
1845                             my_parallel_id, my_task_id);
1846                     }
1847 #endif
1848 
1849                     /* OMPT state */
1850                     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1851                 } else {
1852                     exit_runtime_p = &dummy;
1853                 }
1854 #endif
1855 
1856                 {
1857                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1858                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1859                     __kmp_invoke_microtask( microtask, gtid, 0, argc, args
1860 #if OMPT_SUPPORT
1861                         , exit_runtime_p
1862 #endif
1863                     );
1864                 }
1865 
1866 #if OMPT_SUPPORT
1867                 *exit_runtime_p = NULL;
1868                 if (ompt_enabled) {
1869 #if OMPT_TRACE
1870                     lw_taskteam.ompt_task_info.frame.exit_runtime_frame = NULL;
1871 
1872                     if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
1873                         ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
1874                             my_parallel_id, my_task_id);
1875                     }
1876 #endif
1877 
1878                     __ompt_lw_taskteam_unlink(master_th);
                    // reset/clear the task id only after unlinking the task
1880                     lw_taskteam.ompt_task_info.task_id = ompt_task_id_none;
1881 
1882                     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
1883                         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
1884                             ompt_parallel_id, ompt_task_id,
1885                             OMPT_INVOKER(call_context));
1886                     }
1887                     master_th->th.ompt_thread_info.state = ompt_state_overhead;
1888                 }
1889 #endif
1890 #if OMP_40_ENABLED
1891             }
1892 #endif /* OMP_40_ENABLED */
1893         }
1894         else if ( call_context == fork_context_gnu ) {
1895 #if OMPT_SUPPORT
1896             ompt_lw_taskteam_t *lwt = (ompt_lw_taskteam_t *)
1897                 __kmp_allocate(sizeof(ompt_lw_taskteam_t));
1898             __ompt_lw_taskteam_init(lwt, master_th, gtid,
1899                 unwrapped_task, ompt_parallel_id);
1900 
1901             lwt->ompt_task_info.task_id = __ompt_task_id_new(gtid);
1902             lwt->ompt_task_info.frame.exit_runtime_frame = NULL;
1903             __ompt_lw_taskteam_link(lwt, master_th);
1904 #endif
1905 
1906             // we were called from GNU native code
1907             KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1908             return FALSE;
1909         }
1910         else {
1911             KMP_ASSERT2( call_context < fork_context_last, "__kmp_fork_call: unknown fork_context parameter" );
1912         }
1913 
1914 
1915         KA_TRACE( 20, ("__kmp_fork_call: T#%d serial exit\n", gtid ));
1916         KMP_MB();
1917         return FALSE;
1918     }
1919 
1920     // GEH: only modify the executing flag in the case when not serialized
1921     //      serialized case is handled in kmpc_serialized_parallel
1922     KF_TRACE( 10, ( "__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, curtask=%p, curtask_max_aclevel=%d\n",
1923                   parent_team->t.t_active_level, master_th, master_th->th.th_current_task,
1924                   master_th->th.th_current_task->td_icvs.max_active_levels ) );
1925     // TODO: GEH - cannot do this assertion because root thread not set up as executing
1926     // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1927     master_th->th.th_current_task->td_flags.executing = 0;
1928 
1929 #if OMP_40_ENABLED
1930     if ( !master_th->th.th_teams_microtask || level > teams_level )
1931 #endif /* OMP_40_ENABLED */
1932     {
1933         /* Increment our nested depth level */
1934         KMP_TEST_THEN_INC32( (kmp_int32*) &root->r.r_in_parallel );
1935     }
1936 
1937     // See if we need to make a copy of the ICVs.
1938     int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1939     if ((level+1 < __kmp_nested_nth.used) && (__kmp_nested_nth.nth[level+1] != nthreads_icv)) {
1940         nthreads_icv = __kmp_nested_nth.nth[level+1];
1941     }
1942     else {
1943         nthreads_icv = 0;  // don't update
1944     }
1945 
1946 #if OMP_40_ENABLED
1947     // Figure out the proc_bind_policy for the new team.
1948     kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1949     kmp_proc_bind_t proc_bind_icv = proc_bind_default; // proc_bind_default means don't update
1950     if ( master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false ) {
1951         proc_bind = proc_bind_false;
1952     }
1953     else {
1954         if (proc_bind == proc_bind_default) {
1955             // No proc_bind clause specified; use current proc-bind-var for this parallel region
1956             proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1957         }
1958         /* else: The proc_bind policy was specified explicitly on parallel clause. This
1959            overrides proc-bind-var for this parallel region, but does not change proc-bind-var. */
1960         // Figure the value of proc-bind-var for the child threads.
1961         if ((level+1 < __kmp_nested_proc_bind.used)
1962             && (__kmp_nested_proc_bind.bind_types[level+1] != master_th->th.th_current_task->td_icvs.proc_bind)) {
1963             proc_bind_icv = __kmp_nested_proc_bind.bind_types[level+1];
1964         }
1965     }
1966 
1967     // Reset for next parallel region
1968     master_th->th.th_set_proc_bind = proc_bind_default;
1969 #endif /* OMP_40_ENABLED */
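    /* Hedged illustration of the proc_bind selection above: an explicit clause such as

           #pragma omp parallel proc_bind(spread) num_threads(4)
           { }

       overrides proc-bind-var for this region only, while a list such as
       OMP_PROC_BIND=spread,close seeds __kmp_nested_proc_bind so that inner levels
       pick up their own policy. */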
1970 
1971     if ((nthreads_icv > 0)
1972 #if OMP_40_ENABLED
1973         || (proc_bind_icv != proc_bind_default)
1974 #endif /* OMP_40_ENABLED */
1975         ) {
1976         kmp_internal_control_t new_icvs;
1977         copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs);
1978         new_icvs.next = NULL;
1979         if (nthreads_icv > 0) {
1980             new_icvs.nproc = nthreads_icv;
1981         }
1982 
1983 #if OMP_40_ENABLED
1984         if (proc_bind_icv != proc_bind_default) {
1985             new_icvs.proc_bind = proc_bind_icv;
1986         }
1987 #endif /* OMP_40_ENABLED */
1988 
1989         /* allocate a new parallel team */
1990         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
1991         team = __kmp_allocate_team(root, nthreads, nthreads,
1992 #if OMPT_SUPPORT
1993                                    ompt_parallel_id,
1994 #endif
1995 #if OMP_40_ENABLED
1996                                    proc_bind,
1997 #endif
1998                                    &new_icvs, argc USE_NESTED_HOT_ARG(master_th) );
1999     } else {
2000         /* allocate a new parallel team */
2001         KF_TRACE( 10, ( "__kmp_fork_call: before __kmp_allocate_team\n" ) );
2002         team = __kmp_allocate_team(root, nthreads, nthreads,
2003 #if OMPT_SUPPORT
2004                                    ompt_parallel_id,
2005 #endif
2006 #if OMP_40_ENABLED
2007                                    proc_bind,
2008 #endif
2009                                    &master_th->th.th_current_task->td_icvs, argc
2010                                    USE_NESTED_HOT_ARG(master_th) );
2011     }
2012     KF_TRACE( 10, ( "__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team ) );
2013 
2014     /* setup the new team */
2015     KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid);
2016     KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons);
2017     KMP_CHECK_UPDATE(team->t.t_ident, loc);
2018     KMP_CHECK_UPDATE(team->t.t_parent, parent_team);
2019     KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask);
2020 #if OMPT_SUPPORT
2021     KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.microtask, unwrapped_task);
2022 #endif
2023     KMP_CHECK_UPDATE(team->t.t_invoke, invoker);  /* TODO move this to root, maybe */
2024     // TODO: parent_team->t.t_level == INT_MAX ???
2025 #if OMP_40_ENABLED
2026     if ( !master_th->th.th_teams_microtask || level > teams_level ) {
2027 #endif /* OMP_40_ENABLED */
2028         int new_level = parent_team->t.t_level + 1;
2029         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2030         new_level = parent_team->t.t_active_level + 1;
2031         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2032 #if OMP_40_ENABLED
2033     } else {
2034         // AC: Do not increase parallel level at start of the teams construct
2035         int new_level = parent_team->t.t_level;
2036         KMP_CHECK_UPDATE(team->t.t_level, new_level);
2037         new_level = parent_team->t.t_active_level;
2038         KMP_CHECK_UPDATE(team->t.t_active_level, new_level);
2039     }
2040 #endif /* OMP_40_ENABLED */
2041     kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid);
2042     if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || team->t.t_sched.chunk != new_sched.chunk)
2043         team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
2044 
2045 #if OMP_40_ENABLED
2046     KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq);
2047 #endif
2048 
2049     // Update the floating point rounding in the team if required.
2050     propagateFPControl(team);
2051 
2052     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
        // Set master's task team to the team's task team. Unless this is a hot team, it should be NULL.
2054 #if 0
2055         // Patch out an assertion that trips while the runtime seems to operate correctly.
2056         // Avoiding the preconditions that cause the assertion to trip has been promised as a forthcoming patch.
2057         KMP_DEBUG_ASSERT(master_th->th.th_task_team == parent_team->t.t_task_team[master_th->th.th_task_state]);
2058 #endif
2059         KA_TRACE( 20, ( "__kmp_fork_call: Master T#%d pushing task_team %p / team %p, new task_team %p / team %p\n",
2060                       __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team,
2061                       parent_team, team->t.t_task_team[master_th->th.th_task_state], team ) );
2062 
2063         if ( active_level || master_th->th.th_task_team ) {
2064             // Take a memo of master's task_state
2065             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2066             if (master_th->th.th_task_state_top >= master_th->th.th_task_state_stack_sz) { // increase size
2067                 kmp_uint32 new_size = 2*master_th->th.th_task_state_stack_sz;
2068                 kmp_uint8 *old_stack, *new_stack;
2069                 kmp_uint32 i;
2070                 new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
2071                 for (i=0; i<master_th->th.th_task_state_stack_sz; ++i) {
2072                     new_stack[i] = master_th->th.th_task_state_memo_stack[i];
2073                 }
2074                 for (i=master_th->th.th_task_state_stack_sz; i<new_size; ++i) { // zero-init rest of stack
2075                     new_stack[i] = 0;
2076                 }
2077                 old_stack = master_th->th.th_task_state_memo_stack;
2078                 master_th->th.th_task_state_memo_stack = new_stack;
2079                 master_th->th.th_task_state_stack_sz = new_size;
2080                 __kmp_free(old_stack);
2081             }
2082             // Store master's task_state on stack
2083             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2084             master_th->th.th_task_state_top++;
2085 #if KMP_NESTED_HOT_TEAMS
2086             if (team == master_th->th.th_hot_teams[active_level].hot_team) { // Restore master's nested state if nested hot team
2087                 master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2088             }
2089             else {
2090 #endif
2091                 master_th->th.th_task_state = 0;
2092 #if KMP_NESTED_HOT_TEAMS
2093             }
2094 #endif
2095         }
2096 #if !KMP_NESTED_HOT_TEAMS
2097         KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || (team == root->r.r_hot_team));
2098 #endif
2099     }
2100 
2101     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n",
2102                 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, team->t.t_nproc ));
2103     KMP_DEBUG_ASSERT( team != root->r.r_hot_team ||
2104                       ( team->t.t_master_tid == 0 &&
2105                         ( team->t.t_parent == root->r.r_root_team || team->t.t_parent->t.t_serialized ) ));
2106     KMP_MB();
2107 
2108     /* now, setup the arguments */
2109     argv = (void**)team->t.t_argv;
2110 #if OMP_40_ENABLED
2111     if ( ap ) {
2112 #endif /* OMP_40_ENABLED */
2113         for ( i=argc-1; i >= 0; --i ) {
2114 // TODO: revert workaround for Intel(R) 64 tracker #96
2115 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
2116             void *new_argv = va_arg(*ap, void *);
2117 #else
2118             void *new_argv = va_arg(ap, void *);
2119 #endif
2120             KMP_CHECK_UPDATE(*argv, new_argv);
2121             argv++;
2122         }
2123 #if OMP_40_ENABLED
2124     } else {
2125         for ( i=0; i < argc; ++i ) {
2126             // Get args from parent team for teams construct
2127             KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]);
2128         }
2129     }
2130 #endif /* OMP_40_ENABLED */
2131 
2132     /* now actually fork the threads */
2133     KMP_CHECK_UPDATE(team->t.t_master_active, master_active);
2134     if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong
2135         root->r.r_active = TRUE;
2136 
2137     __kmp_fork_team_threads( root, team, master_th, gtid );
2138     __kmp_setup_icv_copy( team, nthreads, &master_th->th.th_current_task->td_icvs, loc );
2139 
2140 #if OMPT_SUPPORT
2141     master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
2142 #endif
2143 
2144     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2145 
2146 #if USE_ITT_BUILD
2147     if ( team->t.t_active_level == 1 // only report frames at level 1
2148 # if OMP_40_ENABLED
2149         && !master_th->th.th_teams_microtask // not in teams construct
2150 # endif /* OMP_40_ENABLED */
2151     ) {
2152 #if USE_ITT_NOTIFY
2153         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) &&
2154              ( __kmp_forkjoin_frames_mode == 3 || __kmp_forkjoin_frames_mode == 1 ) )
2155         {
2156             kmp_uint64 tmp_time = 0;
2157             if ( __itt_get_timestamp_ptr )
2158                 tmp_time = __itt_get_timestamp();
2159             // Internal fork - report frame begin
2160             master_th->th.th_frame_time  = tmp_time;
2161             if ( __kmp_forkjoin_frames_mode == 3 )
2162                 team->t.t_region_time = tmp_time;
2163         } else // only one notification scheme (either "submit" or "forking/joined", not both)
2164 #endif /* USE_ITT_NOTIFY */
2165         if ( ( __itt_frame_begin_v3_ptr || KMP_ITT_DEBUG ) &&
2166              __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode )
2167         { // Mark start of "parallel" region for VTune.
2168             __kmp_itt_region_forking(gtid, team->t.t_nproc, 0);
2169         }
2170     }
2171 #endif /* USE_ITT_BUILD */
2172 
2173     /* now go on and do the work */
2174     KMP_DEBUG_ASSERT( team == __kmp_threads[gtid]->th.th_team );
2175     KMP_MB();
2176     KF_TRACE(10, ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n",
2177                   root, team, master_th, gtid));
2178 
2179 #if USE_ITT_BUILD
2180     if ( __itt_stack_caller_create_ptr ) {
2181         team->t.t_stack_id = __kmp_itt_stack_caller_create(); // create new stack stitching id before entering fork barrier
2182     }
2183 #endif /* USE_ITT_BUILD */
2184 
2185 #if OMP_40_ENABLED
    if ( ap )   // AC: skip __kmp_internal_fork at the teams construct; let only the master threads execute
2187 #endif /* OMP_40_ENABLED */
2188     {
2189         __kmp_internal_fork( loc, gtid, team );
2190         KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, master_th=%p, gtid=%d\n",
2191                       root, team, master_th, gtid));
2192     }
2193 
2194     if (call_context == fork_context_gnu) {
2195         KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2196         return TRUE;
2197     }
2198 
2199     /* Invoke microtask for MASTER thread */
2200     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n",
2201                 gtid, team->t.t_id, team->t.t_pkfn ) );
2202     }  // END of timer KMP_fork_call block
2203 
2204     {
2205         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
2206         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
2207         if (! team->t.t_invoke( gtid )) {
2208             KMP_ASSERT2( 0, "cannot invoke microtask for MASTER thread" );
2209         }
2210     }
2211     KA_TRACE( 20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n",
2212         gtid, team->t.t_id, team->t.t_pkfn ) );
2213     KMP_MB();       /* Flush all pending memory write invalidates.  */
2214 
2215     KA_TRACE( 20, ("__kmp_fork_call: parallel exit T#%d\n", gtid ));
2216 
2217 #if OMPT_SUPPORT
2218     if (ompt_enabled) {
2219         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2220     }
2221 #endif
2222 
2223     return TRUE;
2224 }
2225 
2226 #if OMPT_SUPPORT
2227 static inline void
2228 __kmp_join_restore_state(
2229     kmp_info_t *thread,
2230     kmp_team_t *team)
2231 {
2232     // restore state outside the region
2233     thread->th.ompt_thread_info.state = ((team->t.t_serialized) ?
2234         ompt_state_work_serial : ompt_state_work_parallel);
2235 }
2236 
2237 static inline void
2238 __kmp_join_ompt(
2239     kmp_info_t *thread,
2240     kmp_team_t *team,
2241     ompt_parallel_id_t parallel_id,
2242     fork_context_e fork_context)
2243 {
2244     ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2245     if (ompt_callbacks.ompt_callback(ompt_event_parallel_end)) {
2246         ompt_callbacks.ompt_callback(ompt_event_parallel_end)(
2247             parallel_id, task_info->task_id, OMPT_INVOKER(fork_context));
2248     }
2249 
2250     task_info->frame.reenter_runtime_frame = NULL;
2251     __kmp_join_restore_state(thread,team);
2252 }
2253 #endif
2254 
2255 void
2256 __kmp_join_call(ident_t *loc, int gtid
2257 #if OMPT_SUPPORT
2258                , enum fork_context_e fork_context
2259 #endif
2260 #if OMP_40_ENABLED
2261                , int exit_teams
2262 #endif /* OMP_40_ENABLED */
2263 )
2264 {
2265     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call);
2266     kmp_team_t     *team;
2267     kmp_team_t     *parent_team;
2268     kmp_info_t     *master_th;
2269     kmp_root_t     *root;
2270     int             master_active;
2271     int             i;
2272 
2273     KA_TRACE( 20, ("__kmp_join_call: enter T#%d\n", gtid ));
2274 
2275     /* setup current data */
2276     master_th     = __kmp_threads[ gtid ];
2277     root          = master_th->th.th_root;
2278     team          = master_th->th.th_team;
2279     parent_team   = team->t.t_parent;
2280 
2281     master_th->th.th_ident = loc;
2282 
2283 #if OMPT_SUPPORT
2284     if (ompt_enabled) {
2285         master_th->th.ompt_thread_info.state = ompt_state_overhead;
2286     }
2287 #endif
2288 
2289 #if KMP_DEBUG
2290     if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) {
2291         KA_TRACE( 20, ( "__kmp_join_call: T#%d, old team = %p old task_team = %p, th_task_team = %p\n",
2292                          __kmp_gtid_from_thread( master_th ), team,
2293                          team->t.t_task_team[master_th->th.th_task_state], master_th->th.th_task_team) );
2294         KMP_DEBUG_ASSERT( master_th->th.th_task_team == team->t.t_task_team[master_th->th.th_task_state] );
2295     }
2296 #endif
2297 
2298     if( team->t.t_serialized ) {
2299 #if OMP_40_ENABLED
2300         if ( master_th->th.th_teams_microtask ) {
2301             // We are in teams construct
2302             int level = team->t.t_level;
2303             int tlevel = master_th->th.th_teams_level;
2304             if ( level == tlevel ) {
2305                 // AC: we haven't incremented it earlier at start of teams construct,
2306                 //     so do it here - at the end of teams construct
2307                 team->t.t_level++;
2308             } else if ( level == tlevel + 1 ) {
2309                 // AC: we are exiting parallel inside teams, need to increment serialization
2310                 //     in order to restore it in the next call to __kmpc_end_serialized_parallel
2311                 team->t.t_serialized++;
2312             }
2313         }
2314 #endif /* OMP_40_ENABLED */
2315         __kmpc_end_serialized_parallel( loc, gtid );
2316 
2317 #if OMPT_SUPPORT
2318         if (ompt_enabled) {
2319             __kmp_join_restore_state(master_th, parent_team);
2320         }
2321 #endif
2322 
2323         return;
2324     }
2325 
2326     master_active = team->t.t_master_active;
2327 
2328 #if OMP_40_ENABLED
2329     if (!exit_teams)
2330 #endif /* OMP_40_ENABLED */
2331     {
        // AC: No barrier for internal teams at exit from the teams construct,
        //     but there is a barrier for the external team (league).
2334         __kmp_internal_join( loc, gtid, team );
2335     }
2336 #if OMP_40_ENABLED
2337     else {
2338         master_th->th.th_task_state = 0; // AC: no tasking in teams (out of any parallel)
2339     }
2340 #endif /* OMP_40_ENABLED */
2341 
2342     KMP_MB();
2343 
2344 #if OMPT_SUPPORT
2345     ompt_parallel_id_t parallel_id = team->t.ompt_team_info.parallel_id;
2346 #endif
2347 
2348 #if USE_ITT_BUILD
2349     if ( __itt_stack_caller_create_ptr ) {
2350         __kmp_itt_stack_caller_destroy( (__itt_caller)team->t.t_stack_id ); // destroy the stack stitching id after join barrier
2351     }
2352 
2353     // Mark end of "parallel" region for VTune.
2354     if ( team->t.t_active_level == 1
2355 # if OMP_40_ENABLED
2356         && !master_th->th.th_teams_microtask /* not in teams construct */
2357 # endif /* OMP_40_ENABLED */
2358     ) {
2359         master_th->th.th_ident = loc;
2360         // only one notification scheme (either "submit" or "forking/joined", not both)
2361         if ( ( __itt_frame_submit_v3_ptr || KMP_ITT_DEBUG ) && __kmp_forkjoin_frames_mode == 3 )
2362             __kmp_itt_frame_submit( gtid, team->t.t_region_time, master_th->th.th_frame_time,
2363                                     0, loc, master_th->th.th_team_nproc, 1 );
2364         else if ( ( __itt_frame_end_v3_ptr || KMP_ITT_DEBUG ) &&
2365             ! __kmp_forkjoin_frames_mode && __kmp_forkjoin_frames )
2366             __kmp_itt_region_joined( gtid );
2367     } // active_level == 1
2368 #endif /* USE_ITT_BUILD */
2369 
2370 #if OMP_40_ENABLED
2371     if ( master_th->th.th_teams_microtask &&
2372          !exit_teams &&
2373          team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
2374          team->t.t_level == master_th->th.th_teams_level + 1 ) {
2375         // AC: We need to leave the team structure intact at the end
2376         //     of parallel inside the teams construct, so that at the next
2377         //     parallel same (hot) team works, only adjust nesting levels
2378 
2379         /* Decrement our nested depth level */
2380         team->t.t_level --;
2381         team->t.t_active_level --;
2382         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2383 
2384         /* Restore number of threads in the team if needed */
2385         if ( master_th->th.th_team_nproc < master_th->th.th_teams_size.nth ) {
2386             int old_num = master_th->th.th_team_nproc;
2387             int new_num = master_th->th.th_teams_size.nth;
2388             kmp_info_t **other_threads = team->t.t_threads;
2389             team->t.t_nproc = new_num;
2390             for ( i = 0; i < old_num; ++i ) {
2391                 other_threads[i]->th.th_team_nproc = new_num;
2392             }
2393             // Adjust states of non-used threads of the team
2394             for ( i = old_num; i < new_num; ++i ) {
2395                 // Re-initialize thread's barrier data.
2396                 int b;
2397                 kmp_balign_t * balign = other_threads[i]->th.th_bar;
2398                 for ( b = 0; b < bs_last_barrier; ++ b ) {
2399                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
2400                     KMP_DEBUG_ASSERT(balign[ b ].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
2401 #if USE_DEBUGGER
2402                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
2403 #endif
2404                 }
2405                 if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2406                     // Synchronize thread's task state
2407                     other_threads[i]->th.th_task_state = master_th->th.th_task_state;
2408                 }
2409             }
2410         }
2411 
2412 #if OMPT_SUPPORT
2413         if (ompt_enabled) {
2414             __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2415         }
2416 #endif
2417 
2418         return;
2419     }
2420 #endif /* OMP_40_ENABLED */
2421 
2422     /* do cleanup and restore the parent team */
    master_th->th.th_info.ds.ds_tid = team->t.t_master_tid;
2424     master_th->th.th_local.this_construct = team->t.t_master_this_cons;
2425 
2426     master_th->th.th_dispatch =
2427                 & parent_team->t.t_dispatch[ team->t.t_master_tid ];
2428 
2429     /* jc: The following lock has instructions with REL and ACQ semantics,
2430        separating the parallel user code called in this parallel region
2431        from the serial user code called after this function returns.
2432     */
2433     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2434 
2435 #if OMP_40_ENABLED
2436     if ( !master_th->th.th_teams_microtask || team->t.t_level > master_th->th.th_teams_level )
2437 #endif /* OMP_40_ENABLED */
2438     {
2439         /* Decrement our nested depth level */
2440         KMP_TEST_THEN_DEC32( (kmp_int32*) &root->r.r_in_parallel );
2441     }
2442     KMP_DEBUG_ASSERT( root->r.r_in_parallel >= 0 );
2443 
2444 #if OMPT_SUPPORT && OMPT_TRACE
2445     if(ompt_enabled){
2446         ompt_task_info_t *task_info = __ompt_get_taskinfo(0);
2447         if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
2448              ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
2449                parallel_id, task_info->task_id);
2450         }
2451         task_info->frame.exit_runtime_frame = NULL;
2452         task_info->task_id = 0;
2453     }
2454 #endif
2455 
2456     KF_TRACE( 10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n",
2457                    0, master_th, team ) );
2458     __kmp_pop_current_task_from_thread( master_th );
2459 
2460 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
2461     //
2462     // Restore master thread's partition.
2463     //
2464     master_th->th.th_first_place = team->t.t_first_place;
2465     master_th->th.th_last_place = team->t.t_last_place;
2466 #endif /* OMP_40_ENABLED */
2467 
2468     updateHWFPControl (team);
2469 
2470     if ( root->r.r_active != master_active )
2471         root->r.r_active = master_active;
2472 
2473     __kmp_free_team( root, team USE_NESTED_HOT_ARG(master_th) ); // this will free worker threads
2474 
    /* This race was fun to find.  Make sure the following is in the critical
     * region; otherwise assertions may fail occasionally since the old team
     * may be reallocated and the hierarchy appears inconsistent.  It is
     * actually safe to run and won't cause any bugs, but will cause those
     * assertion failures.  It's only one deref & assign, so we might as well
     * put this in the critical region. */
2481     master_th->th.th_team        =   parent_team;
2482     master_th->th.th_team_nproc  =   parent_team->t.t_nproc;
2483     master_th->th.th_team_master =   parent_team->t.t_threads[0];
2484     master_th->th.th_team_serialized = parent_team->t.t_serialized;
2485 
2486     /* restore serialized team, if need be */
2487     if( parent_team->t.t_serialized &&
2488         parent_team != master_th->th.th_serial_team &&
2489         parent_team != root->r.r_root_team ) {
2490             __kmp_free_team( root, master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL) );
2491             master_th->th.th_serial_team = parent_team;
2492     }
2493 
2494     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
2495         if (master_th->th.th_task_state_top > 0) { // Restore task state from memo stack
2496             KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack);
2497             // Remember master's state if we re-use this nested hot team
2498             master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = master_th->th.th_task_state;
2499             --master_th->th.th_task_state_top; // pop
2500             // Now restore state at this level
2501             master_th->th.th_task_state = master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top];
2502         }
2503         // Copy the task team from the parent team to the master thread
2504         master_th->th.th_task_team = parent_team->t.t_task_team[master_th->th.th_task_state];
2505         KA_TRACE( 20, ( "__kmp_join_call: Master T#%d restoring task_team %p / team %p\n",
2506                         __kmp_gtid_from_thread( master_th ), master_th->th.th_task_team, parent_team ) );
2507     }
2508 
    // TODO: GEH - cannot do this assertion because root thread not set up as executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 );
    master_th->th.th_current_task->td_flags.executing = 1;
2512 
2513     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2514 
2515 #if OMPT_SUPPORT
2516     if (ompt_enabled) {
2517         __kmp_join_ompt(master_th, parent_team, parallel_id, fork_context);
2518     }
2519 #endif
2520 
2521     KMP_MB();
2522     KA_TRACE( 20, ("__kmp_join_call: exit T#%d\n", gtid ));
2523 }
2524 
2525 /* ------------------------------------------------------------------------ */
2526 /* ------------------------------------------------------------------------ */
2527 
2528 /* Check whether we should push an internal control record onto the
2529    serial team stack.  If so, do it.  */
2530 void
2531 __kmp_save_internal_controls ( kmp_info_t * thread )
2532 {
2533 
2534     if ( thread->th.th_team != thread->th.th_serial_team ) {
2535         return;
2536     }
2537     if (thread->th.th_team->t.t_serialized > 1) {
2538         int push = 0;
2539 
2540         if (thread->th.th_team->t.t_control_stack_top == NULL) {
2541             push = 1;
2542         } else {
2543             if ( thread->th.th_team->t.t_control_stack_top->serial_nesting_level !=
2544                  thread->th.th_team->t.t_serialized ) {
2545                 push = 1;
2546             }
2547         }
2548         if (push) {  /* push a record on the serial team's stack */
2549             kmp_internal_control_t * control = (kmp_internal_control_t *) __kmp_allocate(sizeof(kmp_internal_control_t));
2550 
2551             copy_icvs( control, & thread->th.th_current_task->td_icvs );
2552 
2553             control->serial_nesting_level = thread->th.th_team->t.t_serialized;
2554 
2555             control->next = thread->th.th_team->t.t_control_stack_top;
2556             thread->th.th_team->t.t_control_stack_top = control;
2557         }
2558     }
2559 }
2560 
2561 /* Changes set_nproc */
2562 void
2563 __kmp_set_num_threads( int new_nth, int gtid )
2564 {
2565     kmp_info_t *thread;
2566     kmp_root_t *root;
2567 
2568     KF_TRACE( 10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth ));
2569     KMP_DEBUG_ASSERT( __kmp_init_serial );
2570 
2571     if (new_nth < 1)
2572         new_nth = 1;
2573     else if (new_nth > __kmp_max_nth)
2574         new_nth = __kmp_max_nth;
2575 
2576     KMP_COUNT_VALUE(OMP_set_numthreads, new_nth);
2577     thread = __kmp_threads[gtid];
2578 
2579     __kmp_save_internal_controls( thread );
2580 
2581     set__nproc( thread, new_nth );
2582 
2583     //
2584     // If this omp_set_num_threads() call will cause the hot team size to be
2585     // reduced (in the absence of a num_threads clause), then reduce it now,
2586     // rather than waiting for the next parallel region.
2587     //
2588     root = thread->th.th_root;
2589     if ( __kmp_init_parallel && ( ! root->r.r_active )
2590       && ( root->r.r_hot_team->t.t_nproc > new_nth )
2591 #if KMP_NESTED_HOT_TEAMS
2592       && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode
2593 #endif
2594     ) {
2595         kmp_team_t *hot_team = root->r.r_hot_team;
2596         int f;
2597 
2598         __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
2599 
2600         // Release the extra threads we don't need any more.
2601         for ( f = new_nth;  f < hot_team->t.t_nproc; f++ ) {
2602             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2603             if ( __kmp_tasking_mode != tskm_immediate_exec) {
2604                 // When decreasing team size, threads no longer in the team should unref task team.
2605                 hot_team->t.t_threads[f]->th.th_task_team = NULL;
2606             }
2607             __kmp_free_thread( hot_team->t.t_threads[f] );
2608             hot_team->t.t_threads[f] =  NULL;
2609         }
2610         hot_team->t.t_nproc = new_nth;
2611 #if KMP_NESTED_HOT_TEAMS
2612         if( thread->th.th_hot_teams ) {
2613             KMP_DEBUG_ASSERT( hot_team == thread->th.th_hot_teams[0].hot_team );
2614             thread->th.th_hot_teams[0].hot_team_nth = new_nth;
2615         }
2616 #endif
2617 
2618         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
2619 
2620         //
2621         // Update the t_nproc field in the threads that are still active.
2622         //
2623         for( f=0 ; f < new_nth; f++ ) {
2624             KMP_DEBUG_ASSERT( hot_team->t.t_threads[f] != NULL );
2625             hot_team->t.t_threads[f]->th.th_team_nproc = new_nth;
2626         }
2627         // Special flag: the team size was changed by an omp_set_num_threads() call
2628         hot_team->t.t_size_changed = -1;
2629     }
2630 }
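
/* A shrink-path sketch (illustrative only, sizes hypothetical): suppose a prior region ran on
   the hot team with 8 threads and the application is now in serial code (root not active).

       #pragma omp parallel              // hot team previously grew to, say, 8 threads
       { }
       omp_set_num_threads(2);           // the loop above immediately frees hot-team workers 2..7
                                         // back to the thread pool instead of waiting for the next
                                         // fork, and t_size_changed is set to -1 as a marker
*/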
2631 
2632 /* Changes max_active_levels */
2633 void
2634 __kmp_set_max_active_levels( int gtid, int max_active_levels )
2635 {
2636     kmp_info_t *thread;
2637 
2638     KF_TRACE( 10, ( "__kmp_set_max_active_levels: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2639     KMP_DEBUG_ASSERT( __kmp_init_serial );
2640 
2641     // validate max_active_levels
2642     if( max_active_levels < 0 ) {
2643         KMP_WARNING( ActiveLevelsNegative, max_active_levels );
2644         // We ignore this call if the user has specified a negative value.
2645         // The current setting won't be changed. The last valid setting will be used.
2646         // A warning will be issued (if warnings are allowed as controlled by the KMP_WARNINGS env var).
2647         KF_TRACE( 10, ( "__kmp_set_max_active_levels: the call is ignored: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2648         return;
2649     }
2650     if( max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT ) {
2651         // it's OK, the max_active_levels is within the valid range: [ 0; KMP_MAX_ACTIVE_LEVELS_LIMIT ]
2652         // We allow a zero value. (implementation defined behavior)
2653     } else {
2654         KMP_WARNING( ActiveLevelsExceedLimit, max_active_levels, KMP_MAX_ACTIVE_LEVELS_LIMIT  );
2655         max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
2656         // Current upper limit is MAX_INT. (implementation defined behavior)
2657         // If the input exceeds the upper limit, we correct the input to be the upper limit. (implementation defined behavior)
2658         // Actually, the flow should never reach this point while the upper limit is MAX_INT.
2659     }
2660     KF_TRACE( 10, ( "__kmp_set_max_active_levels: after validation: new max_active_levels for thread %d = (%d)\n", gtid, max_active_levels ) );
2661 
2662     thread = __kmp_threads[ gtid ];
2663 
2664     __kmp_save_internal_controls( thread );
2665 
2666     set__max_active_levels( thread, max_active_levels );
2667 
2668 }
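
/* Validation behaviour above, illustrated (a sketch; the calls assume the standard OpenMP API
   routes omp_set_max_active_levels() to this function; N is a placeholder value):

       omp_set_max_active_levels(-3);    // warning; call ignored, the last valid setting is kept
       omp_set_max_active_levels(0);     // accepted; zero is allowed (implementation defined)
       omp_set_max_active_levels(N);     // for N > KMP_MAX_ACTIVE_LEVELS_LIMIT: warning, value
                                         // clamped to KMP_MAX_ACTIVE_LEVELS_LIMIT
*/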
2669 
2670 /* Gets max_active_levels */
2671 int
2672 __kmp_get_max_active_levels( int gtid )
2673 {
2674     kmp_info_t *thread;
2675 
2676     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d\n", gtid ) );
2677     KMP_DEBUG_ASSERT( __kmp_init_serial );
2678 
2679     thread = __kmp_threads[ gtid ];
2680     KMP_DEBUG_ASSERT( thread->th.th_current_task );
2681     KF_TRACE( 10, ( "__kmp_get_max_active_levels: thread %d, curtask=%p, curtask_maxaclevel=%d\n",
2682         gtid, thread->th.th_current_task, thread->th.th_current_task->td_icvs.max_active_levels ) );
2683     return thread->th.th_current_task->td_icvs.max_active_levels;
2684 }
2685 
2686 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */
2687 void
2688 __kmp_set_schedule( int gtid, kmp_sched_t kind, int chunk )
2689 {
2690     kmp_info_t *thread;
2691 //    kmp_team_t *team;
2692 
2693     KF_TRACE( 10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", gtid, (int)kind, chunk ));
2694     KMP_DEBUG_ASSERT( __kmp_init_serial );
2695 
2696     // Check if the kind parameter is valid, correct if needed.
2697     // Valid parameters should fit in one of two intervals - standard or extended:
2698     //       <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper>
2699     // 2008-01-25: 0,  1 - 4,       5,         100,     101 - 102, 103
2700     if ( kind <= kmp_sched_lower || kind >= kmp_sched_upper ||
2701        ( kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std ) )
2702     {
2703         // TODO: Hint needs attention in case we change the default schedule.
2704         __kmp_msg(
2705             kmp_ms_warning,
2706             KMP_MSG( ScheduleKindOutOfRange, kind ),
2707             KMP_HNT( DefaultScheduleKindUsed, "static, no chunk" ),
2708             __kmp_msg_null
2709         );
2710         kind = kmp_sched_default;
2711         chunk = 0;         // ignore chunk value in case of bad kind
2712     }
2713 
2714     thread = __kmp_threads[ gtid ];
2715 
2716     __kmp_save_internal_controls( thread );
2717 
2718     if ( kind < kmp_sched_upper_std ) {
2719         if ( kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK ) {
2720             // distinguish static chunked vs. unchunked:
2721             // an out-of-range chunk indicates the unchunked schedule (which is the default)
2722             thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static;
2723         } else {
2724             thread->th.th_current_task->td_icvs.sched.r_sched_type = __kmp_sch_map[ kind - kmp_sched_lower - 1 ];
2725         }
2726     } else {
2728         thread->th.th_current_task->td_icvs.sched.r_sched_type =
2729             __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - kmp_sched_lower - 2 ];
2730     }
2731     if ( kind == kmp_sched_auto ) {
2732         // ignore parameter chunk for schedule auto
2733         thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK;
2734     } else {
2735         thread->th.th_current_task->td_icvs.sched.chunk = chunk;
2736     }
2737 }
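
/* Mapping sketch for the code above (assuming omp_set_schedule() forwards its arguments here;
   c is a placeholder chunk value and the exact targets depend on __kmp_sch_map):

       omp_set_schedule(omp_sched_static,  c);  // if c < KMP_DEFAULT_CHUNK, the unchunked
                                                //   kmp_sch_static (the default schedule) is used
       omp_set_schedule(omp_sched_dynamic, c);  // looked up in __kmp_sch_map; c is stored as the chunk
       omp_set_schedule(omp_sched_auto,    c);  // kind mapped as above, but c is ignored and the
                                                //   chunk is reset to KMP_DEFAULT_CHUNK
*/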
2738 
2739 /* Gets def_sched_var ICV values */
2740 void
2741 __kmp_get_schedule( int gtid, kmp_sched_t * kind, int * chunk )
2742 {
2743     kmp_info_t     *thread;
2744     enum sched_type th_type;
2745 
2746     KF_TRACE( 10, ("__kmp_get_schedule: thread %d\n", gtid ));
2747     KMP_DEBUG_ASSERT( __kmp_init_serial );
2748 
2749     thread = __kmp_threads[ gtid ];
2750 
2751     th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type;
2752 
2753     switch ( th_type ) {
2754     case kmp_sch_static:
2755     case kmp_sch_static_greedy:
2756     case kmp_sch_static_balanced:
2757         *kind = kmp_sched_static;
2758         *chunk = 0;   // chunk was not set; zero is returned to indicate this
2759         return;
2760     case kmp_sch_static_chunked:
2761         *kind = kmp_sched_static;
2762         break;
2763     case kmp_sch_dynamic_chunked:
2764         *kind = kmp_sched_dynamic;
2765         break;
2766     case kmp_sch_guided_chunked:
2767     case kmp_sch_guided_iterative_chunked:
2768     case kmp_sch_guided_analytical_chunked:
2769         *kind = kmp_sched_guided;
2770         break;
2771     case kmp_sch_auto:
2772         *kind = kmp_sched_auto;
2773         break;
2774     case kmp_sch_trapezoidal:
2775         *kind = kmp_sched_trapezoidal;
2776         break;
2777 #if KMP_STATIC_STEAL_ENABLED
2778     case kmp_sch_static_steal:
2779         *kind = kmp_sched_static_steal;
2780         break;
2781 #endif
2782     default:
2783         KMP_FATAL( UnknownSchedulingType, th_type );
2784     }
2785 
2786     *chunk = thread->th.th_current_task->td_icvs.sched.chunk;
2787 }
2788 
2789 int
2790 __kmp_get_ancestor_thread_num( int gtid, int level ) {
2791 
2792     int ii, dd;
2793     kmp_team_t *team;
2794     kmp_info_t *thr;
2795 
2796     KF_TRACE( 10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level ));
2797     KMP_DEBUG_ASSERT( __kmp_init_serial );
2798 
2799     // validate level
2800     if( level == 0 ) return 0;
2801     if( level < 0 ) return -1;
2802     thr = __kmp_threads[ gtid ];
2803     team = thr->th.th_team;
2804     ii = team->t.t_level;
2805     if( level > ii ) return -1;
2806 
2807 #if OMP_40_ENABLED
2808     if( thr->th.th_teams_microtask ) {
2809         // AC: we are in a teams region where multiple nested teams have the same level
2810         int tlevel = thr->th.th_teams_level; // the level of the teams construct
2811         if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2812             KMP_DEBUG_ASSERT( ii >= tlevel );
2813             // AC: as we need to pass through the teams league, artificially increase ii
2814             if ( ii == tlevel ) {
2815                 ii += 2; // three teams have same level
2816             } else {
2817                 ii ++;   // two teams have same level
2818             }
2819         }
2820     }
2821 #endif
2822 
2823     if( ii == level ) return __kmp_tid_from_gtid( gtid );
2824 
2825     dd = team->t.t_serialized;
2826     level++;
2827     while( ii > level )
2828     {
2829         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2830         {
2831         }
2832         if( ( team->t.t_serialized ) && ( !dd ) ) {
2833             team = team->t.t_parent;
2834             continue;
2835         }
2836         if( ii > level ) {
2837             team = team->t.t_parent;
2838             dd = team->t.t_serialized;
2839             ii--;
2840         }
2841     }
2842 
2843     return ( dd > 1 ) ? ( 0 ) : ( team->t.t_master_tid );
2844 }
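
/* Worked summary of the walk above (a sketch; level is the ancestor level requested through
   omp_get_ancestor_thread_num()):

       level == 0                      -> 0   (the outermost level has a single thread, number 0)
       level <  0 or level > t_level   -> -1  (invalid level)
       level == current t_level        -> this thread's tid within its own team
       otherwise the loop climbs t_parent links, using t_serialized to skip levels that were
       serialized, and reports the master tid recorded in the team reached at that level
       (or 0 when that level sits inside more than one serialized region).
*/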
2845 
2846 int
2847 __kmp_get_team_size( int gtid, int level ) {
2848 
2849     int ii, dd;
2850     kmp_team_t *team;
2851     kmp_info_t *thr;
2852 
2853     KF_TRACE( 10, ("__kmp_get_team_size: thread %d %d\n", gtid, level ));
2854     KMP_DEBUG_ASSERT( __kmp_init_serial );
2855 
2856     // validate level
2857     if( level == 0 ) return 1;
2858     if( level < 0 ) return -1;
2859     thr = __kmp_threads[ gtid ];
2860     team = thr->th.th_team;
2861     ii = team->t.t_level;
2862     if( level > ii ) return -1;
2863 
2864 #if OMP_40_ENABLED
2865     if( thr->th.th_teams_microtask ) {
2866         // AC: we are in a teams region where multiple nested teams have the same level
2867         int tlevel = thr->th.th_teams_level; // the level of the teams construct
2868         if( level <= tlevel ) { // otherwise usual algorithm works (will not touch the teams)
2869             KMP_DEBUG_ASSERT( ii >= tlevel );
2870             // AC: as we need to pass through the teams league, artificially increase ii
2871             if ( ii == tlevel ) {
2872                 ii += 2; // three teams have same level
2873             } else {
2874                 ii ++;   // two teams have same level
2875             }
2876         }
2877     }
2878 #endif
2879 
2880     while( ii > level )
2881     {
2882         for( dd = team->t.t_serialized; ( dd > 0 ) && ( ii > level ); dd--, ii-- )
2883         {
2884         }
2885         if( team->t.t_serialized && ( !dd ) ) {
2886             team = team->t.t_parent;
2887             continue;
2888         }
2889         if( ii > level ) {
2890             team = team->t.t_parent;
2891             ii--;
2892         }
2893     }
2894 
2895     return team->t.t_nproc;
2896 }
2897 
2898 kmp_r_sched_t
2899 __kmp_get_schedule_global() {
2900 // This routine was created because the pairs (__kmp_sched, __kmp_chunk) and (__kmp_static, __kmp_guided)
2901 // may be changed by kmp_set_defaults independently, so the updated schedule can be obtained here.
2902 
2903     kmp_r_sched_t r_sched;
2904 
2905     // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, __kmp_guided
2906     // __kmp_sched should keep original value, so that user can set KMP_SCHEDULE multiple times,
2907     // and thus have different run-time schedules in different roots (even in OMP 2.5)
2908     if ( __kmp_sched == kmp_sch_static ) {
2909         r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed schedule (balanced or greedy)
2910     } else if ( __kmp_sched == kmp_sch_guided_chunked ) {
2911         r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed schedule (iterative or analytical)
2912     } else {
2913         r_sched.r_sched_type = __kmp_sched;  // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other
2914     }
2915 
2916     if ( __kmp_chunk < KMP_DEFAULT_CHUNK ) { // __kmp_chunk may be wrong here (if it was never set)
2917         r_sched.chunk = KMP_DEFAULT_CHUNK;
2918     } else {
2919         r_sched.chunk = __kmp_chunk;
2920     }
2921 
2922     return r_sched;
2923 }
2924 
2925 /* ------------------------------------------------------------------------ */
2926 /* ------------------------------------------------------------------------ */
2927 
2928 
2929 /*
2930  * Allocate (realloc == FALSE) or reallocate (realloc == TRUE)
2931  * at least argc entries in the t_argv array for the requested team.
2932  */
2933 static void
2934 __kmp_alloc_argv_entries( int argc, kmp_team_t *team, int realloc )
2935 {
2936 
2937     KMP_DEBUG_ASSERT( team );
2938     if( !realloc || argc > team->t.t_max_argc ) {
2939 
2940         KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: needed entries=%d, current entries=%d\n",
2941                          team->t.t_id, argc, ( realloc ) ? team->t.t_max_argc : 0 ));
2942         /* if heap space was previously allocated for args, free it */
2943         if ( realloc && team->t.t_argv != &team->t.t_inline_argv[0] )
2944             __kmp_free( (void *) team->t.t_argv );
2945 
2946         if ( argc <= KMP_INLINE_ARGV_ENTRIES ) {
2947             /* use unused space in the cache line for arguments */
2948             team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES;
2949             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: inline allocate %d argv entries\n",
2950                              team->t.t_id, team->t.t_max_argc ));
2951             team->t.t_argv = &team->t.t_inline_argv[0];
2952             if ( __kmp_storage_map ) {
2953                 __kmp_print_storage_map_gtid( -1, &team->t.t_inline_argv[0],
2954                                          &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES],
2955                                          (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES),
2956                                          "team_%d.t_inline_argv",
2957                                          team->t.t_id );
2958             }
2959         } else {
2960             /* allocate space for arguments in the heap */
2961             team->t.t_max_argc = ( argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1 )) ?
2962                                      KMP_MIN_MALLOC_ARGV_ENTRIES : 2 * argc;
2963             KA_TRACE( 100, ( "__kmp_alloc_argv_entries: team %d: dynamic allocate %d argv entries\n",
2964                              team->t.t_id, team->t.t_max_argc ));
2965             team->t.t_argv     = (void**) __kmp_page_allocate( sizeof(void*) * team->t.t_max_argc );
2966             if ( __kmp_storage_map ) {
2967                 __kmp_print_storage_map_gtid( -1, &team->t.t_argv[0], &team->t.t_argv[team->t.t_max_argc],
2968                                          sizeof(void *) * team->t.t_max_argc, "team_%d.t_argv",
2969                                          team->t.t_id );
2970             }
2971         }
2972     }
2973 }
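
/* The sizing rule above, restated (a sketch; the numeric values are hypothetical):

       argc <= KMP_INLINE_ARGV_ENTRIES  -> reuse t_inline_argv inside the team structure
       otherwise                        -> t_max_argc = (argc <= KMP_MIN_MALLOC_ARGV_ENTRIES / 2)
                                                            ? KMP_MIN_MALLOC_ARGV_ENTRIES
                                                            : 2 * argc;   // page-allocated

   For example, if KMP_MIN_MALLOC_ARGV_ENTRIES were 100, argc == 30 would allocate 100 entries
   and argc == 130 would allocate 260, leaving headroom so later regions rarely reallocate.
*/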
2974 
2975 static void
2976 __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth)
2977 {
2978     int i;
2979     int num_disp_buff = max_nth > 1 ? __kmp_dispatch_num_buffers : 2;
2980     team->t.t_threads = (kmp_info_t**) __kmp_allocate( sizeof(kmp_info_t*) * max_nth );
2981     team->t.t_disp_buffer = (dispatch_shared_info_t*)
2982         __kmp_allocate( sizeof(dispatch_shared_info_t) * num_disp_buff );
2983     team->t.t_dispatch = (kmp_disp_t*) __kmp_allocate( sizeof(kmp_disp_t) * max_nth );
2984     team->t.t_implicit_task_taskdata = (kmp_taskdata_t*) __kmp_allocate( sizeof(kmp_taskdata_t) * max_nth );
2985     team->t.t_max_nproc = max_nth;
2986 
2987     /* setup dispatch buffers */
2988     for(i = 0 ; i < num_disp_buff; ++i) {
2989         team->t.t_disp_buffer[i].buffer_index = i;
2990 #if OMP_45_ENABLED
2991         team->t.t_disp_buffer[i].doacross_buf_idx = i;
2992 #endif
2993     }
2994 }
2995 
2996 static void
2997 __kmp_free_team_arrays(kmp_team_t *team) {
2998     /* Note: this does not free the threads in t_threads (__kmp_free_threads) */
2999     int i;
3000     for ( i = 0; i < team->t.t_max_nproc; ++ i ) {
3001         if ( team->t.t_dispatch[ i ].th_disp_buffer != NULL ) {
3002             __kmp_free( team->t.t_dispatch[ i ].th_disp_buffer );
3003             team->t.t_dispatch[ i ].th_disp_buffer = NULL;
3004         }; // if
3005     }; // for
3006     __kmp_free(team->t.t_threads);
3007     __kmp_free(team->t.t_disp_buffer);
3008     __kmp_free(team->t.t_dispatch);
3009     __kmp_free(team->t.t_implicit_task_taskdata);
3010     team->t.t_threads     = NULL;
3011     team->t.t_disp_buffer = NULL;
3012     team->t.t_dispatch    = NULL;
3013     team->t.t_implicit_task_taskdata = NULL;
3014 }
3015 
3016 static void
3017 __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) {
3018     kmp_info_t **oldThreads = team->t.t_threads;
3019 
3020     __kmp_free(team->t.t_disp_buffer);
3021     __kmp_free(team->t.t_dispatch);
3022     __kmp_free(team->t.t_implicit_task_taskdata);
3023     __kmp_allocate_team_arrays(team, max_nth);
3024 
3025     KMP_MEMCPY(team->t.t_threads, oldThreads, team->t.t_nproc * sizeof (kmp_info_t*));
3026 
3027     __kmp_free(oldThreads);
3028 }
3029 
3030 static kmp_internal_control_t
3031 __kmp_get_global_icvs( void ) {
3032 
3033     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3034 
3035 #if OMP_40_ENABLED
3036     KMP_DEBUG_ASSERT( __kmp_nested_proc_bind.used > 0 );
3037 #endif /* OMP_40_ENABLED */
3038 
3039     kmp_internal_control_t g_icvs = {
3040       0,                            //int serial_nesting_level; //corresponds to the value of the th_team_serialized field
3041       (kmp_int8)__kmp_dflt_nested,            //int nested;               //internal control for nested parallelism (per thread)
3042       (kmp_int8)__kmp_global.g.g_dynamic,                                 //internal control for dynamic adjustment of threads (per thread)
3043       (kmp_int8)__kmp_env_blocktime,          //int bt_set;               //internal control for whether blocktime is explicitly set
3044       __kmp_dflt_blocktime,         //int blocktime;            //internal control for blocktime
3045 #if KMP_USE_MONITOR
3046       __kmp_bt_intervals,           //int bt_intervals;         //internal control for blocktime intervals
3047 #endif
3048       __kmp_dflt_team_nth,          //int nproc;                //internal control for # of threads for next parallel region (per thread)
3049                                     // (use a max ub on value if __kmp_parallel_initialize not called yet)
3050       __kmp_dflt_max_active_levels, //int max_active_levels;    //internal control for max_active_levels
3051       r_sched,                      //kmp_r_sched_t sched;      //internal control for runtime schedule {sched,chunk} pair
3052 #if OMP_40_ENABLED
3053       __kmp_nested_proc_bind.bind_types[0],
3054       __kmp_default_device,
3055 #endif /* OMP_40_ENABLED */
3056       NULL                          //struct kmp_internal_control *next;
3057     };
3058 
3059     return g_icvs;
3060 }
3061 
3062 static kmp_internal_control_t
3063 __kmp_get_x_global_icvs( const kmp_team_t *team ) {
3064 
3065     kmp_internal_control_t gx_icvs;
3066     gx_icvs.serial_nesting_level = 0; // probably = team->t.t_serialized, as in __kmp_save_internal_controls
3067     copy_icvs( & gx_icvs, & team->t.t_threads[0]->th.th_current_task->td_icvs );
3068     gx_icvs.next = NULL;
3069 
3070     return gx_icvs;
3071 }
3072 
3073 static void
3074 __kmp_initialize_root( kmp_root_t *root )
3075 {
3076     int           f;
3077     kmp_team_t   *root_team;
3078     kmp_team_t   *hot_team;
3079     int           hot_team_max_nth;
3080     kmp_r_sched_t r_sched = __kmp_get_schedule_global(); // get current state of scheduling globals
3081     kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3082     KMP_DEBUG_ASSERT( root );
3083     KMP_ASSERT( ! root->r.r_begin );
3084 
3085     /* setup the root state structure */
3086     __kmp_init_lock( &root->r.r_begin_lock );
3087     root->r.r_begin        = FALSE;
3088     root->r.r_active       = FALSE;
3089     root->r.r_in_parallel  = 0;
3090     root->r.r_blocktime    = __kmp_dflt_blocktime;
3091     root->r.r_nested       = __kmp_dflt_nested;
3092 
3093     /* setup the root team for this task */
3094     /* allocate the root team structure */
3095     KF_TRACE( 10, ( "__kmp_initialize_root: before root_team\n" ) );
3096 
3097     root_team =
3098         __kmp_allocate_team(
3099             root,
3100             1,                                                         // new_nproc
3101             1,                                                         // max_nproc
3102 #if OMPT_SUPPORT
3103             0, // root parallel id
3104 #endif
3105 #if OMP_40_ENABLED
3106             __kmp_nested_proc_bind.bind_types[0],
3107 #endif
3108             &r_icvs,
3109             0                                                          // argc
3110             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3111         );
3112 #if USE_DEBUGGER
3113     // Non-NULL value should be assigned to make the debugger display the root team.
3114     TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)( ~ 0 ));
3115 #endif
3116 
3117     KF_TRACE( 10, ( "__kmp_initialize_root: after root_team = %p\n", root_team ) );
3118 
3119     root->r.r_root_team = root_team;
3120     root_team->t.t_control_stack_top = NULL;
3121 
3122     /* initialize root team */
3123     root_team->t.t_threads[0] = NULL;
3124     root_team->t.t_nproc      = 1;
3125     root_team->t.t_serialized = 1;
3126     // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3127     root_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3128     root_team->t.t_sched.chunk        = r_sched.chunk;
3129     KA_TRACE( 20, ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n",
3130                     root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
3131 
3132     /* setup the  hot team for this task */
3133     /* allocate the hot team structure */
3134     KF_TRACE( 10, ( "__kmp_initialize_root: before hot_team\n" ) );
3135 
3136     hot_team =
3137         __kmp_allocate_team(
3138             root,
3139             1,                                                         // new_nproc
3140             __kmp_dflt_team_nth_ub * 2,                                // max_nproc
3141 #if OMPT_SUPPORT
3142             0, // root parallel id
3143 #endif
3144 #if OMP_40_ENABLED
3145             __kmp_nested_proc_bind.bind_types[0],
3146 #endif
3147             &r_icvs,
3148             0                                                          // argc
3149             USE_NESTED_HOT_ARG(NULL)                                   // master thread is unknown
3150         );
3151     KF_TRACE( 10, ( "__kmp_initialize_root: after hot_team = %p\n", hot_team ) );
3152 
3153     root->r.r_hot_team = hot_team;
3154     root_team->t.t_control_stack_top = NULL;
3155 
3156     /* first-time initialization */
3157     hot_team->t.t_parent = root_team;
3158 
3159     /* initialize hot team */
3160     hot_team_max_nth = hot_team->t.t_max_nproc;
3161     for ( f = 0; f < hot_team_max_nth; ++ f ) {
3162         hot_team->t.t_threads[ f ] = NULL;
3163     }; // for
3164     hot_team->t.t_nproc = 1;
3165     // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels;
3166     hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type;
3167     hot_team->t.t_sched.chunk        = r_sched.chunk;
3168     hot_team->t.t_size_changed = 0;
3169 }
3170 
3171 #ifdef KMP_DEBUG
3172 
3173 
3174 typedef struct kmp_team_list_item {
3175     kmp_team_p const *           entry;
3176     struct kmp_team_list_item *  next;
3177 } kmp_team_list_item_t;
3178 typedef kmp_team_list_item_t * kmp_team_list_t;
3179 
3180 
3181 static void
3182 __kmp_print_structure_team_accum(    // Add team to list of teams.
3183     kmp_team_list_t     list,        // List of teams.
3184     kmp_team_p const *  team         // Team to add.
3185 ) {
3186 
3187     // List must terminate with item where both entry and next are NULL.
3188     // Team is added to the list only once.
3189     // List is sorted in ascending order by team id.
3190     // Team id is *not* a key.
3191 
3192     kmp_team_list_t l;
3193 
3194     KMP_DEBUG_ASSERT( list != NULL );
3195     if ( team == NULL ) {
3196         return;
3197     }; // if
3198 
3199     __kmp_print_structure_team_accum( list, team->t.t_parent );
3200     __kmp_print_structure_team_accum( list, team->t.t_next_pool );
3201 
3202     // Search list for the team.
3203     l = list;
3204     while ( l->next != NULL && l->entry != team ) {
3205         l = l->next;
3206     }; // while
3207     if ( l->next != NULL ) {
3208         return;  // Team has been added before, exit.
3209     }; // if
3210 
3211     // Team is not found. Search list again for insertion point.
3212     l = list;
3213     while ( l->next != NULL && l->entry->t.t_id <= team->t.t_id ) {
3214         l = l->next;
3215     }; // while
3216 
3217     // Insert team.
3218     {
3219         kmp_team_list_item_t * item =
3220             (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof(  kmp_team_list_item_t ) );
3221         * item = * l;
3222         l->entry = team;
3223         l->next  = item;
3224     }
3225 
3226 }
3227 
3228 static void
3229 __kmp_print_structure_team(
3230     char const *       title,
3231     kmp_team_p const * team
3232 
3233 ) {
3234     __kmp_printf( "%s", title );
3235     if ( team != NULL ) {
3236         __kmp_printf( "%2x %p\n", team->t.t_id, team );
3237     } else {
3238         __kmp_printf( " - (nil)\n" );
3239     }; // if
3240 }
3241 
3242 static void
3243 __kmp_print_structure_thread(
3244     char const *       title,
3245     kmp_info_p const * thread
3246 
3247 ) {
3248     __kmp_printf( "%s", title );
3249     if ( thread != NULL ) {
3250         __kmp_printf( "%2d %p\n", thread->th.th_info.ds.ds_gtid, thread );
3251     } else {
3252         __kmp_printf( " - (nil)\n" );
3253     }; // if
3254 }
3255 
3256 void
3257 __kmp_print_structure(
3258     void
3259 ) {
3260 
3261     kmp_team_list_t list;
3262 
3263     // Initialize list of teams.
3264     list = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( sizeof( kmp_team_list_item_t ) );
3265     list->entry = NULL;
3266     list->next  = NULL;
3267 
3268     __kmp_printf( "\n------------------------------\nGlobal Thread Table\n------------------------------\n" );
3269     {
3270         int gtid;
3271         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3272             __kmp_printf( "%2d", gtid );
3273             if ( __kmp_threads != NULL ) {
3274                 __kmp_printf( " %p", __kmp_threads[ gtid ] );
3275             }; // if
3276             if ( __kmp_root != NULL ) {
3277                 __kmp_printf( " %p", __kmp_root[ gtid ] );
3278             }; // if
3279             __kmp_printf( "\n" );
3280         }; // for gtid
3281     }
3282 
3283     // Print out __kmp_threads array.
3284     __kmp_printf( "\n------------------------------\nThreads\n------------------------------\n" );
3285     if ( __kmp_threads != NULL ) {
3286         int gtid;
3287         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3288             kmp_info_t const * thread = __kmp_threads[ gtid ];
3289             if ( thread != NULL ) {
3290                 __kmp_printf( "GTID %2d %p:\n", gtid, thread );
3291                 __kmp_printf(                 "    Our Root:        %p\n", thread->th.th_root );
3292                 __kmp_print_structure_team(   "    Our Team:     ",        thread->th.th_team );
3293                 __kmp_print_structure_team(   "    Serial Team:  ",        thread->th.th_serial_team );
3294                 __kmp_printf(                 "    Threads:      %2d\n",   thread->th.th_team_nproc );
3295                 __kmp_print_structure_thread( "    Master:       ",        thread->th.th_team_master );
3296                 __kmp_printf(                 "    Serialized?:  %2d\n",   thread->th.th_team_serialized );
3297                 __kmp_printf(                 "    Set NProc:    %2d\n",   thread->th.th_set_nproc );
3298 #if OMP_40_ENABLED
3299                 __kmp_printf(                 "    Set Proc Bind: %2d\n",  thread->th.th_set_proc_bind );
3300 #endif
3301                 __kmp_print_structure_thread( "    Next in pool: ",        thread->th.th_next_pool );
3302                 __kmp_printf( "\n" );
3303                 __kmp_print_structure_team_accum( list, thread->th.th_team );
3304                 __kmp_print_structure_team_accum( list, thread->th.th_serial_team );
3305             }; // if
3306         }; // for gtid
3307     } else {
3308         __kmp_printf( "Threads array is not allocated.\n" );
3309     }; // if
3310 
3311     // Print out __kmp_root array.
3312     __kmp_printf( "\n------------------------------\nUbers\n------------------------------\n" );
3313     if ( __kmp_root != NULL ) {
3314         int gtid;
3315         for ( gtid = 0; gtid < __kmp_threads_capacity; ++ gtid ) {
3316             kmp_root_t const * root = __kmp_root[ gtid ];
3317             if ( root != NULL ) {
3318                 __kmp_printf( "GTID %2d %p:\n", gtid, root );
3319                 __kmp_print_structure_team(   "    Root Team:    ",      root->r.r_root_team );
3320                 __kmp_print_structure_team(   "    Hot Team:     ",      root->r.r_hot_team );
3321                 __kmp_print_structure_thread( "    Uber Thread:  ",      root->r.r_uber_thread );
3322                 __kmp_printf(                 "    Active?:      %2d\n", root->r.r_active );
3323                 __kmp_printf(                 "    Nested?:      %2d\n", root->r.r_nested );
3324                 __kmp_printf(                 "    In Parallel:  %2d\n", root->r.r_in_parallel );
3325                 __kmp_printf( "\n" );
3326                 __kmp_print_structure_team_accum( list, root->r.r_root_team );
3327                 __kmp_print_structure_team_accum( list, root->r.r_hot_team );
3328             }; // if
3329         }; // for gtid
3330     } else {
3331         __kmp_printf( "Ubers array is not allocated.\n" );
3332     }; // if
3333 
3334     __kmp_printf( "\n------------------------------\nTeams\n------------------------------\n" );
3335     while ( list->next != NULL ) {
3336         kmp_team_p const * team = list->entry;
3337         int i;
3338         __kmp_printf( "Team %2x %p:\n", team->t.t_id, team );
3339         __kmp_print_structure_team( "    Parent Team:      ",      team->t.t_parent );
3340         __kmp_printf(               "    Master TID:       %2d\n", team->t.t_master_tid );
3341         __kmp_printf(               "    Max threads:      %2d\n", team->t.t_max_nproc );
3342         __kmp_printf(               "    Levels of serial: %2d\n", team->t.t_serialized );
3343         __kmp_printf(               "    Number threads:   %2d\n", team->t.t_nproc );
3344         for ( i = 0; i < team->t.t_nproc; ++ i ) {
3345             __kmp_printf(           "    Thread %2d:      ", i );
3346             __kmp_print_structure_thread( "", team->t.t_threads[ i ] );
3347         }; // for i
3348         __kmp_print_structure_team( "    Next in pool:     ",      team->t.t_next_pool );
3349         __kmp_printf( "\n" );
3350         list = list->next;
3351     }; // while
3352 
3353     // Print out __kmp_thread_pool and __kmp_team_pool.
3354     __kmp_printf( "\n------------------------------\nPools\n------------------------------\n" );
3355     __kmp_print_structure_thread(   "Thread pool:          ", (kmp_info_t *)__kmp_thread_pool );
3356     __kmp_print_structure_team(     "Team pool:            ", (kmp_team_t *)__kmp_team_pool );
3357     __kmp_printf( "\n" );
3358 
3359     // Free team list.
3360     while ( list != NULL ) {
3361         kmp_team_list_item_t * item = list;
3362         list = list->next;
3363         KMP_INTERNAL_FREE( item );
3364     }; // while
3365 
3366 }
3367 
3368 #endif
3369 
3370 
3371 //---------------------------------------------------------------------------
3372 //  Stuff for per-thread fast random number generator
3373 //  Table of primes
3374 
3375 static const unsigned __kmp_primes[] = {
3376   0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5,
3377   0xba5703f5, 0xb495a877, 0xe1626741, 0x79695e6b,
3378   0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231,
3379   0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b,
3380   0xbe4d6fe9, 0x5f15e201, 0x99afc3fd, 0xf3f16801,
3381   0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3,
3382   0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed,
3383   0x085a3d61, 0x46eb5ea7, 0x3d9910ed, 0x2e687b5b,
3384   0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9,
3385   0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7,
3386   0x54581edb, 0xf2480f45, 0x0bb9288f, 0xef1affc7,
3387   0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7,
3388   0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b,
3389   0xfc411073, 0xc3749363, 0xb892d829, 0x3549366b,
3390   0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3,
3391   0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f
3392 };
3393 
3394 //---------------------------------------------------------------------------
3395 //  __kmp_get_random: Get a random number using a linear congruential method.
3396 
3397 unsigned short
3398 __kmp_get_random( kmp_info_t * thread )
3399 {
3400   unsigned x = thread->th.th_x;
3401   unsigned short r = x>>16;
3402 
3403   thread->th.th_x = x*thread->th.th_a+1;
3404 
3405   KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n",
3406          thread->th.th_info.ds.ds_tid, r) );
3407 
3408   return r;
3409 }
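
/* The generator above is a linear congruential recurrence x' = a * x + 1 (mod 2^32) that hands
   back only the top 16 bits, since the low-order bits of such an LCG have short periods.  A
   single step, spelled out with a multiplier taken from the primes table (the state value is
   hypothetical):

       unsigned a = 0x9e3779b1;         // per-thread multiplier, th_a, drawn from __kmp_primes
       unsigned x = 12345u;             // current per-thread state, th_x
       unsigned short r = x >> 16;      // the value returned to the caller
       x = x * a + 1u;                  // new state written back to th_x (wraps mod 2^32)
*/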
3410 //--------------------------------------------------------
3411 // __kmp_init_random: Initialize a random number generator
3412 
3413 void
3414 __kmp_init_random( kmp_info_t * thread )
3415 {
3416   unsigned seed = thread->th.th_info.ds.ds_tid;
3417 
3418   thread->th.th_a = __kmp_primes[seed%(sizeof(__kmp_primes)/sizeof(__kmp_primes[0]))];
3419   thread->th.th_x = (seed+1)*thread->th.th_a+1;
3420   KA_TRACE(30, ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a) );
3421 }
3422 
3423 
3424 #if KMP_OS_WINDOWS
3425 /* reclaim array entries for root threads that are already dead, returns number reclaimed */
3426 static int
3427 __kmp_reclaim_dead_roots(void) {
3428     int i, r = 0;
3429 
3430     for(i = 0; i < __kmp_threads_capacity; ++i) {
3431         if( KMP_UBER_GTID( i ) &&
3432           !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) &&
3433           !__kmp_root[i]->r.r_active ) { // AC: reclaim only roots died in non-active state
3434             r += __kmp_unregister_root_other_thread(i);
3435         }
3436     }
3437     return r;
3438 }
3439 #endif
3440 
3441 /*
3442    This function attempts to create free entries in __kmp_threads and __kmp_root, and returns the number of
3443    free entries generated.
3444 
3445    For Windows* OS static library, the first mechanism used is to reclaim array entries for root threads that are
3446    already dead.
3447 
3448    On all platforms, expansion is attempted on the arrays __kmp_threads and __kmp_root, with appropriate
3449    update to __kmp_threads_capacity.  Array capacity is increased by doubling with clipping to
3450     __kmp_tp_capacity, if threadprivate cache array has been created.
3451    Synchronization with __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock.
3452 
3453    After any dead root reclamation, if the clipping value allows array expansion to result in the generation
3454    of a total of nWish free slots, the function does that expansion.  If not, but the clipping value allows
3455    array expansion to result in the generation of a total of nNeed free slots, the function does that expansion.
3456    Otherwise, nothing is done beyond the possible initial root thread reclamation.  However, if nNeed is zero,
3457    a best-effort attempt is made to fulfil nWish as far as possible, i.e. the function will attempt to create
3458    as many free slots as possible up to nWish.
3459 
3460    If any argument is negative, the behavior is undefined.
3461 */
3462 static int
3463 __kmp_expand_threads(int nWish, int nNeed) {
3464     int added = 0;
3465     int old_tp_cached;
3466     int __kmp_actual_max_nth;
3467 
3468     if(nNeed > nWish) /* normalize the arguments */
3469         nWish = nNeed;
3470 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB
3471 /* only for Windows static library */
3472     /* reclaim array entries for root threads that are already dead */
3473     added = __kmp_reclaim_dead_roots();
3474 
3475     if(nNeed) {
3476         nNeed -= added;
3477         if(nNeed < 0)
3478             nNeed = 0;
3479     }
3480     if(nWish) {
3481         nWish -= added;
3482         if(nWish < 0)
3483             nWish = 0;
3484     }
3485 #endif
3486     if(nWish <= 0)
3487         return added;
3488 
3489     while(1) {
3490         int nTarget;
3491         int minimumRequiredCapacity;
3492         int newCapacity;
3493         kmp_info_t **newThreads;
3494         kmp_root_t **newRoot;
3495 
3496         //
3497         // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth.
3498         // If __kmp_max_nth is set to some value less than __kmp_sys_max_nth
3499         // by the user via OMP_THREAD_LIMIT, then __kmp_threads_capacity may
3500         // become > __kmp_max_nth in one of two ways:
3501         //
3502         // 1) The initialization thread (gtid = 0) exits.  __kmp_threads[0]
3503         //    may not be reused by another thread, so we may need to increase
3504         //    __kmp_threads_capacity to __kmp_max_threads + 1.
3505         //
3506         // 2) New foreign root(s) are encountered.  We always register new
3507         //    foreign roots.  This may cause a smaller # of threads to be
3508         //    allocated at subsequent parallel regions, but the worker threads
3509         //    hang around (and eventually go to sleep) and need slots in the
3510         //    __kmp_threads[] array.
3511         //
3512         // Anyway, that is the reason for moving the check to see if
3513         // __kmp_max_threads was exceeded into __kmp_reserve_threads()
3514         // instead of having it performed here. -BB
3515         //
3516         old_tp_cached = __kmp_tp_cached;
3517         __kmp_actual_max_nth = old_tp_cached ? __kmp_tp_capacity : __kmp_sys_max_nth;
3518         KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity);
3519 
3520         /* compute expansion headroom to check if we can expand and whether to aim for nWish or nNeed */
3521         nTarget = nWish;
3522         if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3523             /* can't fulfil nWish, so try nNeed */
3524             if(nNeed) {
3525                 nTarget = nNeed;
3526                 if(__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) {
3527                     /* possible expansion too small -- give up */
3528                     break;
3529                 }
3530             } else {
3531                 /* best-effort */
3532                 nTarget = __kmp_actual_max_nth - __kmp_threads_capacity;
3533                 if(!nTarget) {
3534                     /* can't expand at all -- give up */
3535                     break;
3536                 }
3537             }
3538         }
3539         minimumRequiredCapacity = __kmp_threads_capacity + nTarget;
3540 
3541         newCapacity = __kmp_threads_capacity;
3542         do{
3543             newCapacity =
3544                 newCapacity <= (__kmp_actual_max_nth >> 1) ?
3545                 (newCapacity << 1) :
3546                 __kmp_actual_max_nth;
3547         } while(newCapacity < minimumRequiredCapacity);
3548         newThreads = (kmp_info_t**) __kmp_allocate((sizeof(kmp_info_t*) + sizeof(kmp_root_t*)) * newCapacity + CACHE_LINE);
3549         newRoot = (kmp_root_t**) ((char*)newThreads + sizeof(kmp_info_t*) * newCapacity );
3550         KMP_MEMCPY(newThreads, __kmp_threads, __kmp_threads_capacity * sizeof(kmp_info_t*));
3551         KMP_MEMCPY(newRoot, __kmp_root, __kmp_threads_capacity * sizeof(kmp_root_t*));
3552         memset(newThreads + __kmp_threads_capacity, 0,
3553                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t*));
3554         memset(newRoot + __kmp_threads_capacity, 0,
3555                (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t*));
3556 
3557         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3558             /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has allocated a threadprivate cache
3559                while we were allocating the expanded array, and our new capacity is larger than the threadprivate
3560                cache capacity, so we should deallocate the expanded arrays and try again.  This is the first check
3561                of a double-check pair.
3562             */
3563             __kmp_free(newThreads);
3564             continue; /* start over and try again */
3565         }
3566         __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3567         if(!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3568             /* Same check as above, but this time with the lock so we can be sure if we can succeed. */
3569             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3570             __kmp_free(newThreads);
3571             continue; /* start over and try again */
3572         } else {
3573             /* success */
3574             // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be investigated.
3575             //
3576             *(kmp_info_t**volatile*)&__kmp_threads = newThreads;
3577             *(kmp_root_t**volatile*)&__kmp_root = newRoot;
3578             added += newCapacity - __kmp_threads_capacity;
3579             *(volatile int*)&__kmp_threads_capacity = newCapacity;
3580             __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3581             break; /* succeeded, so we can exit the loop */
3582         }
3583     }
3584     return added;
3585 }
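
/* Growth sketch for the doubling loop above (numbers hypothetical): with
   __kmp_threads_capacity == 64 and nTarget == 70, minimumRequiredCapacity is 134, so
   newCapacity doubles 64 -> 128 -> 256 (clipped to __kmp_actual_max_nth if a doubling would
   overshoot it).  Both the threads and root arrays are then copied into one fresh allocation,
   the extra slots are zeroed, and the published capacity becomes 256.
*/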
3586 
3587 /* register the current thread as a root thread and obtain our gtid */
3588 /* we must have the __kmp_initz_lock held at this point */
3589 /* The argument is TRUE only if we are the thread calling from __kmp_do_serial_initialize() */
3590 int
3591 __kmp_register_root( int initial_thread )
3592 {
3593     kmp_info_t *root_thread;
3594     kmp_root_t *root;
3595     int         gtid;
3596     int         capacity;
3597     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3598     KA_TRACE( 20, ("__kmp_register_root: entered\n"));
3599     KMP_MB();
3600 
3601 
3602     /*
3603         2007-03-02:
3604 
3605         If the initial thread has not yet invoked the OpenMP RTL and this thread is not an
3606         initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition does not work as
3607         expected -- it may return false (i.e. there is at least one empty slot in the __kmp_threads
3608         array), but it is possible that the only free slot is #0, which is reserved for the initial
3609         thread and so cannot be used for this one. The following code works around this bug.
3610 
3611         However, the right solution seems to be not to reserve slot #0 for the initial thread, because:
3612             (1) there is no magic in slot #0,
3613             (2) we cannot detect the initial thread reliably (the first thread which performs serial
3614                 initialization may not be a real initial thread).
3615     */
3616     capacity = __kmp_threads_capacity;
3617     if ( ! initial_thread && TCR_PTR(__kmp_threads[0]) == NULL ) {
3618         -- capacity;
3619     }; // if
3620 
3621     /* see if there are too many threads */
3622     if ( __kmp_all_nth >= capacity && !__kmp_expand_threads( 1, 1 ) ) {
3623         if ( __kmp_tp_cached ) {
3624             __kmp_msg(
3625                 kmp_ms_fatal,
3626                 KMP_MSG( CantRegisterNewThread ),
3627                 KMP_HNT( Set_ALL_THREADPRIVATE, __kmp_tp_capacity ),
3628                 KMP_HNT( PossibleSystemLimitOnThreads ),
3629                 __kmp_msg_null
3630             );
3631         }
3632         else {
3633             __kmp_msg(
3634                 kmp_ms_fatal,
3635                 KMP_MSG( CantRegisterNewThread ),
3636                 KMP_HNT( SystemLimitOnThreads ),
3637                 __kmp_msg_null
3638             );
3639         }
3640     }; // if
3641 
3642     /* find an available thread slot */
3643     /* Don't reassign the zero slot since we need that to only be used by initial
3644        thread */
3645     for( gtid=(initial_thread ? 0 : 1) ; TCR_PTR(__kmp_threads[gtid]) != NULL ; gtid++ )
3646         ;
3647     KA_TRACE( 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid ));
3648     KMP_ASSERT( gtid < __kmp_threads_capacity );
3649 
3650     /* update global accounting */
3651     __kmp_all_nth ++;
3652     TCW_4(__kmp_nth, __kmp_nth + 1);
3653 
3654     //
3655     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
3656     // for low numbers of procs, and method #2 (keyed API call) for higher
3657     // numbers of procs.
3658     //
3659     if ( __kmp_adjust_gtid_mode ) {
3660         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
3661             if ( TCR_4(__kmp_gtid_mode) != 2) {
3662                 TCW_4(__kmp_gtid_mode, 2);
3663             }
3664         }
3665         else {
3666             if (TCR_4(__kmp_gtid_mode) != 1 ) {
3667                 TCW_4(__kmp_gtid_mode, 1);
3668             }
3669         }
3670     }
3671 
3672 #ifdef KMP_ADJUST_BLOCKTIME
3673     /* Adjust blocktime to zero if necessary            */
3674     /* Middle initialization might not have occurred yet */
3675     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
3676         if ( __kmp_nth > __kmp_avail_proc ) {
3677             __kmp_zero_bt = TRUE;
3678         }
3679     }
3680 #endif /* KMP_ADJUST_BLOCKTIME */
3681 
3682     /* setup this new hierarchy */
3683     if( ! ( root = __kmp_root[gtid] )) {
3684         root = __kmp_root[gtid] = (kmp_root_t*) __kmp_allocate( sizeof(kmp_root_t) );
3685         KMP_DEBUG_ASSERT( ! root->r.r_root_team );
3686     }
3687 
3688 #if KMP_STATS_ENABLED
3689     // Initialize stats as soon as possible (right after gtid assignment).
3690     __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid);
3691     KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life);
3692     KMP_SET_THREAD_STATE(SERIAL_REGION);
3693     KMP_INIT_PARTITIONED_TIMERS(OMP_serial);
3694 #endif
3695     __kmp_initialize_root( root );
3696 
3697     /* setup new root thread structure */
3698     if( root->r.r_uber_thread ) {
3699         root_thread = root->r.r_uber_thread;
3700     } else {
3701         root_thread = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
3702         if ( __kmp_storage_map ) {
3703             __kmp_print_thread_storage_map( root_thread, gtid );
3704         }
3705         root_thread->th.th_info .ds.ds_gtid = gtid;
3706         root_thread->th.th_root =  root;
3707         if( __kmp_env_consistency_check ) {
3708             root_thread->th.th_cons = __kmp_allocate_cons_stack( gtid );
3709         }
3710         #if USE_FAST_MEMORY
3711             __kmp_initialize_fast_memory( root_thread );
3712         #endif /* USE_FAST_MEMORY */
3713 
3714         #if KMP_USE_BGET
3715             KMP_DEBUG_ASSERT( root_thread->th.th_local.bget_data == NULL );
3716             __kmp_initialize_bget( root_thread );
3717         #endif
3718         __kmp_init_random( root_thread );  // Initialize random number generator
3719     }
3720 
3721     /* setup the serial team held in reserve by the root thread */
3722     if( ! root_thread->th.th_serial_team ) {
3723         kmp_internal_control_t r_icvs = __kmp_get_global_icvs();
3724         KF_TRACE( 10, ( "__kmp_register_root: before serial_team\n" ) );
3725 
3726         root_thread->th.th_serial_team = __kmp_allocate_team( root, 1, 1,
3727 #if OMPT_SUPPORT
3728           0, // root parallel id
3729 #endif
3730 #if OMP_40_ENABLED
3731           proc_bind_default,
3732 #endif
3733           &r_icvs,
3734           0 USE_NESTED_HOT_ARG(NULL) );
3735     }
3736     KMP_ASSERT( root_thread->th.th_serial_team );
3737     KF_TRACE( 10, ( "__kmp_register_root: after serial_team = %p\n",
3738       root_thread->th.th_serial_team ) );
3739 
3740     /* drop root_thread into place */
3741     TCW_SYNC_PTR(__kmp_threads[gtid], root_thread);
3742 
3743     root->r.r_root_team->t.t_threads[0] = root_thread;
3744     root->r.r_hot_team ->t.t_threads[0] = root_thread;
3745     root_thread->th.th_serial_team->t.t_threads[0] = root_thread;
3746     root_thread->th.th_serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for execution (it is unused for now).
3747     root->r.r_uber_thread = root_thread;
3748 
3749     /* initialize the thread, get it ready to go */
3750     __kmp_initialize_info( root_thread, root->r.r_root_team, 0, gtid );
3751     TCW_4(__kmp_init_gtid, TRUE);
3752 
3753     /* prepare the master thread for get_gtid() */
3754     __kmp_gtid_set_specific( gtid );
3755 
3756 #if USE_ITT_BUILD
3757     __kmp_itt_thread_name( gtid );
3758 #endif /* USE_ITT_BUILD */
3759 
3760     #ifdef KMP_TDATA_GTID
3761         __kmp_gtid = gtid;
3762     #endif
3763     __kmp_create_worker( gtid, root_thread, __kmp_stksize );
3764     KMP_DEBUG_ASSERT( __kmp_gtid_get_specific() == gtid );
3765 
3766     KA_TRACE( 20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, plain=%u\n",
3767                     gtid, __kmp_gtid_from_tid( 0, root->r.r_hot_team ),
3768                     root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
3769                     KMP_INIT_BARRIER_STATE ) );
3770     { // Initialize barrier data.
3771         int b;
3772         for ( b = 0; b < bs_last_barrier; ++ b ) {
3773             root_thread->th.th_bar[ b ].bb.b_arrived        = KMP_INIT_BARRIER_STATE;
3774 #if USE_DEBUGGER
3775             root_thread->th.th_bar[ b ].bb.b_worker_arrived = 0;
3776 #endif
3777         }; // for
3778     }
3779     KMP_DEBUG_ASSERT( root->r.r_hot_team->t.t_bar[ bs_forkjoin_barrier ].b_arrived == KMP_INIT_BARRIER_STATE );
3780 
3781 #if KMP_AFFINITY_SUPPORTED
3782 # if OMP_40_ENABLED
3783     root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3784     root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3785     root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3786     root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3787 # endif
3788 
3789     if ( TCR_4(__kmp_init_middle) ) {
3790         __kmp_affinity_set_init_mask( gtid, TRUE );
3791     }
3792 #endif /* KMP_AFFINITY_SUPPORTED */
3793 
3794     __kmp_root_counter ++;
3795 
3796     KMP_MB();
3797     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3798 
3799     return gtid;
3800 }
3801 
3802 #if KMP_NESTED_HOT_TEAMS
3803 static int
3804 __kmp_free_hot_teams( kmp_root_t *root, kmp_info_t *thr, int level, const int max_level )
3805 {
3806     int i, n, nth;
3807     kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3808     if( !hot_teams || !hot_teams[level].hot_team ) {
3809         return 0;
3810     }
3811     KMP_DEBUG_ASSERT( level < max_level );
3812     kmp_team_t *team = hot_teams[level].hot_team;
3813     nth = hot_teams[level].hot_team_nth;
3814     n = nth - 1;                   // master is not freed
3815     if( level < max_level - 1 ) {
3816         for( i = 0; i < nth; ++i ) {
3817             kmp_info_t *th = team->t.t_threads[i];
3818             n += __kmp_free_hot_teams( root, th, level + 1, max_level );
3819             if( i > 0 && th->th.th_hot_teams ) {
3820                 __kmp_free( th->th.th_hot_teams );
3821                 th->th.th_hot_teams = NULL;
3822             }
3823         }
3824     }
3825     __kmp_free_team( root, team, NULL );
3826     return n;
3827 }
3828 #endif
3829 
3830 /* Resets a root thread and clears its root and hot teams.
3831    Returns the number of __kmp_threads entries directly and indirectly freed.
3832 */
3833 static int
3834 __kmp_reset_root(int gtid, kmp_root_t *root)
3835 {
3836     kmp_team_t * root_team = root->r.r_root_team;
3837     kmp_team_t * hot_team  = root->r.r_hot_team;
3838     int          n         = hot_team->t.t_nproc;
3839     int i;
3840 
3841     KMP_DEBUG_ASSERT( ! root->r.r_active );
3842 
3843     root->r.r_root_team = NULL;
3844     root->r.r_hot_team  = NULL;
3845         // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team before
3846         // calling __kmp_free_team().
3847     __kmp_free_team( root, root_team USE_NESTED_HOT_ARG(NULL) );
3848 #if KMP_NESTED_HOT_TEAMS
3849     if( __kmp_hot_teams_max_level > 0 ) {  // need to free nested hot teams and their threads if any
3850         for( i = 0; i < hot_team->t.t_nproc; ++i ) {
3851             kmp_info_t *th = hot_team->t.t_threads[i];
3852             if( __kmp_hot_teams_max_level > 1 ) {
3853                 n += __kmp_free_hot_teams( root, th, 1, __kmp_hot_teams_max_level );
3854             }
3855             if( th->th.th_hot_teams ) {
3856                 __kmp_free( th->th.th_hot_teams );
3857                 th->th.th_hot_teams = NULL;
3858             }
3859         }
3860     }
3861 #endif
3862     __kmp_free_team( root, hot_team USE_NESTED_HOT_ARG(NULL) );
3863 
3864     //
3865     // Before we can reap the thread, we need to make certain that all
3866     // other threads in the teams that had this root as ancestor have stopped trying to steal tasks.
3867     //
3868     if ( __kmp_tasking_mode != tskm_immediate_exec ) {
3869         __kmp_wait_to_unref_task_teams();
3870     }
3871 
3872     #if KMP_OS_WINDOWS
3873         /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3874         KA_TRACE( 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC "\n",
3875             (LPVOID)&(root->r.r_uber_thread->th),
3876             root->r.r_uber_thread->th.th_info.ds.ds_thread ) );
3877         __kmp_free_handle( root->r.r_uber_thread->th.th_info.ds.ds_thread );
3878     #endif /* KMP_OS_WINDOWS */
3879 
3880 #if OMPT_SUPPORT
3881     if (ompt_enabled &&
3882         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
3883         int gtid = __kmp_get_gtid();
3884         __ompt_thread_end(ompt_thread_initial, gtid);
3885     }
3886 #endif
3887 
3888     TCW_4(__kmp_nth, __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3889     __kmp_reap_thread( root->r.r_uber_thread, 1 );
3890 
3891         // We cannot put the root thread into __kmp_thread_pool, so we have to reap it instead of freeing it.
3892     root->r.r_uber_thread = NULL;
3893     /* mark root as no longer in use */
3894     root->r.r_begin = FALSE;
3895 
3896     return n;
3897 }
3898 
3899 void
3900 __kmp_unregister_root_current_thread( int gtid )
3901 {
3902     KA_TRACE( 1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid ));
3903     /* this lock should be ok, since unregister_root_current_thread is never called during
3904      * an abort, only during a normal close.  Furthermore, if you hold the
3905      * forkjoin lock, you should never try to acquire the initz lock */
3906 
3907     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
3908     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
3909         KC_TRACE( 10, ("__kmp_unregister_root_current_thread: already finished, exiting T#%d\n", gtid ));
3910         __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3911         return;
3912     }
3913     kmp_root_t *root = __kmp_root[gtid];
3914 
3915     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3916     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3917     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3918     KMP_ASSERT( root->r.r_active == FALSE );
3919 
3920 
3921     KMP_MB();
3922 
3923 #if OMP_45_ENABLED
3924    kmp_info_t * thread = __kmp_threads[gtid];
3925    kmp_team_t * team = thread->th.th_team;
3926    kmp_task_team_t *   task_team = thread->th.th_task_team;
3927 
3928    // we need to wait for the proxy tasks before finishing the thread
3929    if ( task_team != NULL && task_team->tt.tt_found_proxy_tasks ) {
3930 #if OMPT_SUPPORT
3931         // the runtime is shutting down so we won't report any events
3932         thread->th.ompt_thread_info.state = ompt_state_undefined;
3933 #endif
3934         __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL));
3935    }
3936 #endif
3937 
3938     __kmp_reset_root(gtid, root);
3939 
3940     /* free up this thread slot */
3941     __kmp_gtid_set_specific( KMP_GTID_DNE );
3942 #ifdef KMP_TDATA_GTID
3943     __kmp_gtid = KMP_GTID_DNE;
3944 #endif
3945 
3946     KMP_MB();
3947     KC_TRACE( 10, ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid ));
3948 
3949     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
3950 }
3951 
3952 #if KMP_OS_WINDOWS
3953 /* __kmp_forkjoin_lock must be already held
3954    Unregisters a root thread that is not the current thread.  Returns the number of
3955    __kmp_threads entries freed as a result.
3956  */
3957 static int
3958 __kmp_unregister_root_other_thread( int gtid )
3959 {
3960     kmp_root_t *root = __kmp_root[gtid];
3961     int r;
3962 
3963     KA_TRACE( 1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid ));
3964     KMP_DEBUG_ASSERT( __kmp_threads && __kmp_threads[gtid] );
3965     KMP_ASSERT( KMP_UBER_GTID( gtid ));
3966     KMP_ASSERT( root == __kmp_threads[gtid]->th.th_root );
3967     KMP_ASSERT( root->r.r_active == FALSE );
3968 
3969     r = __kmp_reset_root(gtid, root);
3970     KC_TRACE( 10, ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid ));
3971     return r;
3972 }
3973 #endif
3974 
3975 #if KMP_DEBUG
3976 void __kmp_task_info() {
3977 
3978     kmp_int32 gtid       = __kmp_entry_gtid();
3979     kmp_int32 tid        = __kmp_tid_from_gtid( gtid );
3980     kmp_info_t *this_thr = __kmp_threads[ gtid ];
3981     kmp_team_t *steam    = this_thr->th.th_serial_team;
3982     kmp_team_t *team     = this_thr->th.th_team;
3983 
3984     __kmp_printf( "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p ptask=%p\n",
3985         gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, team->t.t_implicit_task_taskdata[tid].td_parent );
3986 }
3987 #endif // KMP_DEBUG
3988 
3989 /* TODO optimize with one big memclr, take out what isn't needed,
3990  * split responsibility to workers as much as possible, and delay
3991  * initialization of features as much as possible  */
3992 static void
3993 __kmp_initialize_info( kmp_info_t *this_thr, kmp_team_t *team, int tid, int gtid )
3994 {
3995     /* this_thr->th.th_info.ds.ds_gtid is setup in kmp_allocate_thread/create_worker
3996      * this_thr->th.th_serial_team is setup in __kmp_allocate_thread */
3997     kmp_info_t *master = team->t.t_threads[0];
3998     KMP_DEBUG_ASSERT( this_thr != NULL );
3999     KMP_DEBUG_ASSERT( this_thr->th.th_serial_team );
4000     KMP_DEBUG_ASSERT( team );
4001     KMP_DEBUG_ASSERT( team->t.t_threads  );
4002     KMP_DEBUG_ASSERT( team->t.t_dispatch );
4003     KMP_DEBUG_ASSERT( master );
4004     KMP_DEBUG_ASSERT( master->th.th_root );
4005 
4006     KMP_MB();
4007 
4008     TCW_SYNC_PTR(this_thr->th.th_team, team);
4009 
4010     this_thr->th.th_info.ds.ds_tid  = tid;
4011     this_thr->th.th_set_nproc       = 0;
4012     if (__kmp_tasking_mode != tskm_immediate_exec)
4013         // When tasking is possible, threads are not safe to reap until they are
4014         // done tasking; this will be set when tasking code is exited in wait
4015         this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP;
4016     else  // no tasking --> always safe to reap
4017         this_thr->th.th_reap_state = KMP_SAFE_TO_REAP;
4018 #if OMP_40_ENABLED
4019     this_thr->th.th_set_proc_bind   = proc_bind_default;
4020 # if KMP_AFFINITY_SUPPORTED
4021     this_thr->th.th_new_place       = this_thr->th.th_current_place;
4022 # endif
4023 #endif
4024     this_thr->th.th_root            = master->th.th_root;
4025 
4026     /* setup the thread's cache of the team structure */
4027     this_thr->th.th_team_nproc      = team->t.t_nproc;
4028     this_thr->th.th_team_master     = master;
4029     this_thr->th.th_team_serialized = team->t.t_serialized;
4030     TCW_PTR(this_thr->th.th_sleep_loc, NULL);
4031 
4032     KMP_DEBUG_ASSERT( team->t.t_implicit_task_taskdata );
4033 
4034     KF_TRACE( 10, ( "__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n",
4035                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4036 
4037     __kmp_init_implicit_task( this_thr->th.th_team_master->th.th_ident, this_thr, team, tid, TRUE );
4038 
4039     KF_TRACE( 10, ( "__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n",
4040                     tid, gtid, this_thr, this_thr->th.th_current_task ) );
4041     // TODO: Initialize ICVs from parent; GEH - isn't that already done in __kmp_initialize_team()?
4042 
4043     /* TODO no worksharing in speculative threads */
4044     this_thr->th.th_dispatch      = &team->t.t_dispatch[ tid ];
4045 
4046     this_thr->th.th_local.this_construct = 0;
4047 
4048 #ifdef BUILD_TV
4049     this_thr->th.th_local.tv_data = 0;
4050 #endif
4051 
4052     if ( ! this_thr->th.th_pri_common ) {
4053         this_thr->th.th_pri_common = (struct common_table *) __kmp_allocate( sizeof(struct common_table) );
4054         if ( __kmp_storage_map ) {
4055             __kmp_print_storage_map_gtid(
4056                 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1,
4057                 sizeof( struct common_table ), "th_%d.th_pri_common\n", gtid
4058             );
4059         }; // if
4060         this_thr->th.th_pri_head = NULL;
4061     }; // if
4062 
4063     /* Initialize dynamic dispatch */
4064     {
4065         volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch;
4066         /*
4067          * Use team max_nproc since this will never change for the team.
4068          */
4069         size_t disp_size = sizeof( dispatch_private_info_t ) *
4070             ( team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers );
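        // A serialized team (t_max_nproc == 1) only ever needs a single dispatch buffer;
        // otherwise allocate __kmp_dispatch_num_buffers of them so consecutive nowait
        // loops can use distinct buffers.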
4071         KD_TRACE( 10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, team->t.t_max_nproc ) );
4072         KMP_ASSERT( dispatch );
4073         KMP_DEBUG_ASSERT( team->t.t_dispatch );
4074         KMP_DEBUG_ASSERT( dispatch == &team->t.t_dispatch[ tid ] );
4075 
4076         dispatch->th_disp_index = 0;
4077 #if OMP_45_ENABLED
4078         dispatch->th_doacross_buf_idx = 0;
4079 #endif
4080         if( ! dispatch->th_disp_buffer )  {
4081             dispatch->th_disp_buffer = (dispatch_private_info_t *) __kmp_allocate( disp_size );
4082 
4083             if ( __kmp_storage_map ) {
4084                 __kmp_print_storage_map_gtid( gtid, &dispatch->th_disp_buffer[ 0 ],
4085                                          &dispatch->th_disp_buffer[ team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers ],
4086                                          disp_size, "th_%d.th_dispatch.th_disp_buffer "
4087                                          "(team_%d.t_dispatch[%d].th_disp_buffer)",
4088                                          gtid, team->t.t_id, gtid );
4089             }
4090         } else {
4091             memset( & dispatch->th_disp_buffer[0], '\0', disp_size );
4092         }
4093 
4094         dispatch->th_dispatch_pr_current = 0;
4095         dispatch->th_dispatch_sh_current = 0;
4096 
4097         dispatch->th_deo_fcn = 0;             /* ORDERED     */
4098         dispatch->th_dxo_fcn = 0;             /* END ORDERED */
4099     }
4100 
4101     this_thr->th.th_next_pool = NULL;
4102 
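    // th_task_state_memo_stack saves th_task_state at each nesting level so it can be
    // restored when nested parallel regions complete; start small (4 entries) -- it is
    // enlarged later if deeper nesting requires it.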
4103     if (!this_thr->th.th_task_state_memo_stack) {
4104         size_t i;
4105         this_thr->th.th_task_state_memo_stack = (kmp_uint8 *) __kmp_allocate( 4*sizeof(kmp_uint8) );
4106         this_thr->th.th_task_state_top = 0;
4107         this_thr->th.th_task_state_stack_sz = 4;
4108         for (i=0; i<this_thr->th.th_task_state_stack_sz; ++i) // zero init the stack
4109             this_thr->th.th_task_state_memo_stack[i] = 0;
4110     }
4111 
4112     KMP_DEBUG_ASSERT( !this_thr->th.th_spin_here );
4113     KMP_DEBUG_ASSERT( this_thr->th.th_next_waiting == 0 );
4114 
4115     KMP_MB();
4116 }
4117 
4118 
4119 /* allocate a new thread for the requesting team.  this is only called from within a
4120  * forkjoin critical section.  we will first try to get an available thread from the
4121  * thread pool.  if none is available, we will fork a new one, assuming we are able
4122  * to create one; this should be assured, as the caller is expected to have
4123  * checked for available capacity first.
4124  */
4125 kmp_info_t *
4126 __kmp_allocate_thread( kmp_root_t *root, kmp_team_t *team, int new_tid )
4127 {
4128     kmp_team_t  *serial_team;
4129     kmp_info_t  *new_thr;
4130     int          new_gtid;
4131 
4132     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid() ));
4133     KMP_DEBUG_ASSERT( root && team );
4134 #if !KMP_NESTED_HOT_TEAMS
4135     KMP_DEBUG_ASSERT( KMP_MASTER_GTID( __kmp_get_gtid() ));
4136 #endif
4137     KMP_MB();
4138 
4139     /* first, try to get one from the thread pool */
4140     if ( __kmp_thread_pool ) {
4141 
4142         new_thr = (kmp_info_t*)__kmp_thread_pool;
4143         __kmp_thread_pool = (volatile kmp_info_t *) new_thr->th.th_next_pool;
4144         if ( new_thr == __kmp_thread_pool_insert_pt ) {
4145             __kmp_thread_pool_insert_pt = NULL;
4146         }
4147         TCW_4(new_thr->th.th_in_pool, FALSE);
4148         //
4149         // Don't touch th_active_in_pool or th_active.
4150         // The worker thread adjusts those flags as it sleeps/awakens.
4151         //
4152         __kmp_thread_pool_nth--;
4153 
4154         KA_TRACE( 20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
4155                     __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid ));
4156         KMP_ASSERT(       ! new_thr->th.th_team );
4157         KMP_DEBUG_ASSERT( __kmp_nth < __kmp_threads_capacity );
4158         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth >= 0 );
4159 
4160         /* setup the thread structure */
4161         __kmp_initialize_info( new_thr, team, new_tid, new_thr->th.th_info.ds.ds_gtid );
4162         KMP_DEBUG_ASSERT( new_thr->th.th_serial_team );
4163 
4164         TCW_4(__kmp_nth, __kmp_nth + 1);
4165 
4166         new_thr->th.th_task_state = 0;
4167         new_thr->th.th_task_state_top = 0;
4168         new_thr->th.th_task_state_stack_sz = 4;
4169 
4170 #ifdef KMP_ADJUST_BLOCKTIME
4171         /* Adjust blocktime back to zero if necessary */
4172         /* Middle initialization might not have occurred yet */
4173         if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4174             if ( __kmp_nth > __kmp_avail_proc ) {
4175                 __kmp_zero_bt = TRUE;
4176             }
4177         }
4178 #endif /* KMP_ADJUST_BLOCKTIME */
4179 
4180 #if KMP_DEBUG
4181         // If thread entered pool via __kmp_free_thread, wait_flag should != KMP_BARRIER_PARENT_FLAG.
4182         int b;
4183         kmp_balign_t * balign = new_thr->th.th_bar;
4184         for( b = 0; b < bs_last_barrier; ++ b )
4185             KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4186 #endif
4187 
4188         KF_TRACE( 10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4189                     __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid ));
4190 
4191         KMP_MB();
4192         return new_thr;
4193     }
4194 
4195 
4196     /* no, we'll fork a new one */
4197     KMP_ASSERT( __kmp_nth    == __kmp_all_nth );
4198     KMP_ASSERT( __kmp_all_nth < __kmp_threads_capacity );
4199 
4200 #if KMP_USE_MONITOR
4201     //
4202     // If this is the first worker thread the RTL is creating, then also
4203     // launch the monitor thread.  We try to do this as early as possible.
4204     //
4205     if ( ! TCR_4( __kmp_init_monitor ) ) {
4206         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
4207         if ( ! TCR_4( __kmp_init_monitor ) ) {
4208             KF_TRACE( 10, ( "before __kmp_create_monitor\n" ) );
4209             TCW_4( __kmp_init_monitor, 1 );
4210             __kmp_create_monitor( & __kmp_monitor );
4211             KF_TRACE( 10, ( "after __kmp_create_monitor\n" ) );
4212             #if KMP_OS_WINDOWS
4213                 // AC: wait until monitor has started. This is a fix for CQ232808.
4214                 //     The reason is that if the library is loaded/unloaded in a loop with small (parallel)
4215                 //     work in between, there is a high probability that the monitor thread only starts after
4216                 //     the library has shut down. At shutdown it is too late to cope with the problem, because
4217                 //     when the master is in DllMain (process detach) the monitor has no chance to start
4218                 //     (it is blocked), and the master has no means to inform the monitor that the library has
4219                 //     gone, because all the memory the monitor can access is about to be released/reset.
4220                 while ( TCR_4(__kmp_init_monitor) < 2 ) {
4221                     KMP_YIELD( TRUE );
4222                 }
4223                 KF_TRACE( 10, ( "after monitor thread has started\n" ) );
4224             #endif
4225         }
4226         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
4227     }
4228 #endif
4229 
4230     KMP_MB();
4231     for( new_gtid=1 ; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid ) {
4232         KMP_DEBUG_ASSERT( new_gtid < __kmp_threads_capacity );
4233     }
4234 
4235     /* allocate space for it. */
4236     new_thr = (kmp_info_t*) __kmp_allocate( sizeof(kmp_info_t) );
4237 
4238     TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);
4239 
4240     if ( __kmp_storage_map ) {
4241         __kmp_print_thread_storage_map( new_thr, new_gtid );
4242     }
4243 
4244     /* add the reserve serialized team, initialized from the team's master thread */
4245     {
4246     kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs( team );
4247     KF_TRACE( 10, ( "__kmp_allocate_thread: before th_serial/serial_team\n" ) );
4248 
4249     new_thr->th.th_serial_team = serial_team =
4250         (kmp_team_t*) __kmp_allocate_team( root, 1, 1,
4251 #if OMPT_SUPPORT
4252                                            0, // root parallel id
4253 #endif
4254 #if OMP_40_ENABLED
4255                                            proc_bind_default,
4256 #endif
4257                                            &r_icvs,
4258                                            0 USE_NESTED_HOT_ARG(NULL) );
4259     }
4260     KMP_ASSERT ( serial_team );
4261     serial_team->t.t_serialized = 0;   // AC: the team is created in reserve, not for execution (it is unused for now).
4262     serial_team->t.t_threads[0] = new_thr;
4263     KF_TRACE( 10, ( "__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4264       new_thr ) );
4265 
4266     /* setup the thread structures */
4267     __kmp_initialize_info( new_thr, team, new_tid, new_gtid );
4268 
4269     #if USE_FAST_MEMORY
4270         __kmp_initialize_fast_memory( new_thr );
4271     #endif /* USE_FAST_MEMORY */
4272 
4273     #if KMP_USE_BGET
4274         KMP_DEBUG_ASSERT( new_thr->th.th_local.bget_data == NULL );
4275         __kmp_initialize_bget( new_thr );
4276     #endif
4277 
4278     __kmp_init_random( new_thr );  // Initialize random number generator
4279 
4280     /* Initialize these only once when thread is grabbed for a team allocation */
4281     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4282                     __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
4283 
4284     int b;
4285     kmp_balign_t * balign = new_thr->th.th_bar;
4286     for(b=0; b<bs_last_barrier; ++b) {
4287         balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4288         balign[b].bb.team = NULL;
4289         balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4290         balign[b].bb.use_oncore_barrier = 0;
4291     }
4292 
4293     new_thr->th.th_spin_here = FALSE;
4294     new_thr->th.th_next_waiting = 0;
4295 
4296 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4297     new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4298     new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4299     new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4300     new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4301 #endif
4302 
4303     TCW_4(new_thr->th.th_in_pool, FALSE);
4304     new_thr->th.th_active_in_pool = FALSE;
4305     TCW_4(new_thr->th.th_active, TRUE);
4306 
4307     /* adjust the global counters */
4308     __kmp_all_nth ++;
4309     __kmp_nth ++;
4310 
4311     //
4312     // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search)
4313     // for low numbers of procs, and method #2 (keyed API call) for higher
4314     // numbers of procs.
4315     //
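    // For example (hypothetical threshold): if __kmp_tls_gtid_min were 20, the stack-pointer
    // search would be used until the 20th thread is created, after which the runtime switches
    // to the keyed lookup (e.g. pthread_getspecific on Unix-like systems), which avoids
    // scanning per-thread stack ranges at the cost of an API call per query.
    //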
4316     if ( __kmp_adjust_gtid_mode ) {
4317         if ( __kmp_all_nth >= __kmp_tls_gtid_min ) {
4318             if ( TCR_4(__kmp_gtid_mode) != 2) {
4319                 TCW_4(__kmp_gtid_mode, 2);
4320             }
4321         }
4322         else {
4323             if (TCR_4(__kmp_gtid_mode) != 1 ) {
4324                 TCW_4(__kmp_gtid_mode, 1);
4325             }
4326         }
4327     }
4328 
4329 #ifdef KMP_ADJUST_BLOCKTIME
4330     /* Adjust blocktime back to zero if necessary       */
4331     /* Middle initialization might not have occurred yet */
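    /* If the process is now oversubscribed (more OpenMP threads than available procs),
       spinning at barriers only wastes cycles, so force the effective blocktime to zero. */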
4332     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
4333         if ( __kmp_nth > __kmp_avail_proc ) {
4334             __kmp_zero_bt = TRUE;
4335         }
4336     }
4337 #endif /* KMP_ADJUST_BLOCKTIME */
4338 
4339     /* actually fork it and create the new worker thread */
4340     KF_TRACE( 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr ));
4341     __kmp_create_worker( new_gtid, new_thr, __kmp_stksize );
4342     KF_TRACE( 10, ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr ));
4343 
4344     KA_TRACE( 20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), new_gtid ));
4345     KMP_MB();
4346     return new_thr;
4347 }
4348 
4349 /*
4350  * reinitialize team for reuse.
4351  *
4352  * The hot team code calls this routine at every fork barrier, so the EPCC barrier
4353  * tests are extremely sensitive to changes in it, esp. writes to the team
4354  * struct, which cause a cache invalidation in all threads.
4355  *
4356  * IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
4357  */
4358 static void
4359 __kmp_reinitialize_team( kmp_team_t *team, kmp_internal_control_t *new_icvs, ident_t *loc ) {
4360     KF_TRACE( 10, ( "__kmp_reinitialize_team: enter this_thread=%p team=%p\n",
4361                     team->t.t_threads[0], team ) );
4362     KMP_DEBUG_ASSERT( team && new_icvs);
4363     KMP_DEBUG_ASSERT( ( ! TCR_4(__kmp_init_parallel) ) || new_icvs->nproc );
4364     KMP_CHECK_UPDATE(team->t.t_ident, loc);
4365 
4366     KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID());
4367 
4368     // Copy ICVs to the master thread's implicit taskdata
4369     __kmp_init_implicit_task( loc, team->t.t_threads[0], team, 0, FALSE );
4370     copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs);
4371 
4372     KF_TRACE( 10, ( "__kmp_reinitialize_team: exit this_thread=%p team=%p\n",
4373                     team->t.t_threads[0], team ) );
4374 }
4375 
4376 
4377 /* initialize the team data structure
4378  * this assumes the t_threads and t_max_nproc are already set
4379  * also, we don't touch the arguments */
4380 static void
4381 __kmp_initialize_team(
4382     kmp_team_t * team,
4383     int          new_nproc,
4384     kmp_internal_control_t * new_icvs,
4385     ident_t *                loc
4386 ) {
4387     KF_TRACE( 10, ( "__kmp_initialize_team: enter: team=%p\n", team ) );
4388 
4389     /* verify */
4390     KMP_DEBUG_ASSERT( team );
4391     KMP_DEBUG_ASSERT( new_nproc <= team->t.t_max_nproc );
4392     KMP_DEBUG_ASSERT( team->t.t_threads );
4393     KMP_MB();
4394 
4395     team->t.t_master_tid  = 0;    /* not needed */
4396     /* team->t.t_master_bar;        not needed */
4397     team->t.t_serialized  = new_nproc > 1 ? 0 : 1;
4398     team->t.t_nproc       = new_nproc;
4399 
4400     /* team->t.t_parent     = NULL; TODO not needed & would mess up hot team */
4401     team->t.t_next_pool   = NULL;
4402     /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess up hot team */
4403 
4404     TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */
4405     team->t.t_invoke      = NULL; /* not needed */
4406 
4407     // TODO???: team->t.t_max_active_levels       = new_max_active_levels;
4408     team->t.t_sched       = new_icvs->sched;
4409 
4410 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
4411     team->t.t_fp_control_saved = FALSE; /* not needed */
4412     team->t.t_x87_fpu_control_word = 0; /* not needed */
4413     team->t.t_mxcsr = 0;                /* not needed */
4414 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
4415 
4416     team->t.t_construct   = 0;
4417     __kmp_init_lock( & team->t.t_single_lock );
4418 
4419     team->t.t_ordered.dt.t_value = 0;
4420     team->t.t_master_active = FALSE;
4421 
4422     memset( & team->t.t_taskq, '\0', sizeof( kmp_taskq_t ));
4423 
4424 #ifdef KMP_DEBUG
4425     team->t.t_copypriv_data = NULL;  /* not necessary, but nice for debugging */
4426 #endif
4427     team->t.t_copyin_counter = 0;    /* for barrier-free copyin implementation */
4428 
4429     team->t.t_control_stack_top = NULL;
4430 
4431     __kmp_reinitialize_team( team, new_icvs, loc );
4432 
4433     KMP_MB();
4434     KF_TRACE( 10, ( "__kmp_initialize_team: exit: team=%p\n", team ) );
4435 }
4436 
4437 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4438 /* Sets full mask for thread and returns old mask, no changes to structures. */
4439 static void
4440 __kmp_set_thread_affinity_mask_full_tmp( kmp_affin_mask_t *old_mask )
4441 {
4442     if ( KMP_AFFINITY_CAPABLE() ) {
4443         int status;
4444         if ( old_mask != NULL ) {
4445             status = __kmp_get_system_affinity( old_mask, TRUE );
4446             int error = errno;
4447             if ( status != 0 ) {
4448                 __kmp_msg(
4449                     kmp_ms_fatal,
4450                     KMP_MSG( ChangeThreadAffMaskError ),
4451                     KMP_ERR( error ),
4452                     __kmp_msg_null
4453                 );
4454             }
4455         }
4456         __kmp_set_system_affinity( __kmp_affin_fullMask, TRUE );
4457     }
4458 }
4459 #endif
4460 
4461 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
4462 
4463 //
4464 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
4465 // It calculates the master and worker threads' partition based upon the parent
4466 // thread's partition, and assigns each worker a place within its partition.
4467 // The master thread's partition should already include its current binding.
4468 //
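// For example (hypothetical numbers): with proc_bind_close, a master partition of
// places [2,5] and a team of 4 threads, the master keeps its current place (say 2)
// and workers 1..3 are assigned places 3, 4 and 5; every thread also inherits [2,5]
// as its own partition for any nested regions.
//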
4469 static void
4470 __kmp_partition_places( kmp_team_t *team, int update_master_only )
4471 {
4472     //
4473     // Copy the master thread's place partition to the team struct
4474     //
4475     kmp_info_t *master_th = team->t.t_threads[0];
4476     KMP_DEBUG_ASSERT( master_th != NULL );
4477     kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
4478     int first_place = master_th->th.th_first_place;
4479     int last_place = master_th->th.th_last_place;
4480     int masters_place = master_th->th.th_current_place;
4481     team->t.t_first_place = first_place;
4482     team->t.t_last_place = last_place;
4483 
4484     KA_TRACE( 20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) bound to place %d partition = [%d,%d]\n",
4485        proc_bind, __kmp_gtid_from_thread( team->t.t_threads[0] ), team->t.t_id,
4486        masters_place, first_place, last_place ) );
4487 
4488     switch ( proc_bind ) {
4489 
4490         case proc_bind_default:
4491         //
4492         // serial teams might have the proc_bind policy set to
4493         // proc_bind_default.  It doesn't matter, as we don't
4494         // rebind the master thread for any proc_bind policy.
4495         //
4496         KMP_DEBUG_ASSERT( team->t.t_nproc == 1 );
4497         break;
4498 
4499         case proc_bind_master:
4500         {
4501             int f;
4502             int n_th = team->t.t_nproc;
4503             for ( f = 1; f < n_th; f++ ) {
4504                 kmp_info_t *th = team->t.t_threads[f];
4505                 KMP_DEBUG_ASSERT( th != NULL );
4506                 th->th.th_first_place = first_place;
4507                 th->th.th_last_place = last_place;
4508                 th->th.th_new_place = masters_place;
4509 
4510                 KA_TRACE( 100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4511                   __kmp_gtid_from_thread( team->t.t_threads[f] ),
4512                   team->t.t_id, f, masters_place, first_place, last_place ) );
4513             }
4514         }
4515         break;
4516 
4517         case proc_bind_close:
4518         {
4519             int f;
4520             int n_th = team->t.t_nproc;
4521             int n_places;
4522             if ( first_place <= last_place ) {
4523                 n_places = last_place - first_place + 1;
4524             }
4525             else {
4526                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4527             }
4528             if ( n_th <= n_places ) {
4529                 int place = masters_place;
4530                 for ( f = 1; f < n_th; f++ ) {
4531                     kmp_info_t *th = team->t.t_threads[f];
4532                     KMP_DEBUG_ASSERT( th != NULL );
4533 
4534                     if ( place == last_place ) {
4535                         place = first_place;
4536                     }
4537                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4538                         place = 0;
4539                     }
4540                     else {
4541                         place++;
4542                     }
4543                     th->th.th_first_place = first_place;
4544                     th->th.th_last_place = last_place;
4545                     th->th.th_new_place = place;
4546 
4547                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4548                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4549                        team->t.t_id, f, place, first_place, last_place ) );
4550                 }
4551             }
4552             else {
4553                 int S, rem, gap, s_count;
4554                 S = n_th / n_places;
4555                 s_count = 0;
4556                 rem = n_th - ( S * n_places );
4557                 gap = rem > 0 ? n_places/rem : n_places;
4558                 int place = masters_place;
4559                 int gap_ct = gap;
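                // Worked example (hypothetical numbers): n_th = 10 threads over n_places = 4
                // places gives S = 2, rem = 2, gap = 2, so consecutive places starting at the
                // master's place receive 3, 2, 3 and 2 threads, and `place` wraps back to
                // masters_place by the end of the loop (checked by the assertion below).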
4560                 for ( f = 0; f < n_th; f++ ) {
4561                     kmp_info_t *th = team->t.t_threads[f];
4562                     KMP_DEBUG_ASSERT( th != NULL );
4563 
4564                     th->th.th_first_place = first_place;
4565                     th->th.th_last_place = last_place;
4566                     th->th.th_new_place = place;
4567                     s_count++;
4568 
4569                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4570                         // do nothing, add an extra thread to place on next iteration
4571                     }
4572                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4573                         // we added an extra thread to this place; move to next place
4574                         if ( place == last_place ) {
4575                             place = first_place;
4576                         }
4577                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4578                             place = 0;
4579                         }
4580                         else {
4581                             place++;
4582                         }
4583                         s_count = 0;
4584                         gap_ct = 1;
4585                         rem--;
4586                     }
4587                     else if (s_count == S) { // place full; don't add extra
4588                         if ( place == last_place ) {
4589                             place = first_place;
4590                         }
4591                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4592                             place = 0;
4593                         }
4594                         else {
4595                             place++;
4596                         }
4597                         gap_ct++;
4598                         s_count = 0;
4599                     }
4600 
4601                     KA_TRACE( 100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4602                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4603                       team->t.t_id, f, th->th.th_new_place, first_place,
4604                       last_place ) );
4605                 }
4606                 KMP_DEBUG_ASSERT( place == masters_place );
4607             }
4608         }
4609         break;
4610 
4611         case proc_bind_spread:
4612         {
4613             int f;
4614             int n_th = team->t.t_nproc;
4615             int n_places;
4616             int thidx;
4617             if ( first_place <= last_place ) {
4618                 n_places = last_place - first_place + 1;
4619             }
4620             else {
4621                 n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
4622             }
4623             if ( n_th <= n_places ) {
4624                 int place = masters_place;
4625                 int S = n_places/n_th;
4626                 int s_count, rem, gap, gap_ct;
4627                 rem = n_places - n_th*S;
4628                 gap = rem ? n_th/rem : 1;
4629                 gap_ct = gap;
4630                 thidx = n_th;
4631                 if (update_master_only == 1)
4632                     thidx = 1;
4633                 for ( f = 0; f < thidx; f++ ) {
4634                     kmp_info_t *th = team->t.t_threads[f];
4635                     KMP_DEBUG_ASSERT( th != NULL );
4636 
4637                     th->th.th_first_place = place;
4638                     th->th.th_new_place = place;
4639                     s_count = 1;
4640                     while (s_count < S) {
4641                         if ( place == last_place ) {
4642                             place = first_place;
4643                         }
4644                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4645                             place = 0;
4646                         }
4647                         else {
4648                             place++;
4649                         }
4650                         s_count++;
4651                     }
4652                     if (rem && (gap_ct == gap)) {
4653                         if ( place == last_place ) {
4654                             place = first_place;
4655                         }
4656                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4657                             place = 0;
4658                         }
4659                         else {
4660                             place++;
4661                         }
4662                         rem--;
4663                         gap_ct = 0;
4664                     }
4665                     th->th.th_last_place = place;
4666                     gap_ct++;
4667 
4668                     if ( place == last_place ) {
4669                         place = first_place;
4670                     }
4671                     else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4672                         place = 0;
4673                     }
4674                     else {
4675                         place++;
4676                     }
4677 
4678                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4679                       __kmp_gtid_from_thread( team->t.t_threads[f] ),
4680                       team->t.t_id, f, th->th.th_new_place,
4681                       th->th.th_first_place, th->th.th_last_place ) );
4682                 }
4683                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4684             }
4685             else {
4686                 int S, rem, gap, s_count;
4687                 S = n_th / n_places;
4688                 s_count = 0;
4689                 rem = n_th - ( S * n_places );
4690                 gap = rem > 0 ? n_places/rem : n_places;
4691                 int place = masters_place;
4692                 int gap_ct = gap;
4693                 thidx = n_th;
4694                 if (update_master_only == 1)
4695                     thidx = 1;
4696                 for ( f = 0; f < thidx; f++ ) {
4697                     kmp_info_t *th = team->t.t_threads[f];
4698                     KMP_DEBUG_ASSERT( th != NULL );
4699 
4700                     th->th.th_first_place = place;
4701                     th->th.th_last_place = place;
4702                     th->th.th_new_place = place;
4703                     s_count++;
4704 
4705                     if ( (s_count == S) && rem && (gap_ct == gap) ) {
4706                         // do nothing, add an extra thread to place on next iteration
4707                     }
4708                     else if ( (s_count == S+1) && rem && (gap_ct == gap) ) {
4709                         // we added an extra thread to this place; move on to next place
4710                         if ( place == last_place ) {
4711                             place = first_place;
4712                         }
4713                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4714                             place = 0;
4715                         }
4716                         else {
4717                             place++;
4718                         }
4719                         s_count = 0;
4720                         gap_ct = 1;
4721                         rem--;
4722                     }
4723                     else if (s_count == S) { // place is full; don't add extra thread
4724                         if ( place == last_place ) {
4725                             place = first_place;
4726                         }
4727                         else if ( place == (int)(__kmp_affinity_num_masks - 1) ) {
4728                             place = 0;
4729                         }
4730                         else {
4731                             place++;
4732                         }
4733                         gap_ct++;
4734                         s_count = 0;
4735                     }
4736 
4737                     KA_TRACE( 100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d partition = [%d,%d]\n",
4738                        __kmp_gtid_from_thread( team->t.t_threads[f] ),
4739                        team->t.t_id, f, th->th.th_new_place,
4740                        th->th.th_first_place, th->th.th_last_place) );
4741                 }
4742                 KMP_DEBUG_ASSERT( update_master_only || place == masters_place );
4743             }
4744         }
4745         break;
4746 
4747         default:
4748         break;
4749     }
4750 
4751     KA_TRACE( 20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id ) );
4752 }
4753 
4754 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */
4755 
4756 /* allocate a new team data structure to use.  take one off of the free pool if available */
4757 kmp_team_t *
4758 __kmp_allocate_team( kmp_root_t *root, int new_nproc, int max_nproc,
4759 #if OMPT_SUPPORT
4760     ompt_parallel_id_t ompt_parallel_id,
4761 #endif
4762 #if OMP_40_ENABLED
4763     kmp_proc_bind_t new_proc_bind,
4764 #endif
4765     kmp_internal_control_t *new_icvs,
4766     int argc USE_NESTED_HOT_ARG(kmp_info_t *master) )
4767 {
4768     KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team);
4769     int f;
4770     kmp_team_t *team;
4771     int use_hot_team = ! root->r.r_active;
4772     int level = 0;
4773 
4774     KA_TRACE( 20, ("__kmp_allocate_team: called\n"));
4775     KMP_DEBUG_ASSERT( new_nproc >=1 && argc >=0 );
4776     KMP_DEBUG_ASSERT( max_nproc >= new_nproc );
4777     KMP_MB();
4778 
4779 #if KMP_NESTED_HOT_TEAMS
4780     kmp_hot_team_ptr_t *hot_teams;
4781     if( master ) {
4782         team = master->th.th_team;
4783         level = team->t.t_active_level;
4784         if( master->th.th_teams_microtask ) {                         // in teams construct?
4785             if( master->th.th_teams_size.nteams > 1 && (             // #teams > 1
4786                 team->t.t_pkfn == (microtask_t)__kmp_teams_master || // inner fork of the teams
4787                 master->th.th_teams_level < team->t.t_level ) ) {    // or nested parallel inside the teams
4788                 ++level; // do not increment if #teams==1 or for the outer fork of the teams; increment otherwise
4789             }
4790         }
4791         hot_teams = master->th.th_hot_teams;
4792         if( level < __kmp_hot_teams_max_level && hot_teams && hot_teams[level].hot_team )
4793         {   // hot team has already been allocated for given level
4794             use_hot_team = 1;
4795         } else {
4796             use_hot_team = 0;
4797         }
4798     }
4799 #endif
4800     // Optimization to use a "hot" team
4801     if( use_hot_team && new_nproc > 1 ) {
4802         KMP_DEBUG_ASSERT( new_nproc == max_nproc );
4803 #if KMP_NESTED_HOT_TEAMS
4804         team = hot_teams[level].hot_team;
4805 #else
4806         team =  root->r.r_hot_team;
4807 #endif
4808 #if KMP_DEBUG
4809         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
4810             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p before reinit\n",
4811                            team->t.t_task_team[0], team->t.t_task_team[1] ));
4812         }
4813 #endif
4814 
4815         // Has the number of threads changed?
4816         /* Let's assume the most common case is that the number of threads is unchanged, and
4817            put that case first. */
4818         if (team->t.t_nproc == new_nproc) { // Check changes in number of threads
4819             KA_TRACE( 20, ("__kmp_allocate_team: reusing hot team\n" ));
4820             // This case can mean that omp_set_num_threads() was called and the hot team size
4821             // was already reduced, so we check the special flag
4822             if ( team->t.t_size_changed == -1 ) {
4823                 team->t.t_size_changed = 1;
4824             } else {
4825                 KMP_CHECK_UPDATE(team->t.t_size_changed, 0);
4826             }
4827 
4828             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4829             kmp_r_sched_t new_sched = new_icvs->sched;
4830             if (team->t.t_sched.r_sched_type != new_sched.r_sched_type ||
4831                 team->t.t_sched.chunk != new_sched.chunk)
4832                 team->t.t_sched = new_sched; // set master's schedule as new run-time schedule
4833 
4834             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4835 
4836             KF_TRACE( 10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n",
4837                            0, team->t.t_threads[0], team ) );
4838             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4839 
4840 #if OMP_40_ENABLED
4841 # if KMP_AFFINITY_SUPPORTED
4842             if ( ( team->t.t_size_changed == 0 )
4843               && ( team->t.t_proc_bind == new_proc_bind ) ) {
4844                 if (new_proc_bind == proc_bind_spread) {
4845                     __kmp_partition_places(team, 1); // add flag to update only master for spread
4846                 }
4847                 KA_TRACE( 200, ("__kmp_allocate_team: reusing hot team #%d bindings: proc_bind = %d, partition = [%d,%d]\n",
4848                   team->t.t_id, new_proc_bind, team->t.t_first_place,
4849                   team->t.t_last_place ) );
4850             }
4851             else {
4852                 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4853                 __kmp_partition_places( team );
4854             }
4855 # else
4856             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4857 # endif /* KMP_AFFINITY_SUPPORTED */
4858 #endif /* OMP_40_ENABLED */
4859         }
4860         else if( team->t.t_nproc > new_nproc ) {
4861             KA_TRACE( 20, ("__kmp_allocate_team: decreasing hot team thread count to %d\n", new_nproc ));
4862 
4863             team->t.t_size_changed = 1;
4864 #if KMP_NESTED_HOT_TEAMS
4865             if( __kmp_hot_teams_mode == 0 ) {
4866                 // AC: the saved number of threads should correspond to the team's value in this mode;
4867                 // it can be bigger in mode 1, when the hot team keeps some threads in reserve
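                // (__kmp_hot_teams_mode == 0 releases the extra threads when the hot team
                //  shrinks; mode 1 keeps them in reserve, parked on their own b_go flag,
                //  so they can be reused if the team grows again.)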
4868                 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc);
4869                 hot_teams[level].hot_team_nth = new_nproc;
4870 #endif // KMP_NESTED_HOT_TEAMS
4871                 /* release the extra threads we don't need any more */
4872                 for( f = new_nproc  ;  f < team->t.t_nproc  ;  f++ ) {
4873                     KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
4874                     if ( __kmp_tasking_mode != tskm_immediate_exec) {
4875                         // When decreasing team size, threads no longer in the team should unref task team.
4876                         team->t.t_threads[f]->th.th_task_team = NULL;
4877                     }
4878                     __kmp_free_thread( team->t.t_threads[ f ] );
4879                     team->t.t_threads[ f ] = NULL;
4880                 }
4881 #if KMP_NESTED_HOT_TEAMS
4882             } // (__kmp_hot_teams_mode == 0)
4883             else {
4884                 // When keeping extra threads in team, switch threads to wait on own b_go flag
4885                 for (f=new_nproc; f<team->t.t_nproc; ++f) {
4886                     KMP_DEBUG_ASSERT(team->t.t_threads[f]);
4887                     kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar;
4888                     for (int b=0; b<bs_last_barrier; ++b) {
4889                         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) {
4890                             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
4891                         }
4892                         KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0);
4893                     }
4894                 }
4895             }
4896 #endif // KMP_NESTED_HOT_TEAMS
4897             team->t.t_nproc =  new_nproc;
4898             // TODO???: team->t.t_max_active_levels = new_max_active_levels;
4899             if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type ||
4900                 team->t.t_sched.chunk != new_icvs->sched.chunk)
4901                 team->t.t_sched = new_icvs->sched;
4902             __kmp_reinitialize_team( team, new_icvs, root->r.r_uber_thread->th.th_ident );
4903 
4904             /* update the remaining threads */
4905             for(f = 0; f < new_nproc; ++f) {
4906                 team->t.t_threads[f]->th.th_team_nproc = new_nproc;
4907             }
4908             // restore the current task state of the master thread: should be the implicit task
4909             KF_TRACE( 10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n",
4910                        0, team->t.t_threads[0], team ) );
4911 
4912             __kmp_push_current_task_to_thread( team->t.t_threads[ 0 ], team, 0 );
4913 
4914 #ifdef KMP_DEBUG
4915             for ( f = 0; f < team->t.t_nproc; f++ ) {
4916                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
4917                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
4918             }
4919 #endif
4920 
4921 #if OMP_40_ENABLED
4922             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
4923 # if KMP_AFFINITY_SUPPORTED
4924             __kmp_partition_places( team );
4925 # endif
4926 #endif
4927         }
4928         else { // team->t.t_nproc < new_nproc
4929 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4930             kmp_affin_mask_t *old_mask;
4931             if ( KMP_AFFINITY_CAPABLE() ) {
4932                 KMP_CPU_ALLOC(old_mask);
4933             }
4934 #endif
4935 
4936             KA_TRACE( 20, ("__kmp_allocate_team: increasing hot team thread count to %d\n", new_nproc ));
4937 
4938             team->t.t_size_changed = 1;
4939 
4940 #if KMP_NESTED_HOT_TEAMS
4941             int avail_threads = hot_teams[level].hot_team_nth;
4942             if( new_nproc < avail_threads )
4943                 avail_threads = new_nproc;
4944             kmp_info_t **other_threads = team->t.t_threads;
4945             for ( f = team->t.t_nproc; f < avail_threads; ++f ) {
4946                 // Adjust barrier data of reserved threads (if any) of the team
4947                 // Other data will be set in __kmp_initialize_info() below.
4948                 int b;
4949                 kmp_balign_t * balign = other_threads[f]->th.th_bar;
4950                 for ( b = 0; b < bs_last_barrier; ++ b ) {
4951                     balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
4952                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4953 #if USE_DEBUGGER
4954                     balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
4955 #endif
4956                 }
4957             }
4958             if( hot_teams[level].hot_team_nth >= new_nproc ) {
4959                 // we have all needed threads in reserve, no need to allocate any
4960                 // this is only possible in mode 1; mode 0 cannot have reserved threads
4961                 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1);
4962                 team->t.t_nproc = new_nproc;                     // just get reserved threads involved
4963             } else {
4964                 // we may have some threads in reserve, but not enough
4965                 team->t.t_nproc = hot_teams[level].hot_team_nth; // get reserved threads involved if any
4966                 hot_teams[level].hot_team_nth = new_nproc;       // adjust hot team max size
4967 #endif // KMP_NESTED_HOT_TEAMS
4968             if(team->t.t_max_nproc < new_nproc) {
4969                 /* reallocate larger arrays */
4970                 __kmp_reallocate_team_arrays(team, new_nproc);
4971                 __kmp_reinitialize_team( team, new_icvs, NULL );
4972             }
4973 
4974 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
4975             /* Temporarily set full mask for master thread before
4976                creation of workers. The reason is that workers inherit
4977                the affinity from master, so if a lot of workers are
4978                created on the single core quickly, they don't get
4979                a chance to set their own affinity for a long time.
4980             */
4981             __kmp_set_thread_affinity_mask_full_tmp( old_mask );
4982 #endif
4983 
4984             /* allocate new threads for the hot team */
4985             for( f = team->t.t_nproc  ;  f < new_nproc  ;  f++ ) {
4986                 kmp_info_t * new_worker = __kmp_allocate_thread( root, team, f );
4987                 KMP_DEBUG_ASSERT( new_worker );
4988                 team->t.t_threads[ f ] = new_worker;
4989 
4990                 KA_TRACE( 20, ("__kmp_allocate_team: team %d init T#%d(%d:%d) arrived: join=%llu, plain=%llu\n",
4991                                 team->t.t_id, __kmp_gtid_from_tid( f, team ), team->t.t_id, f,
4992                                 team->t.t_bar[bs_forkjoin_barrier].b_arrived,
4993                                 team->t.t_bar[bs_plain_barrier].b_arrived ) );
4994 
4995                 { // Initialize barrier data for new threads.
4996                     int b;
4997                     kmp_balign_t * balign = new_worker->th.th_bar;
4998                     for( b = 0; b < bs_last_barrier; ++ b ) {
4999                         balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
5000                         KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5001 #if USE_DEBUGGER
5002                         balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5003 #endif
5004                     }
5005                 }
5006             }
5007 
5008 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED
5009             if ( KMP_AFFINITY_CAPABLE() ) {
5010                 /* Restore initial master thread's affinity mask */
5011                 __kmp_set_system_affinity( old_mask, TRUE );
5012                 KMP_CPU_FREE(old_mask);
5013             }
5014 #endif
5015 #if KMP_NESTED_HOT_TEAMS
5016             } // end of check of t_nproc vs. new_nproc vs. hot_team_nth
5017 #endif // KMP_NESTED_HOT_TEAMS
5018             /* make sure everyone is synchronized */
5019             int old_nproc = team->t.t_nproc; // save old value and use to update only new threads below
5020             __kmp_initialize_team( team, new_nproc, new_icvs, root->r.r_uber_thread->th.th_ident );
5021 
5022             /* reinitialize the threads */
5023             KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc);
5024             for (f=0;  f < team->t.t_nproc; ++f)
5025                 __kmp_initialize_info( team->t.t_threads[ f ], team, f, __kmp_gtid_from_tid( f, team ) );
5026             if (level) { // set th_task_state for new threads in nested hot team
5027                 // __kmp_initialize_info() no longer zeroes th_task_state, so we should only need to set the
5028                 // th_task_state for the new threads. th_task_state for master thread will not be accurate until
5029                 // after this in __kmp_fork_call(), so we look to the master's memo_stack to get the correct value.
5030                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5031                     team->t.t_threads[f]->th.th_task_state = team->t.t_threads[0]->th.th_task_state_memo_stack[level];
5032             }
5033             else { // set th_task_state for new threads in non-nested hot team
5034                 int old_state = team->t.t_threads[0]->th.th_task_state; // copy master's state
5035                 for (f=old_nproc; f < team->t.t_nproc; ++f)
5036                     team->t.t_threads[f]->th.th_task_state = old_state;
5037             }
5038 
5039 #ifdef KMP_DEBUG
5040             for ( f = 0; f < team->t.t_nproc; ++ f ) {
5041                 KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
5042                     team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
5043             }
5044 #endif
5045 
5046 #if OMP_40_ENABLED
5047             KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind);
5048 # if KMP_AFFINITY_SUPPORTED
5049             __kmp_partition_places( team );
5050 # endif
5051 #endif
5052         } // Check changes in number of threads
5053 
5054 #if OMP_40_ENABLED
5055         kmp_info_t *master = team->t.t_threads[0];
5056         if( master->th.th_teams_microtask ) {
5057             for( f = 1; f < new_nproc; ++f ) {
5058                 // propagate teams construct specific info to workers
5059                 kmp_info_t *thr = team->t.t_threads[f];
5060                 thr->th.th_teams_microtask = master->th.th_teams_microtask;
5061                 thr->th.th_teams_level     = master->th.th_teams_level;
5062                 thr->th.th_teams_size      = master->th.th_teams_size;
5063             }
5064         }
5065 #endif /* OMP_40_ENABLED */
5066 #if KMP_NESTED_HOT_TEAMS
5067         if( level ) {
5068             // Sync barrier state for nested hot teams, not needed for outermost hot team.
5069             for( f = 1; f < new_nproc; ++f ) {
5070                 kmp_info_t *thr = team->t.t_threads[f];
5071                 int b;
5072                 kmp_balign_t * balign = thr->th.th_bar;
5073                 for( b = 0; b < bs_last_barrier; ++ b ) {
5074                     balign[ b ].bb.b_arrived        = team->t.t_bar[ b ].b_arrived;
5075                     KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
5076 #if USE_DEBUGGER
5077                     balign[ b ].bb.b_worker_arrived = team->t.t_bar[ b ].b_team_arrived;
5078 #endif
5079                 }
5080             }
5081         }
5082 #endif // KMP_NESTED_HOT_TEAMS
5083 
5084         /* reallocate space for arguments if necessary */
5085         __kmp_alloc_argv_entries( argc, team, TRUE );
5086         KMP_CHECK_UPDATE(team->t.t_argc, argc);
5087         //
5088         // The hot team re-uses the previous task team,
5089         // if untouched during the previous release->gather phase.
5090         //
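        // (task_team[0] and task_team[1] alternate between barrier phases, indexed by the
        //  master's th_task_state, so one can be set up for the next parallel region while
        //  threads may still be draining the other.)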
5091 
5092         KF_TRACE( 10, ( " hot_team = %p\n", team ) );
5093 
5094 #if KMP_DEBUG
5095         if ( __kmp_tasking_mode != tskm_immediate_exec ) {
5096             KA_TRACE( 20, ("__kmp_allocate_team: hot team task_team[0] = %p task_team[1] = %p after reinit\n",
5097                            team->t.t_task_team[0], team->t.t_task_team[1] ));
5098         }
5099 #endif
5100 
5101 #if OMPT_SUPPORT
5102         __ompt_team_assign_id(team, ompt_parallel_id);
5103 #endif
5104 
5105         KMP_MB();
5106 
5107         return team;
5108     }
5109 
5110     /* next, let's try to take one from the team pool */
5111     KMP_MB();
5112     for( team = (kmp_team_t*) __kmp_team_pool ; (team) ; )
5113     {
5114         /* TODO: consider resizing undersized teams instead of reaping them, now that we have a resizing mechanism */
5115         if ( team->t.t_max_nproc >= max_nproc ) {
5116             /* take this team from the team pool */
5117             __kmp_team_pool = team->t.t_next_pool;
5118 
5119             /* setup the team for fresh use */
5120             __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5121 
5122             KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5123                             &team->t.t_task_team[0], &team->t.t_task_team[1]) );
5124             team->t.t_task_team[0] = NULL;
5125             team->t.t_task_team[1] = NULL;
5126 
5127             /* reallocate space for arguments if necessary */
5128             __kmp_alloc_argv_entries( argc, team, TRUE );
5129             KMP_CHECK_UPDATE(team->t.t_argc, argc);
5130 
5131             KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5132                             team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5133             { // Initialize barrier data.
5134                 int b;
5135                 for ( b = 0; b < bs_last_barrier; ++ b) {
5136                     team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5137 #if USE_DEBUGGER
5138                     team->t.t_bar[ b ].b_master_arrived = 0;
5139                     team->t.t_bar[ b ].b_team_arrived   = 0;
5140 #endif
5141                 }
5142             }
5143 
5144 #if OMP_40_ENABLED
5145             team->t.t_proc_bind = new_proc_bind;
5146 #endif
5147 
5148             KA_TRACE( 20, ("__kmp_allocate_team: using team from pool %d.\n", team->t.t_id ));
5149 
5150 #if OMPT_SUPPORT
5151             __ompt_team_assign_id(team, ompt_parallel_id);
5152 #endif
5153 
5154             KMP_MB();
5155 
5156             return team;
5157         }
5158 
5159         /* reap team if it is too small, then loop back and check the next one */
5160         /* not sure if this is wise, but it will be redone during the hot-teams rewrite. */
5161         /* TODO: Use technique to find the right size hot-team, don't reap them */
5162         team =  __kmp_reap_team( team );
5163         __kmp_team_pool = team;
5164     }
5165 
5166     /* nothing available in the pool, no matter, make a new team! */
5167     KMP_MB();
5168     team = (kmp_team_t*) __kmp_allocate( sizeof( kmp_team_t ) );
5169 
5170     /* and set it up */
5171     team->t.t_max_nproc   = max_nproc;
5172     /* NOTE well, for some reason allocating one big buffer and dividing it
5173      * up seems to really hurt performance a lot on the P4, so let's not use
5174      * this... */
5175     __kmp_allocate_team_arrays( team, max_nproc );
5176 
5177     KA_TRACE( 20, ( "__kmp_allocate_team: making a new team\n" ) );
5178     __kmp_initialize_team( team, new_nproc, new_icvs, NULL );
5179 
5180     KA_TRACE( 20, ( "__kmp_allocate_team: setting task_team[0] %p and task_team[1] %p to NULL\n",
5181                     &team->t.t_task_team[0], &team->t.t_task_team[1] ) );
5182     team->t.t_task_team[0] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5183     team->t.t_task_team[1] = NULL;    // to be removed, as __kmp_allocate zeroes memory, no need to duplicate
5184 
5185     if ( __kmp_storage_map ) {
5186         __kmp_print_team_storage_map( "team", team, team->t.t_id, new_nproc );
5187     }
5188 
5189     /* allocate space for arguments */
5190     __kmp_alloc_argv_entries( argc, team, FALSE );
5191     team->t.t_argc        = argc;
5192 
5193     KA_TRACE( 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n",
5194                     team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE ));
5195     { // Initialize barrier data.
5196         int b;
5197         for ( b = 0; b < bs_last_barrier; ++ b ) {
5198             team->t.t_bar[ b ].b_arrived        = KMP_INIT_BARRIER_STATE;
5199 #if USE_DEBUGGER
5200             team->t.t_bar[ b ].b_master_arrived = 0;
5201             team->t.t_bar[ b ].b_team_arrived   = 0;
5202 #endif
5203         }
5204     }
5205 
5206 #if OMP_40_ENABLED
5207     team->t.t_proc_bind = new_proc_bind;
5208 #endif
5209 
5210 #if OMPT_SUPPORT
5211     __ompt_team_assign_id(team, ompt_parallel_id);
5212     team->t.ompt_serialized_team_info = NULL;
5213 #endif
5214 
5215     KMP_MB();
5216 
5217     KA_TRACE( 20, ("__kmp_allocate_team: done creating a new team %d.\n", team->t.t_id ));
5218 
5219     return team;
5220 }
5221 
5222 /* TODO implement hot-teams at all levels */
5223 /* TODO implement lazy thread release on demand (disband request) */
5224 
5225 /* free the team.  return it to the team pool.  release all the threads
5226  * associated with it */
5227 void
5228 __kmp_free_team( kmp_root_t *root, kmp_team_t *team  USE_NESTED_HOT_ARG(kmp_info_t *master) )
5229 {
5230     int f;
5231     KA_TRACE( 20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), team->t.t_id ));
5232 
5233     /* verify state */
5234     KMP_DEBUG_ASSERT( root );
5235     KMP_DEBUG_ASSERT( team );
5236     KMP_DEBUG_ASSERT( team->t.t_nproc <= team->t.t_max_nproc );
5237     KMP_DEBUG_ASSERT( team->t.t_threads );
5238 
5239     int use_hot_team = team == root->r.r_hot_team;
5240 #if KMP_NESTED_HOT_TEAMS
5241     int level;
5242     kmp_hot_team_ptr_t *hot_teams;
5243     if( master ) {
5244         level = team->t.t_active_level - 1;
5245         if( master->th.th_teams_microtask ) {                         // in teams construct?
5246             if( master->th.th_teams_size.nteams > 1 ) {
5247                ++level; // level was not increased in teams construct for team_of_masters
5248             }
5249             if( team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
5250                 master->th.th_teams_level == team->t.t_level ) {
5251                 ++level; // level was not increased in teams construct for team_of_workers before the parallel
5252             }            // team->t.t_level will be increased inside parallel
5253         }
5254         hot_teams = master->th.th_hot_teams;
5255         if( level < __kmp_hot_teams_max_level ) {
5256             KMP_DEBUG_ASSERT( team == hot_teams[level].hot_team );
5257             use_hot_team = 1;
5258         }
5259     }
5260 #endif // KMP_NESTED_HOT_TEAMS
5261 
5262     /* team is done working */
5263     TCW_SYNC_PTR(team->t.t_pkfn, NULL); // Important for Debugging Support Library.
5264     team->t.t_copyin_counter = 0; // init counter for possible reuse
5265     // Do not reset pointer to parent team to NULL for hot teams.
5266 
5267     /* if we are non-hot team, release our threads */
5268     if( ! use_hot_team ) {
5269         if (__kmp_tasking_mode != tskm_immediate_exec) {
5270             // Wait for threads to reach reapable state
5271             for (f = 1; f < team->t.t_nproc; ++f) {
5272                 KMP_DEBUG_ASSERT(team->t.t_threads[f]);
5273                 kmp_info_t *th = team->t.t_threads[f];
5274                 volatile kmp_uint32 *state = &th->th.th_reap_state;
5275                 while (*state != KMP_SAFE_TO_REAP) {
5276 #if KMP_OS_WINDOWS
5277                     // On Windows a thread can be killed at any time, check this
5278                     DWORD ecode;
5279                     if (!__kmp_is_thread_alive(th, &ecode)) {
5280                         *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread
5281                         break;
5282                     }
5283 #endif
5284                     // first check if thread is sleeping
5285                     kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th);
5286                     if (fl.is_sleeping())
5287                         fl.resume(__kmp_gtid_from_thread(th));
5288                     KMP_CPU_PAUSE();
5289                 }
5290             }
5291 
5292             // Delete task teams
5293             int tt_idx;
5294             for (tt_idx=0; tt_idx<2; ++tt_idx) {
5295                 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx];
5296                 if ( task_team != NULL ) {
5297                     for (f=0; f<team->t.t_nproc; ++f) { // Have all threads unref task teams
5298                         team->t.t_threads[f]->th.th_task_team = NULL;
5299                     }
5300                     KA_TRACE( 20, ( "__kmp_free_team: T#%d deactivating task_team %p on team %d\n", __kmp_get_gtid(), task_team, team->t.t_id ) );
5301 #if KMP_NESTED_HOT_TEAMS
5302                     __kmp_free_task_team( master, task_team );
5303 #endif
5304                     team->t.t_task_team[tt_idx] = NULL;
5305                 }
5306             }
5307         }
5308 
5309         // Reset pointer to parent team only for non-hot teams.
5310         team->t.t_parent = NULL;
5311         team->t.t_level = 0;
5312         team->t.t_active_level = 0;
5313 
5314         /* free the worker threads */
5315         for ( f = 1; f < team->t.t_nproc; ++ f ) {
5316             KMP_DEBUG_ASSERT( team->t.t_threads[ f ] );
5317             __kmp_free_thread( team->t.t_threads[ f ] );
5318             team->t.t_threads[ f ] = NULL;
5319         }
5320 
5321         /* put the team back in the team pool */
5322         /* TODO limit size of team pool, call reap_team if pool too large */
5323         team->t.t_next_pool  = (kmp_team_t*) __kmp_team_pool;
5324         __kmp_team_pool        = (volatile kmp_team_t*) team;
5325     }
5326 
5327     KMP_MB();
5328 }
5329 
5330 
5331 /* reap the team.  destroy it, reclaim all its resources and free its memory */
5332 kmp_team_t *
5333 __kmp_reap_team( kmp_team_t *team )
5334 {
5335     kmp_team_t *next_pool = team->t.t_next_pool;
5336 
5337     KMP_DEBUG_ASSERT( team );
5338     KMP_DEBUG_ASSERT( team->t.t_dispatch    );
5339     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
5340     KMP_DEBUG_ASSERT( team->t.t_threads     );
5341     KMP_DEBUG_ASSERT( team->t.t_argv        );
5342 
5343     /* TODO clean the threads that are a part of this? */
5344 
5345     /* free stuff */
5346 
5347     __kmp_free_team_arrays( team );
5348     if ( team->t.t_argv != &team->t.t_inline_argv[0] )
5349         __kmp_free( (void*) team->t.t_argv );
5350     __kmp_free( team );
5351 
5352     KMP_MB();
5353     return next_pool;
5354 }
5355 
5356 //
5357 // Free the thread.  Don't reap it, just place it on the pool of available
5358 // threads.
5359 //
5360 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5361 // binding for the affinity mechanism to be useful.
5362 //
5363 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5364 // However, we want to avoid a potential performance problem by always
5365 // scanning through the list to find the correct point at which to insert
5366 // the thread (potential N**2 behavior).  To do this we keep track of the
5367 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5368 // With single-level parallelism, threads will always be added to the tail
5369 // of the list, kept track of by __kmp_thread_pool_insert_pt.  With nested
5370 // parallelism, all bets are off and we may need to scan through the entire
5371 // free list.
5372 //
5373 // This change also has a potentially large performance benefit, for some
5374 // applications.  Previously, as threads were freed from the hot team, they
5375 // would be placed back on the free list in inverse order.  If the hot team
// grew back to its original size, then the freed threads would be placed
// back on the hot team in reverse order.  This could cause bad cache
5378 // locality problems on programs where the size of the hot team regularly
5379 // grew and shrunk.
5380 //
// Now, for single-level parallelism, the OMP tid is always == gtid.
5382 //
5383 void
5384 __kmp_free_thread( kmp_info_t *this_th )
5385 {
5386     int gtid;
5387     kmp_info_t **scan;
5388 
5389     KA_TRACE( 20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5390                 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid ));
5391 
5392     KMP_DEBUG_ASSERT( this_th );
5393 
    // When moving a thread to the pool, switch it to waiting on its own b_go flag and to an uninitialized (NULL) team.
5395     int b;
5396     kmp_balign_t *balign = this_th->th.th_bar;
5397     for (b=0; b<bs_last_barrier; ++b) {
5398         if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5399             balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5400         balign[b].bb.team = NULL;
5401         balign[b].bb.leaf_kids = 0;
5402     }
5403     this_th->th.th_task_state = 0;
5404 
5405     /* put thread back on the free pool */
5406     TCW_PTR(this_th->th.th_team, NULL);
5407     TCW_PTR(this_th->th.th_root, NULL);
5408     TCW_PTR(this_th->th.th_dispatch, NULL);               /* NOT NEEDED */
5409 
5410     //
5411     // If the __kmp_thread_pool_insert_pt is already past the new insert
5412     // point, then we need to re-scan the entire list.
5413     //
5414     gtid = this_th->th.th_info.ds.ds_gtid;
5415     if ( __kmp_thread_pool_insert_pt != NULL ) {
5416         KMP_DEBUG_ASSERT( __kmp_thread_pool != NULL );
5417         if ( __kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid ) {
5418              __kmp_thread_pool_insert_pt = NULL;
5419         }
5420     }
5421 
5422     //
5423     // Scan down the list to find the place to insert the thread.
5424     // scan is the address of a link in the list, possibly the address of
5425     // __kmp_thread_pool itself.
5426     //
    // In the absence of nested parallelism, the for loop will have 0 iterations.
5428     //
5429     if ( __kmp_thread_pool_insert_pt != NULL ) {
5430         scan = &( __kmp_thread_pool_insert_pt->th.th_next_pool );
5431     }
5432     else {
5433         scan = (kmp_info_t **)&__kmp_thread_pool;
5434     }
5435     for (; ( *scan != NULL ) && ( (*scan)->th.th_info.ds.ds_gtid < gtid );
5436       scan = &( (*scan)->th.th_next_pool ) );
5437 
5438     //
5439     // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
5440     // to its address.
5441     //
5442     TCW_PTR(this_th->th.th_next_pool, *scan);
5443     __kmp_thread_pool_insert_pt = *scan = this_th;
5444     KMP_DEBUG_ASSERT( ( this_th->th.th_next_pool == NULL )
5445       || ( this_th->th.th_info.ds.ds_gtid
5446       < this_th->th.th_next_pool->th.th_info.ds.ds_gtid ) );
5447     TCW_4(this_th->th.th_in_pool, TRUE);
5448     __kmp_thread_pool_nth++;
5449 
5450     TCW_4(__kmp_nth, __kmp_nth - 1);
5451 
5452 #ifdef KMP_ADJUST_BLOCKTIME
5453     /* Adjust blocktime back to user setting or default if necessary */
5454     /* Middle initialization might never have occurred                */
5455     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5456         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5457         if ( __kmp_nth <= __kmp_avail_proc ) {
5458             __kmp_zero_bt = FALSE;
5459         }
5460     }
5461 #endif /* KMP_ADJUST_BLOCKTIME */
5462 
5463     KMP_MB();
5464 }
5465 
5466 
5467 /* ------------------------------------------------------------------------ */
5468 
5469 void *
5470 __kmp_launch_thread( kmp_info_t *this_thr )
5471 {
5472     int                   gtid = this_thr->th.th_info.ds.ds_gtid;
5473 /*    void                 *stack_data;*/
5474     kmp_team_t *(*volatile pteam);
5475 
5476     KMP_MB();
5477     KA_TRACE( 10, ("__kmp_launch_thread: T#%d start\n", gtid ) );
5478 
5479     if( __kmp_env_consistency_check ) {
5480         this_thr->th.th_cons = __kmp_allocate_cons_stack( gtid );  // ATT: Memory leak?
5481     }
5482 
5483 #if OMPT_SUPPORT
5484     if (ompt_enabled) {
5485         this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5486         this_thr->th.ompt_thread_info.wait_id = 0;
5487         this_thr->th.ompt_thread_info.idle_frame = __builtin_frame_address(0);
5488         if (ompt_callbacks.ompt_callback(ompt_event_thread_begin)) {
5489             __ompt_thread_begin(ompt_thread_worker, gtid);
5490         }
5491     }
5492 #endif
5493 
5494     /* This is the place where threads wait for work */
5495     while( ! TCR_4(__kmp_global.g.g_done) ) {
5496         KMP_DEBUG_ASSERT( this_thr == __kmp_threads[ gtid ] );
5497         KMP_MB();
5498 
5499         /* wait for work to do */
5500         KA_TRACE( 20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid ));
5501 
5502 #if OMPT_SUPPORT
5503         if (ompt_enabled) {
5504             this_thr->th.ompt_thread_info.state = ompt_state_idle;
5505         }
5506 #endif
5507 
5508         /* No tid yet since not part of a team */
5509         __kmp_fork_barrier( gtid, KMP_GTID_DNE );
5510 
5511 #if OMPT_SUPPORT
5512         if (ompt_enabled) {
5513             this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5514         }
5515 #endif
5516 
5517         pteam = (kmp_team_t *(*))(& this_thr->th.th_team);
5518 
5519         /* have we been allocated? */
5520         if ( TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done) ) {
5521 #if OMPT_SUPPORT
5522             ompt_task_info_t *task_info;
5523             ompt_parallel_id_t my_parallel_id;
5524             if (ompt_enabled) {
5525                 task_info = __ompt_get_taskinfo(0);
5526                 my_parallel_id = (*pteam)->t.ompt_team_info.parallel_id;
5527             }
5528 #endif
5529             /* we were just woken up, so run our new task */
5530             if ( TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL ) {
5531                 int rc;
5532                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n",
5533                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5534 
5535                 updateHWFPControl (*pteam);
5536 
5537 #if OMPT_SUPPORT
5538                 if (ompt_enabled) {
5539                     this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
5540                     // Initialize OMPT task id for implicit task.
5541                     int tid = __kmp_tid_from_gtid(gtid);
5542                     task_info->task_id = __ompt_task_id_new(tid);
5543                 }
5544 #endif
5545 
5546                 {
5547                     KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
5548                     KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
5549                     rc = (*pteam)->t.t_invoke( gtid );
5550                 }
5551                 KMP_ASSERT( rc );
5552 
5553 #if OMPT_SUPPORT
5554                 if (ompt_enabled) {
5555                     /* no frame set while outside task */
5556                     task_info->frame.exit_runtime_frame = NULL;
5557 
5558                     this_thr->th.ompt_thread_info.state = ompt_state_overhead;
5559                 }
5560 #endif
5561                 KMP_MB();
5562                 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n",
5563                               gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), (*pteam)->t.t_pkfn));
5564             }
5565             /* join barrier after parallel region */
5566             __kmp_join_barrier( gtid );
5567 #if OMPT_SUPPORT && OMPT_TRACE
5568             if (ompt_enabled) {
5569                 if (ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)) {
5570                     // don't access *pteam here: it may have already been freed
5571                     // by the master thread behind the barrier (possible race)
5572                     ompt_callbacks.ompt_callback(ompt_event_implicit_task_end)(
5573                         my_parallel_id, task_info->task_id);
5574                 }
5575                 task_info->frame.exit_runtime_frame = NULL;
5576                 task_info->task_id = 0;
5577             }
5578 #endif
5579         }
5580     }
5581     TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done);
5582 
5583 #if OMPT_SUPPORT
5584     if (ompt_enabled &&
5585         ompt_callbacks.ompt_callback(ompt_event_thread_end)) {
5586         __ompt_thread_end(ompt_thread_worker, gtid);
5587     }
5588 #endif
5589 
5590     this_thr->th.th_task_team = NULL;
5591     /* run the destructors for the threadprivate data for this thread */
5592     __kmp_common_destroy_gtid( gtid );
5593 
5594     KA_TRACE( 10, ("__kmp_launch_thread: T#%d done\n", gtid ) );
5595     KMP_MB();
5596     return this_thr;
5597 }
5598 
5599 /* ------------------------------------------------------------------------ */
5600 /* ------------------------------------------------------------------------ */
5601 
5602 void
5603 __kmp_internal_end_dest( void *specific_gtid )
5604 {
5605     #if KMP_COMPILER_ICC
5606         #pragma warning( push )
5607         #pragma warning( disable:  810 ) // conversion from "void *" to "int" may lose significant bits
5608     #endif
5609     // Make sure no significant bits are lost
5610     int gtid = (kmp_intptr_t)specific_gtid - 1;
5611     #if KMP_COMPILER_ICC
5612         #pragma warning( pop )
5613     #endif
5614 
5615     KA_TRACE( 30, ("__kmp_internal_end_dest: T#%d\n", gtid));
    /* NOTE: the gtid is stored as gtid+1 in the thread-local storage;
     * this is because 0 is reserved for the nothing-stored case */
5618 
5619     /* josh: One reason for setting the gtid specific data even when it is being
5620        destroyed by pthread is to allow gtid lookup through thread specific data
5621        (__kmp_gtid_get_specific).  Some of the code, especially stat code,
5622        that gets executed in the call to __kmp_internal_end_thread, actually
5623        gets the gtid through the thread specific data.  Setting it here seems
5624        rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread
5625        to run smoothly.
5626        todo: get rid of this after we remove the dependence on
5627        __kmp_gtid_get_specific
5628     */
5629     if(gtid >= 0 && KMP_UBER_GTID(gtid))
5630         __kmp_gtid_set_specific( gtid );
5631     #ifdef KMP_TDATA_GTID
5632         __kmp_gtid = gtid;
5633     #endif
5634     __kmp_internal_end_thread( gtid );
5635 }
5636 
5637 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB
5638 
// 2009-09-08 (lev): It looks like the destructor does not work. In simple test cases destructors
// work perfectly, but in the real libomp.so I have no evidence it is ever called. However, the
// -fini linker option in makefile.mk works fine.
5642 
5643 __attribute__(( destructor ))
5644 void
5645 __kmp_internal_end_dtor( void )
5646 {
5647     __kmp_internal_end_atexit();
5648 }
5649 
5650 void
5651 __kmp_internal_end_fini( void )
5652 {
5653     __kmp_internal_end_atexit();
5654 }
5655 
5656 #endif
5657 
5658 /* [Windows] josh: when the atexit handler is called, there may still be more than one thread alive */
5659 void
5660 __kmp_internal_end_atexit( void )
5661 {
5662     KA_TRACE( 30, ( "__kmp_internal_end_atexit\n" ) );
5663     /* [Windows]
       josh: ideally, we want to completely shut down the library in this atexit handler, but
5665        stat code that depends on thread specific data for gtid fails because that data becomes
5666        unavailable at some point during the shutdown, so we call __kmp_internal_end_thread
5667        instead.  We should eventually remove the dependency on __kmp_get_specific_gtid in the
       stat code and use __kmp_internal_end_library to cleanly shut down the library.
5669 
5670 // TODO: Can some of this comment about GVS be removed?
5671        I suspect that the offending stat code is executed when the calling thread tries to
5672        clean up a dead root thread's data structures, resulting in GVS code trying to close
5673        the GVS structures for that thread, but since the stat code uses
5674        __kmp_get_specific_gtid to get the gtid with the assumption that the calling thread is
5675        cleaning up itself instead of another thread, it gets confused.  This happens because
5676        allowing a thread to unregister and cleanup another thread is a recent modification for
5677        addressing an issue with Maxon Cinema4D.  Based on the current design (20050722), a
5678        thread may end up trying to unregister another thread only if thread death does not
5679        trigger the calling of __kmp_internal_end_thread.  For Linux* OS, there is the thread
5680        specific data destructor function to detect thread death.  For Windows dynamic, there
5681        is DllMain(THREAD_DETACH).  For Windows static, there is nothing.  Thus, the
5682        workaround is applicable only for Windows static stat library.
5683     */
5684     __kmp_internal_end_library( -1 );
5685     #if KMP_OS_WINDOWS
5686         __kmp_close_console();
5687     #endif
5688 }
5689 
5690 static void
5691 __kmp_reap_thread(
5692     kmp_info_t * thread,
5693     int is_root
5694 ) {
5695 
5696     // It is assumed __kmp_forkjoin_lock is acquired.
5697 
5698     int gtid;
5699 
5700     KMP_DEBUG_ASSERT( thread != NULL );
5701 
5702     gtid = thread->th.th_info.ds.ds_gtid;
5703 
5704     if ( ! is_root ) {
5705 
5706         if ( __kmp_dflt_blocktime != KMP_MAX_BLOCKTIME ) {
5707             /* Assume the threads are at the fork barrier here */
5708             KA_TRACE( 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", gtid ) );
5709             /* Need release fence here to prevent seg faults for tree forkjoin barrier (GEH) */
5710             ANNOTATE_HAPPENS_BEFORE(thread);
5711             kmp_flag_64 flag(&thread->th.th_bar[ bs_forkjoin_barrier ].bb.b_go, thread);
5712             __kmp_release_64(&flag);
5713         }; // if
5714 
5715         // Terminate OS thread.
5716         __kmp_reap_worker( thread );
5717 
5718         //
5719         // The thread was killed asynchronously.  If it was actively
5720         // spinning in the thread pool, decrement the global count.
5721         //
5722         // There is a small timing hole here - if the worker thread was
        // just waking up after sleeping in the pool, had reset its
5724         // th_active_in_pool flag but not decremented the global counter
5725         // __kmp_thread_pool_active_nth yet, then the global counter
5726         // might not get updated.
5727         //
5728         // Currently, this can only happen as the library is unloaded,
5729         // so there are no harmful side effects.
5730         //
5731         if ( thread->th.th_active_in_pool ) {
5732             thread->th.th_active_in_pool = FALSE;
5733             KMP_TEST_THEN_DEC32(
5734               (kmp_int32 *) &__kmp_thread_pool_active_nth );
5735             KMP_DEBUG_ASSERT( TCR_4(__kmp_thread_pool_active_nth) >= 0 );
5736         }
5737 
5738         // Decrement # of [worker] threads in the pool.
5739         KMP_DEBUG_ASSERT( __kmp_thread_pool_nth > 0 );
5740         --__kmp_thread_pool_nth;
5741     }; // if
5742 
5743     __kmp_free_implicit_task(thread);
5744 
5745     // Free the fast memory for tasking
5746     #if USE_FAST_MEMORY
5747         __kmp_free_fast_memory( thread );
5748     #endif /* USE_FAST_MEMORY */
5749 
5750     __kmp_suspend_uninitialize_thread( thread );
5751 
5752     KMP_DEBUG_ASSERT( __kmp_threads[ gtid ] == thread );
5753     TCW_SYNC_PTR(__kmp_threads[gtid], NULL);
5754 
5755     -- __kmp_all_nth;
5756     // __kmp_nth was decremented when thread is added to the pool.
5757 
5758 #ifdef KMP_ADJUST_BLOCKTIME
5759     /* Adjust blocktime back to user setting or default if necessary */
5760     /* Middle initialization might never have occurred                */
5761     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
5762         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
5763         if ( __kmp_nth <= __kmp_avail_proc ) {
5764             __kmp_zero_bt = FALSE;
5765         }
5766     }
5767 #endif /* KMP_ADJUST_BLOCKTIME */
5768 
5769     /* free the memory being used */
5770     if( __kmp_env_consistency_check ) {
5771         if ( thread->th.th_cons ) {
5772             __kmp_free_cons_stack( thread->th.th_cons );
5773             thread->th.th_cons = NULL;
5774         }; // if
5775     }
5776 
5777     if ( thread->th.th_pri_common != NULL ) {
5778         __kmp_free( thread->th.th_pri_common );
5779         thread->th.th_pri_common = NULL;
5780     }; // if
5781 
5782     if (thread->th.th_task_state_memo_stack != NULL) {
5783         __kmp_free(thread->th.th_task_state_memo_stack);
5784         thread->th.th_task_state_memo_stack = NULL;
5785     }
5786 
5787     #if KMP_USE_BGET
5788         if ( thread->th.th_local.bget_data != NULL ) {
5789             __kmp_finalize_bget( thread );
5790         }; // if
5791     #endif
5792 
5793 #if KMP_AFFINITY_SUPPORTED
5794     if ( thread->th.th_affin_mask != NULL ) {
5795         KMP_CPU_FREE( thread->th.th_affin_mask );
5796         thread->th.th_affin_mask = NULL;
5797     }; // if
5798 #endif /* KMP_AFFINITY_SUPPORTED */
5799 
5800     __kmp_reap_team( thread->th.th_serial_team );
5801     thread->th.th_serial_team = NULL;
5802     __kmp_free( thread );
5803 
5804     KMP_MB();
5805 
5806 } // __kmp_reap_thread
5807 
5808 static void
5809 __kmp_internal_end(void)
5810 {
5811     int i;
5812 
5813     /* First, unregister the library */
5814     __kmp_unregister_library();
5815 
5816     #if KMP_OS_WINDOWS
5817         /* In Win static library, we can't tell when a root actually dies, so we
5818            reclaim the data structures for any root threads that have died but not
5819            unregistered themselves, in order to shut down cleanly.
5820            In Win dynamic library we also can't tell when a thread dies.
5821         */
5822         __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of dead roots
5823     #endif
5824 
5825     for( i=0 ; i<__kmp_threads_capacity ; i++ )
5826         if( __kmp_root[i] )
5827             if( __kmp_root[i]->r.r_active )
5828                 break;
5829     KMP_MB();       /* Flush all pending memory write invalidates.  */
5830     TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5831 
5832     if ( i < __kmp_threads_capacity ) {
5833 #if KMP_USE_MONITOR
5834         // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor??
5835         KMP_MB();       /* Flush all pending memory write invalidates.  */
5836 
5837         //
5838         // Need to check that monitor was initialized before reaping it.
        // If we are called from __kmp_atfork_child (which sets
5840         // __kmp_init_parallel = 0), then __kmp_monitor will appear to
5841         // contain valid data, but it is only valid in the parent process,
5842         // not the child.
5843         //
5844         // New behavior (201008): instead of keying off of the flag
5845         // __kmp_init_parallel, the monitor thread creation is keyed off
5846         // of the new flag __kmp_init_monitor.
5847         //
5848         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5849         if ( TCR_4( __kmp_init_monitor ) ) {
5850             __kmp_reap_monitor( & __kmp_monitor );
5851             TCW_4( __kmp_init_monitor, 0 );
5852         }
5853         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5854         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5855 #endif // KMP_USE_MONITOR
5856     } else {
5857         /* TODO move this to cleanup code */
5858         #ifdef KMP_DEBUG
5859             /* make sure that everything has properly ended */
5860             for ( i = 0; i < __kmp_threads_capacity; i++ ) {
5861                 if( __kmp_root[i] ) {
5862 //                    KMP_ASSERT( ! KMP_UBER_GTID( i ) );         // AC: there can be uber threads alive here
5863                     KMP_ASSERT( ! __kmp_root[i]->r.r_active );  // TODO: can they be active?
5864                 }
5865             }
5866         #endif
5867 
5868         KMP_MB();
5869 
5870         // Reap the worker threads.
5871         // This is valid for now, but be careful if threads are reaped sooner.
        while ( __kmp_thread_pool != NULL ) {    // Loop thru all the threads in the pool.
5873             // Get the next thread from the pool.
5874             kmp_info_t * thread = (kmp_info_t *) __kmp_thread_pool;
5875             __kmp_thread_pool = thread->th.th_next_pool;
5876             // Reap it.
5877             KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
5878             thread->th.th_next_pool = NULL;
5879             thread->th.th_in_pool = FALSE;
5880             __kmp_reap_thread( thread, 0 );
5881         }; // while
5882         __kmp_thread_pool_insert_pt = NULL;
5883 
5884         // Reap teams.
5885         while ( __kmp_team_pool != NULL ) {     // Loop thru all the teams in the pool.
5886             // Get the next team from the pool.
5887             kmp_team_t * team = (kmp_team_t *) __kmp_team_pool;
5888             __kmp_team_pool = team->t.t_next_pool;
5889             // Reap it.
5890             team->t.t_next_pool = NULL;
5891             __kmp_reap_team( team );
5892         }; // while
5893 
5894         __kmp_reap_task_teams( );
5895 
5896         for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
5897             // TBD: Add some checking...
5898             // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
5899         }
5900 
5901         /* Make sure all threadprivate destructors get run by joining with all worker
5902            threads before resetting this flag */
5903         TCW_SYNC_4(__kmp_init_common, FALSE);
5904 
5905         KA_TRACE( 10, ("__kmp_internal_end: all workers reaped\n" ) );
5906         KMP_MB();
5907 
5908 #if KMP_USE_MONITOR
5909         //
5910         // See note above: One of the possible fixes for CQ138434 / CQ140126
5911         //
5912         // FIXME: push both code fragments down and CSE them?
5913         // push them into __kmp_cleanup() ?
5914         //
5915         __kmp_acquire_bootstrap_lock( & __kmp_monitor_lock );
5916         if ( TCR_4( __kmp_init_monitor ) ) {
5917             __kmp_reap_monitor( & __kmp_monitor );
5918             TCW_4( __kmp_init_monitor, 0 );
5919         }
5920         __kmp_release_bootstrap_lock( & __kmp_monitor_lock );
5921         KA_TRACE( 10, ("__kmp_internal_end: monitor reaped\n" ) );
5922 #endif
5923     } /* else !__kmp_global.t_active */
5924     TCW_4(__kmp_init_gtid, FALSE);
5925     KMP_MB();       /* Flush all pending memory write invalidates.  */
5926 
5927     __kmp_cleanup();
5928 #if OMPT_SUPPORT
5929     ompt_fini();
5930 #endif
5931 }
5932 
5933 void
5934 __kmp_internal_end_library( int gtid_req )
5935 {
5936     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
5937     /* this shouldn't be a race condition because __kmp_internal_end() is the
5938      * only place to clear __kmp_serial_init */
5939     /* we'll check this later too, after we get the lock */
    // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
5941     // because the next check will work in any case.
5942     if( __kmp_global.g.g_abort ) {
5943         KA_TRACE( 11, ("__kmp_internal_end_library: abort, exiting\n" ));
5944         /* TODO abort? */
5945         return;
5946     }
5947     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
5948         KA_TRACE( 10, ("__kmp_internal_end_library: already finished\n" ));
5949         return;
5950     }
5951 
5952 
5953     KMP_MB();       /* Flush all pending memory write invalidates.  */
5954 
5955     /* find out who we are and what we should do */
5956     {
5957         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
5958         KA_TRACE( 10, ("__kmp_internal_end_library: enter T#%d  (%d)\n", gtid, gtid_req ));
5959         if( gtid == KMP_GTID_SHUTDOWN ) {
5960             KA_TRACE( 10, ("__kmp_internal_end_library: !__kmp_init_runtime, system already shutdown\n" ));
5961             return;
5962         } else if( gtid == KMP_GTID_MONITOR ) {
5963             KA_TRACE( 10, ("__kmp_internal_end_library: monitor thread, gtid not registered, or system shutdown\n" ));
5964             return;
5965         } else if( gtid == KMP_GTID_DNE ) {
5966             KA_TRACE( 10, ("__kmp_internal_end_library: gtid not registered or system shutdown\n" ));
5967             /* we don't know who we are, but we may still shutdown the library */
5968         } else if( KMP_UBER_GTID( gtid )) {
5969             /* unregister ourselves as an uber thread.  gtid is no longer valid */
5970             if( __kmp_root[gtid]->r.r_active ) {
5971                 __kmp_global.g.g_abort = -1;
5972                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
5973                 KA_TRACE( 10, ("__kmp_internal_end_library: root still active, abort T#%d\n", gtid ));
5974                 return;
5975             } else {
5976                 KA_TRACE( 10, ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid ));
5977                 __kmp_unregister_root_current_thread( gtid );
5978             }
5979         } else {
5980             /* worker threads may call this function through the atexit handler, if they call exit() */
5981             /* For now, skip the usual subsequent processing and just dump the debug buffer.
5982                TODO: do a thorough shutdown instead
5983             */
5984             #ifdef DUMP_DEBUG_ON_EXIT
5985                 if ( __kmp_debug_buf )
5986                     __kmp_dump_debug_buffer( );
5987             #endif
5988             return;
5989         }
5990     }
5991     /* synchronize the termination process */
5992     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
5993 
5994     /* have we already finished */
5995     if( __kmp_global.g.g_abort ) {
5996         KA_TRACE( 10, ("__kmp_internal_end_library: abort, exiting\n" ));
5997         /* TODO abort? */
5998         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
5999         return;
6000     }
6001     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6002         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6003         return;
6004     }
6005 
    /* We need this lock to enforce mutual exclusion between this reading of
6007        __kmp_threads_capacity and the writing by __kmp_register_root.
6008        Alternatively, we can use a counter of roots that is
6009        atomically updated by __kmp_get_global_thread_id_reg,
6010        __kmp_do_serial_initialize and __kmp_internal_end_*.
6011     */
6012     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6013 
6014     /* now we can safely conduct the actual termination */
6015     __kmp_internal_end();
6016 
6017     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6018     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6019 
6020     KA_TRACE( 10, ("__kmp_internal_end_library: exit\n" ) );
6021 
6022     #ifdef DUMP_DEBUG_ON_EXIT
6023         if ( __kmp_debug_buf )
6024             __kmp_dump_debug_buffer();
6025     #endif
6026 
6027     #if KMP_OS_WINDOWS
6028         __kmp_close_console();
6029     #endif
6030 
6031     __kmp_fini_allocator();
6032 
6033 } // __kmp_internal_end_library
6034 
6035 void
6036 __kmp_internal_end_thread( int gtid_req )
6037 {
6038     int i;
6039 
6040     /* if we have already cleaned up, don't try again, it wouldn't be pretty */
6041     /* this shouldn't be a race condition because __kmp_internal_end() is the
6042      * only place to clear __kmp_serial_init */
6043     /* we'll check this later too, after we get the lock */
6044     // 2009-09-06: We do not set g_abort without setting g_done. This check looks redundant,
6045     // because the next check will work in any case.
6046     if( __kmp_global.g.g_abort ) {
6047         KA_TRACE( 11, ("__kmp_internal_end_thread: abort, exiting\n" ));
6048         /* TODO abort? */
6049         return;
6050     }
6051     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6052         KA_TRACE( 10, ("__kmp_internal_end_thread: already finished\n" ));
6053         return;
6054     }
6055 
6056     KMP_MB();       /* Flush all pending memory write invalidates.  */
6057 
6058     /* find out who we are and what we should do */
6059     {
6060         int gtid = (gtid_req>=0) ? gtid_req : __kmp_gtid_get_specific();
6061         KA_TRACE( 10, ("__kmp_internal_end_thread: enter T#%d  (%d)\n", gtid, gtid_req ));
6062         if( gtid == KMP_GTID_SHUTDOWN ) {
6063             KA_TRACE( 10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system already shutdown\n" ));
6064             return;
6065         } else if( gtid == KMP_GTID_MONITOR ) {
6066             KA_TRACE( 10, ("__kmp_internal_end_thread: monitor thread, gtid not registered, or system shutdown\n" ));
6067             return;
6068         } else if( gtid == KMP_GTID_DNE ) {
6069             KA_TRACE( 10, ("__kmp_internal_end_thread: gtid not registered or system shutdown\n" ));
6070             return;
6071             /* we don't know who we are */
6072         } else if( KMP_UBER_GTID( gtid )) {
6073         /* unregister ourselves as an uber thread.  gtid is no longer valid */
6074             if( __kmp_root[gtid]->r.r_active ) {
6075                 __kmp_global.g.g_abort = -1;
6076                 TCW_SYNC_4(__kmp_global.g.g_done, TRUE);
6077                 KA_TRACE( 10, ("__kmp_internal_end_thread: root still active, abort T#%d\n", gtid ));
6078                 return;
6079             } else {
6080                 KA_TRACE( 10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", gtid ));
6081                 __kmp_unregister_root_current_thread( gtid );
6082             }
6083         } else {
6084             /* just a worker thread, let's leave */
6085             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid ));
6086 
6087             if ( gtid >= 0 ) {
6088                 __kmp_threads[gtid]->th.th_task_team = NULL;
6089             }
6090 
6091             KA_TRACE( 10, ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", gtid ));
6092             return;
6093         }
6094     }
6095     #if defined KMP_DYNAMIC_LIB
    // AC: let's not shut down the Linux* OS dynamic library at the exit of an uber thread,
    //     because it is better to shut down later, in the library destructor.
    //     The reason for this change is a performance problem when a non-OpenMP thread
    //     in a loop forks and joins many OpenMP threads. We can save a lot of time
    //     keeping worker threads alive until the program shuts down.
6101     // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) and
6102     //     Windows(DPD200287443) that occurs when using critical sections from foreign threads.
6103         KA_TRACE( 10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req) );
6104         return;
6105     #endif
6106     /* synchronize the termination process */
6107     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6108 
6109     /* have we already finished */
6110     if( __kmp_global.g.g_abort ) {
6111         KA_TRACE( 10, ("__kmp_internal_end_thread: abort, exiting\n" ));
6112         /* TODO abort? */
6113         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6114         return;
6115     }
6116     if( TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial ) {
6117         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6118         return;
6119     }
6120 
    /* We need this lock to enforce mutual exclusion between this reading of
6122        __kmp_threads_capacity and the writing by __kmp_register_root.
6123        Alternatively, we can use a counter of roots that is
6124        atomically updated by __kmp_get_global_thread_id_reg,
6125        __kmp_do_serial_initialize and __kmp_internal_end_*.
6126     */
6127 
6128     /* should we finish the run-time?  are all siblings done? */
6129     __kmp_acquire_bootstrap_lock( &__kmp_forkjoin_lock );
6130 
6131     for ( i = 0; i < __kmp_threads_capacity; ++ i ) {
6132         if ( KMP_UBER_GTID( i ) ) {
6133             KA_TRACE( 10, ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i ));
6134             __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6135             __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6136             return;
6137         };
6138     }
6139 
6140     /* now we can safely conduct the actual termination */
6141 
6142     __kmp_internal_end();
6143 
6144     __kmp_release_bootstrap_lock( &__kmp_forkjoin_lock );
6145     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6146 
6147     KA_TRACE( 10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req ) );
6148 
6149     #ifdef DUMP_DEBUG_ON_EXIT
6150         if ( __kmp_debug_buf )
6151             __kmp_dump_debug_buffer();
6152     #endif
6153 } // __kmp_internal_end_thread
6154 
6155 // -------------------------------------------------------------------------------------------------
6156 // Library registration stuff.
6157 
6158 static long   __kmp_registration_flag = 0;
6159     // Random value used to indicate library initialization.
6160 static char * __kmp_registration_str  = NULL;
6161     // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>.
6162 
6163 
6164 static inline
6165 char *
6166 __kmp_reg_status_name() {
6167     /*
6168         On RHEL 3u5 if linked statically, getpid() returns different values in each thread.
6169         If registration and unregistration go in different threads (omp_misc_other_root_exit.cpp test case),
        the name of the registered_lib_env env var cannot be found, because the name will contain a different pid.
6171     */
6172     return __kmp_str_format( "__KMP_REGISTERED_LIB_%d", (int) getpid() );
} // __kmp_reg_status_name
6174 
6175 
6176 void
6177 __kmp_register_library_startup(
6178     void
6179 ) {
6180 
6181     char * name   = __kmp_reg_status_name();  // Name of the environment variable.
6182     int    done   = 0;
6183     union {
6184         double dtime;
6185         long   ltime;
6186     } time;
6187     #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6188         __kmp_initialize_system_tick();
6189     #endif
6190     __kmp_read_system_time( & time.dtime );
6191     __kmp_registration_flag = 0xCAFE0000L | ( time.ltime & 0x0000FFFFL );
6192     __kmp_registration_str =
6193         __kmp_str_format(
6194             "%p-%lx-%s",
6195             & __kmp_registration_flag,
6196             __kmp_registration_flag,
6197             KMP_LIBRARY_FILE
6198         );
6199 
6200     KA_TRACE( 50, ( "__kmp_register_library_startup: %s=\"%s\"\n", name, __kmp_registration_str ) );
6201 
6202     while ( ! done ) {
6203 
6204         char * value  = NULL; // Actual value of the environment variable.
6205 
        // Set the environment variable, but do not overwrite it if it already exists.
        __kmp_env_set( name, __kmp_registration_str, 0 );
        // Check whether the variable was actually written.
6209         value = __kmp_env_get( name );
6210         if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6211 
6212             done = 1;    // Ok, environment variable set successfully, exit the loop.
6213 
6214         } else {
6215 
6216             // Oops. Write failed. Another copy of OpenMP RTL is in memory.
            // Check whether it is alive or dead.
6218             int    neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6219             char * tail          = value;
6220             char * flag_addr_str = NULL;
6221             char * flag_val_str  = NULL;
6222             char const * file_name     = NULL;
6223             __kmp_str_split( tail, '-', & flag_addr_str, & tail );
6224             __kmp_str_split( tail, '-', & flag_val_str,  & tail );
6225             file_name = tail;
6226             if ( tail != NULL ) {
6227                 long * flag_addr = 0;
6228                 long   flag_val  = 0;
6229                 KMP_SSCANF( flag_addr_str, "%p",  & flag_addr );
6230                 KMP_SSCANF( flag_val_str,  "%lx", & flag_val  );
6231                 if ( flag_addr != 0 && flag_val != 0 && strcmp( file_name, "" ) != 0 ) {
6232                     // First, check whether environment-encoded address is mapped into addr space.
6233                     // If so, dereference it to see if it still has the right value.
6234 
6235                     if ( __kmp_is_address_mapped( flag_addr ) && * flag_addr == flag_val ) {
6236                         neighbor = 1;
6237                     } else {
6238                         // If not, then we know the other copy of the library is no longer running.
6239                         neighbor = 2;
6240                     }; // if
6241                 }; // if
6242             }; // if
6243             switch ( neighbor ) {
6244                 case 0 :      // Cannot parse environment variable -- neighbor status unknown.
                    // Assume it is an incompatible format from a future version of the library.
6246                     // Assume the other library is alive.
6247                     // WARN( ... ); // TODO: Issue a warning.
6248                     file_name = "unknown library";
                    // Attention! Falling through to the next case. That's intentional.
6250                 case 1 : {    // Neighbor is alive.
                    // Check whether this is allowed.
6252                     char * duplicate_ok = __kmp_env_get( "KMP_DUPLICATE_LIB_OK" );
6253                     if ( ! __kmp_str_match_true( duplicate_ok ) ) {
6254                         // That's not allowed. Issue fatal error.
6255                         __kmp_msg(
6256                             kmp_ms_fatal,
6257                             KMP_MSG( DuplicateLibrary, KMP_LIBRARY_FILE, file_name ),
6258                             KMP_HNT( DuplicateLibrary ),
6259                             __kmp_msg_null
6260                         );
6261                     }; // if
6262                     KMP_INTERNAL_FREE( duplicate_ok );
6263                     __kmp_duplicate_library_ok = 1;
6264                     done = 1;    // Exit the loop.
6265                 } break;
6266                 case 2 : {    // Neighbor is dead.
6267                     // Clear the variable and try to register library again.
6268                     __kmp_env_unset( name );
6269                 }  break;
6270                 default : {
6271                     KMP_DEBUG_ASSERT( 0 );
6272                 } break;
6273             }; // switch
6274 
6275         }; // if
6276         KMP_INTERNAL_FREE( (void *) value );
6277 
6278     }; // while
6279     KMP_INTERNAL_FREE( (void *) name );
6280 
6281 } // func __kmp_register_library_startup
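
/*
 * Illustrative sketch (kept under "#if 0", never compiled): the "%p-%lx-%s"
 * registration value written above, and the liveness check a second copy of
 * the runtime performs when the variable is already set.  All names below are
 * hypothetical; the authoritative logic is __kmp_register_library_startup().
 */
#if 0
#include <stdio.h>

static long example_flag;    // stands in for __kmp_registration_flag

// Compose a value of the same shape: "<flag address>-<flag value in hex>-<library file>".
static void
example_compose( char *buf, size_t size )
{
    snprintf( buf, size, "%p-%lx-%s", (void *)&example_flag, example_flag, "libexample.so" );
}

// Classify a previously stored value: 1 = neighbor alive, 2 = neighbor dead,
// 0 = unparseable (treated as "alive" by the code above).
static int
example_classify( char const *value )
{
    long *flag_addr = NULL;
    long  flag_val  = 0;
    char  file_name[ 256 ];

    // Simplification: the runtime splits on the first two '-' characters with
    // __kmp_str_split(), so the library file name may itself contain '-'.
    if ( sscanf( value, "%p-%lx-%255s", (void **)&flag_addr, &flag_val, file_name ) != 3 ) {
        return 0;
    }
    // The real code additionally verifies the address with __kmp_is_address_mapped()
    // before dereferencing it.
    return ( flag_addr != NULL && *flag_addr == flag_val ) ? 1 : 2;
}
#endif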
6282 
6283 
6284 void
6285 __kmp_unregister_library( void ) {
6286 
6287     char * name  = __kmp_reg_status_name();
6288     char * value = __kmp_env_get( name );
6289 
6290     KMP_DEBUG_ASSERT( __kmp_registration_flag != 0 );
6291     KMP_DEBUG_ASSERT( __kmp_registration_str  != NULL );
6292     if ( value != NULL && strcmp( value, __kmp_registration_str ) == 0 ) {
6293         // Ok, this is our variable. Delete it.
6294         __kmp_env_unset( name );
6295     }; // if
6296 
6297     KMP_INTERNAL_FREE( __kmp_registration_str );
6298     KMP_INTERNAL_FREE( value );
6299     KMP_INTERNAL_FREE( name );
6300 
6301     __kmp_registration_flag = 0;
6302     __kmp_registration_str  = NULL;
6303 
6304 } // __kmp_unregister_library
6305 
6306 
6307 // End of Library registration stuff.
6308 // -------------------------------------------------------------------------------------------------
6309 
6310 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6311 
6312 static void __kmp_check_mic_type()
6313 {
6314     kmp_cpuid_t cpuid_state = {0};
6315     kmp_cpuid_t * cs_p = &cpuid_state;
6316     __kmp_x86_cpuid(1, 0, cs_p);
6317     // We don't support mic1 at the moment
6318     if( (cs_p->eax & 0xff0) == 0xB10 ) {
6319         __kmp_mic_type = mic2;
6320     } else if( (cs_p->eax & 0xf0ff0) == 0x50670 ) {
6321         __kmp_mic_type = mic3;
6322     } else {
6323         __kmp_mic_type = non_mic;
6324     }
6325 }
6326 
6327 #endif /* KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS) */
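
/*
 * Illustrative sketch (kept under "#if 0", never compiled): how the masks
 * above relate to the CPUID leaf-1 signature layout.  EAX packs stepping
 * (bits 3:0), model (7:4), family (11:8) and extended model (19:16); for
 * family 6 the display model is (ext_model << 4) | model.  The helper below
 * is hypothetical and exists only for this example.
 */
#if 0
static void
example_decode_signature( unsigned eax, unsigned *family, unsigned *model )
{
    unsigned fam       = ( eax >> 8 )  & 0xf;
    unsigned mod       = ( eax >> 4 )  & 0xf;
    unsigned ext_model = ( eax >> 16 ) & 0xf;

    *family = fam;
    *model  = ( fam == 0x6 || fam == 0xf ) ? ( ( ext_model << 4 ) | mod ) : mod;

    // (eax & 0xff0)   == 0xB10   <=>  family 0xB, model 0x1            (mic2 / KNC)
    // (eax & 0xf0ff0) == 0x50670 <=>  family 0x6, display model 0x57   (mic3 / KNL)
}
#endif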
6328 
6329 static void
6330 __kmp_do_serial_initialize( void )
6331 {
6332     int i, gtid;
6333     int size;
6334 
6335     KA_TRACE( 10, ("__kmp_do_serial_initialize: enter\n" ) );
6336 
6337     KMP_DEBUG_ASSERT( sizeof( kmp_int32 ) == 4 );
6338     KMP_DEBUG_ASSERT( sizeof( kmp_uint32 ) == 4 );
6339     KMP_DEBUG_ASSERT( sizeof( kmp_int64 ) == 8 );
6340     KMP_DEBUG_ASSERT( sizeof( kmp_uint64 ) == 8 );
6341     KMP_DEBUG_ASSERT( sizeof( kmp_intptr_t ) == sizeof( void * ) );
6342 
6343 #if OMPT_SUPPORT
6344     ompt_pre_init();
6345 #endif
6346 
6347     __kmp_validate_locks();
6348 
6349     /* Initialize internal memory allocator */
6350     __kmp_init_allocator();
6351 
6352     /* Register the library startup via an environment variable
6353        and check to see whether another copy of the library is already
6354        registered. */
6355 
6356     __kmp_register_library_startup( );
6357 
6358     /* TODO reinitialization of library */
6359     if( TCR_4(__kmp_global.g.g_done) ) {
6360        KA_TRACE( 10, ("__kmp_do_serial_initialize: reinitialization of library\n" ) );
6361     }
6362 
6363     __kmp_global.g.g_abort = 0;
6364     TCW_SYNC_4(__kmp_global.g.g_done, FALSE);
6365 
6366     /* initialize the locks */
6367 #if KMP_USE_ADAPTIVE_LOCKS
6368 #if KMP_DEBUG_ADAPTIVE_LOCKS
6369     __kmp_init_speculative_stats();
6370 #endif
6371 #endif
6372 #if KMP_STATS_ENABLED
6373     __kmp_stats_init();
6374 #endif
6375     __kmp_init_lock( & __kmp_global_lock     );
6376     __kmp_init_queuing_lock( & __kmp_dispatch_lock );
6377     __kmp_init_lock( & __kmp_debug_lock      );
6378     __kmp_init_atomic_lock( & __kmp_atomic_lock     );
6379     __kmp_init_atomic_lock( & __kmp_atomic_lock_1i  );
6380     __kmp_init_atomic_lock( & __kmp_atomic_lock_2i  );
6381     __kmp_init_atomic_lock( & __kmp_atomic_lock_4i  );
6382     __kmp_init_atomic_lock( & __kmp_atomic_lock_4r  );
6383     __kmp_init_atomic_lock( & __kmp_atomic_lock_8i  );
6384     __kmp_init_atomic_lock( & __kmp_atomic_lock_8r  );
6385     __kmp_init_atomic_lock( & __kmp_atomic_lock_8c  );
6386     __kmp_init_atomic_lock( & __kmp_atomic_lock_10r );
6387     __kmp_init_atomic_lock( & __kmp_atomic_lock_16r );
6388     __kmp_init_atomic_lock( & __kmp_atomic_lock_16c );
6389     __kmp_init_atomic_lock( & __kmp_atomic_lock_20c );
6390     __kmp_init_atomic_lock( & __kmp_atomic_lock_32c );
6391     __kmp_init_bootstrap_lock( & __kmp_forkjoin_lock  );
6392     __kmp_init_bootstrap_lock( & __kmp_exit_lock      );
6393 #if KMP_USE_MONITOR
6394     __kmp_init_bootstrap_lock( & __kmp_monitor_lock   );
6395 #endif
6396     __kmp_init_bootstrap_lock( & __kmp_tp_cached_lock );
6397 
6398     /* conduct initialization and initial setup of configuration */
6399 
6400     __kmp_runtime_initialize();
6401 
6402 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6403     __kmp_check_mic_type();
6404 #endif
6405 
6406     // Some global variable initialization moved here from kmp_env_initialize()
6407 #ifdef KMP_DEBUG
6408     kmp_diag = 0;
6409 #endif
6410     __kmp_abort_delay = 0;
6411 
6412     // From __kmp_init_dflt_team_nth()
6413     /* assume the entire machine will be used */
6414     __kmp_dflt_team_nth_ub = __kmp_xproc;
6415     if( __kmp_dflt_team_nth_ub < KMP_MIN_NTH ) {
6416         __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
6417     }
6418     if( __kmp_dflt_team_nth_ub > __kmp_sys_max_nth ) {
6419         __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
6420     }
6421     __kmp_max_nth = __kmp_sys_max_nth;
6422 
6423     // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" part
6424     __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
6425 #if KMP_USE_MONITOR
6426     __kmp_monitor_wakeups = KMP_WAKEUPS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6427     __kmp_bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME( __kmp_dflt_blocktime, __kmp_monitor_wakeups );
6428 #endif
6429     // From "KMP_LIBRARY" part of __kmp_env_initialize()
6430     __kmp_library = library_throughput;
6431     // From KMP_SCHEDULE initialization
6432     __kmp_static = kmp_sch_static_balanced;
    // AC: do not use analytical here, because it is non-monotonic
    //__kmp_guided = kmp_sch_guided_iterative_chunked;
    //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no need to repeat the assignment
    // Barrier initialization. Moved here from the barrier branch bit control and barrier method
    // control parts of __kmp_env_initialize()
6438     #if KMP_FAST_REDUCTION_BARRIER
6439         #define kmp_reduction_barrier_gather_bb ((int)1)
6440         #define kmp_reduction_barrier_release_bb ((int)1)
6441         #define kmp_reduction_barrier_gather_pat bp_hyper_bar
6442         #define kmp_reduction_barrier_release_pat bp_hyper_bar
6443     #endif // KMP_FAST_REDUCTION_BARRIER
6444     for ( i=bs_plain_barrier; i<bs_last_barrier; i++ ) {
6445         __kmp_barrier_gather_branch_bits [ i ] = __kmp_barrier_gather_bb_dflt;
6446         __kmp_barrier_release_branch_bits[ i ] = __kmp_barrier_release_bb_dflt;
6447         __kmp_barrier_gather_pattern [ i ] = __kmp_barrier_gather_pat_dflt;
6448         __kmp_barrier_release_pattern[ i ] = __kmp_barrier_release_pat_dflt;
6449         #if KMP_FAST_REDUCTION_BARRIER
6450         if( i == bs_reduction_barrier ) { // tested and confirmed on ALTIX only ( lin_64 ): hyper,1
6451             __kmp_barrier_gather_branch_bits [ i ] = kmp_reduction_barrier_gather_bb;
6452             __kmp_barrier_release_branch_bits[ i ] = kmp_reduction_barrier_release_bb;
6453             __kmp_barrier_gather_pattern [ i ] = kmp_reduction_barrier_gather_pat;
6454             __kmp_barrier_release_pattern[ i ] = kmp_reduction_barrier_release_pat;
6455         }
6456         #endif // KMP_FAST_REDUCTION_BARRIER
6457     }
6458     #if KMP_FAST_REDUCTION_BARRIER
6459         #undef kmp_reduction_barrier_release_pat
6460         #undef kmp_reduction_barrier_gather_pat
6461         #undef kmp_reduction_barrier_release_bb
6462         #undef kmp_reduction_barrier_gather_bb
6463     #endif // KMP_FAST_REDUCTION_BARRIER
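    // Note: a barrier's branch bits are the log2 of its tree fan-out, i.e. each
    // non-leaf thread handles up to (1 << branch_bits) children per gather/release
    // phase; e.g. branch_bits == 1 gives a binary tree, and the KNC tuning below
    // widens the plain-barrier gather to an 8-way tree (branch_bits == 3).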
6464 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
6465     if (__kmp_mic_type == mic2) { // KNC
6466         // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
6467         __kmp_barrier_gather_branch_bits [ bs_plain_barrier ] = 3;  // plain gather
6468         __kmp_barrier_release_branch_bits[ bs_forkjoin_barrier ] = 1;  // forkjoin release
6469         __kmp_barrier_gather_pattern [ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6470         __kmp_barrier_release_pattern[ bs_forkjoin_barrier ] = bp_hierarchical_bar;
6471     }
6472 #if KMP_FAST_REDUCTION_BARRIER
6473     if (__kmp_mic_type == mic2) { // KNC
6474         __kmp_barrier_gather_pattern [ bs_reduction_barrier ] = bp_hierarchical_bar;
6475         __kmp_barrier_release_pattern[ bs_reduction_barrier ] = bp_hierarchical_bar;
6476     }
6477 #endif
6478 #endif
6479 
6480     // From KMP_CHECKS initialization
6481 #ifdef KMP_DEBUG
6482     __kmp_env_checks = TRUE;   /* development versions have the extra checks */
6483 #else
6484     __kmp_env_checks = FALSE;  /* port versions do not have the extra checks */
6485 #endif
6486 
6487     // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
6488     __kmp_foreign_tp = TRUE;
6489 
6490     __kmp_global.g.g_dynamic = FALSE;
6491     __kmp_global.g.g_dynamic_mode = dynamic_default;
6492 
6493     __kmp_env_initialize( NULL );
6494 
6495     // Print all messages in message catalog for testing purposes.
6496     #ifdef KMP_DEBUG
6497         char const * val = __kmp_env_get( "KMP_DUMP_CATALOG" );
6498         if ( __kmp_str_match_true( val ) ) {
6499             kmp_str_buf_t buffer;
6500             __kmp_str_buf_init( & buffer );
6501             __kmp_i18n_dump_catalog( & buffer );
6502             __kmp_printf( "%s", buffer.str );
6503             __kmp_str_buf_free( & buffer );
6504         }; // if
6505         __kmp_env_free( & val );
6506     #endif
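
    // Usage note (debug builds only): setting the environment variable checked above to a
    // truthy value, e.g.
    //
    //     KMP_DUMP_CATALOG=1 ./a.out
    //
    // dumps the entire i18n message catalog to stdout.  The invocation shown is illustrative;
    // only the variable name and the dump behavior come from the code above.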
6507 
6508     __kmp_threads_capacity = __kmp_initial_threads_capacity( __kmp_dflt_team_nth_ub );
6509     // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
6510     __kmp_tp_capacity = __kmp_default_tp_capacity(__kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);
6511 
6512     // If the library is shut down properly, both pools must be NULL. Just in case, set them
6513     // to NULL -- some memory may leak, but subsequent code will work even if pools are not freed.
6514     KMP_DEBUG_ASSERT( __kmp_thread_pool == NULL );
6515     KMP_DEBUG_ASSERT( __kmp_thread_pool_insert_pt == NULL );
6516     KMP_DEBUG_ASSERT( __kmp_team_pool   == NULL );
6517     __kmp_thread_pool = NULL;
6518     __kmp_thread_pool_insert_pt = NULL;
6519     __kmp_team_pool   = NULL;
6520 
6521     /* Allocate all of the variable sized records */
6522     /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are expandable */
6523     /* Since allocation is cache-aligned, just add extra padding at the end */
6524     size = (sizeof(kmp_info_t*) + sizeof(kmp_root_t*))*__kmp_threads_capacity + CACHE_LINE;
6525     __kmp_threads = (kmp_info_t**) __kmp_allocate( size );
6526     __kmp_root    = (kmp_root_t**) ((char*)__kmp_threads + sizeof(kmp_info_t*) * __kmp_threads_capacity );
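
    /* Layout sketch of the single allocation above (illustrative; N = __kmp_threads_capacity):

           base                          : kmp_info_t* [N]   <-- __kmp_threads
           base + N*sizeof(kmp_info_t*)  : kmp_root_t* [N]   <-- __kmp_root
           tail                          : CACHE_LINE bytes of padding

       This is also why __kmp_cleanup() frees __kmp_threads but not __kmp_root. */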
6527 
6528     /* init thread counts */
6529     KMP_DEBUG_ASSERT( __kmp_all_nth == 0 ); // Asserts fail if the library is reinitializing and
6530     KMP_DEBUG_ASSERT( __kmp_nth == 0 );     // something was wrong in termination.
6531     __kmp_all_nth = 0;
6532     __kmp_nth     = 0;
6533 
6534     /* setup the uber master thread and hierarchy */
6535     gtid = __kmp_register_root( TRUE );
6536     KA_TRACE( 10, ("__kmp_do_serial_initialize  T#%d\n", gtid ));
6537     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6538     KMP_ASSERT( KMP_INITIAL_GTID( gtid ) );
6539 
6540     KMP_MB();       /* Flush all pending memory write invalidates.  */
6541 
6542     __kmp_common_initialize();
6543 
6544     #if KMP_OS_UNIX
6545         /* register the atfork handlers */
6546         __kmp_register_atfork();
6547     #endif
6548 
6549     #if ! defined KMP_DYNAMIC_LIB
6550         {
6551             /* Invoke the exit handler when the program finishes, but only for the static library.
6552                For the dynamic library, we already have _fini and DllMain.
6553              */
6554             int rc = atexit( __kmp_internal_end_atexit );
6555             if ( rc != 0 ) {
6556                 __kmp_msg( kmp_ms_fatal, KMP_MSG( FunctionError, "atexit()" ), KMP_ERR( rc ), __kmp_msg_null );
6557             }; // if
6558         }
6559     #endif
6560 
6561     #if KMP_HANDLE_SIGNALS
6562         #if KMP_OS_UNIX
6563             /* NOTE: make sure that this is called before the user installs
6564              *          their own signal handlers so that the user handlers
6565              *          are called first. This way they can return false,
6566              *          not call our handler, avoid terminating the library,
6567              *          and continue execution where they left off. */
6568             __kmp_install_signals( FALSE );
6569         #endif /* KMP_OS_UNIX */
6570         #if KMP_OS_WINDOWS
6571             __kmp_install_signals( TRUE );
6572         #endif /* KMP_OS_WINDOWS */
6573     #endif
6574 
6575     /* we have finished the serial initialization */
6576     __kmp_init_counter ++;
6577 
6578     __kmp_init_serial = TRUE;
6579 
6580     if (__kmp_settings) {
6581         __kmp_env_print();
6582     }
6583 
6584 #if OMP_40_ENABLED
6585     if (__kmp_display_env || __kmp_display_env_verbose) {
6586         __kmp_env_print_2();
6587     }
6588 #endif // OMP_40_ENABLED
6589 
6590 #if OMPT_SUPPORT
6591     ompt_post_init();
6592 #endif
6593 
6594     KMP_MB();
6595 
6596     KA_TRACE( 10, ("__kmp_do_serial_initialize: exit\n" ) );
6597 }
6598 
6599 void
6600 __kmp_serial_initialize( void )
6601 {
6602     if ( __kmp_init_serial ) {
6603         return;
6604     }
6605     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6606     if ( __kmp_init_serial ) {
6607         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6608         return;
6609     }
6610     __kmp_do_serial_initialize();
6611     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6612 }
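
/* Note: __kmp_serial_initialize() above and __kmp_middle_initialize() / __kmp_parallel_initialize()
   below all follow the same double-checked pattern around __kmp_initz_lock, roughly:

       if ( initialized ) return;                              // fast path, no lock
       __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
       if ( initialized ) { release the lock; return; }        // lost the race
       __kmp_do_*_initialize();                                // real work, lock held
       __kmp_release_bootstrap_lock( &__kmp_initz_lock );

   The sketch is only a paraphrase of the code, not additional API; the parallel variant reads
   its flag through TCR_4(). */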
6613 
6614 static void
6615 __kmp_do_middle_initialize( void )
6616 {
6617     int i, j;
6618     int prev_dflt_team_nth;
6619 
6620     if( !__kmp_init_serial ) {
6621         __kmp_do_serial_initialize();
6622     }
6623 
6624     KA_TRACE( 10, ("__kmp_middle_initialize: enter\n" ) );
6625 
6626     //
6627     // Save the previous value for the __kmp_dflt_team_nth so that
6628     // we can avoid some reinitialization if it hasn't changed.
6629     //
6630     prev_dflt_team_nth = __kmp_dflt_team_nth;
6631 
6632 #if KMP_AFFINITY_SUPPORTED
6633     //
6634     // __kmp_affinity_initialize() will try to set __kmp_ncores to the
6635     // number of cores on the machine.
6636     //
6637     __kmp_affinity_initialize();
6638 
6639     //
6640     // Run through the __kmp_threads array and set the affinity mask
6641     // for each root thread that is currently registered with the RTL.
6642     //
6643     for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6644         if ( TCR_PTR( __kmp_threads[ i ] ) != NULL ) {
6645             __kmp_affinity_set_init_mask( i, TRUE );
6646         }
6647     }
6648 #endif /* KMP_AFFINITY_SUPPORTED */
6649 
6650     KMP_ASSERT( __kmp_xproc > 0 );
6651     if ( __kmp_avail_proc == 0 ) {
6652         __kmp_avail_proc = __kmp_xproc;
6653     }
6654 
6655     // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), correct them now
6656     j = 0;
6657     while ( ( j < __kmp_nested_nth.used ) && ! __kmp_nested_nth.nth[ j ] ) {
6658         __kmp_nested_nth.nth[ j ] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = __kmp_avail_proc;
6659         j++;
6660     }
6661 
6662     if ( __kmp_dflt_team_nth == 0 ) {
6663 #ifdef KMP_DFLT_NTH_CORES
6664         //
6665         // Default #threads = #cores
6666         //
6667         __kmp_dflt_team_nth = __kmp_ncores;
6668         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_ncores (%d)\n",
6669           __kmp_dflt_team_nth ) );
6670 #else
6671         //
6672         // Default #threads = #available OS procs
6673         //
6674         __kmp_dflt_team_nth = __kmp_avail_proc;
6675         KA_TRACE( 20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = __kmp_avail_proc(%d)\n",
6676           __kmp_dflt_team_nth ) );
6677 #endif /* KMP_DFLT_NTH_CORES */
6678     }
6679 
6680     if ( __kmp_dflt_team_nth < KMP_MIN_NTH ) {
6681         __kmp_dflt_team_nth = KMP_MIN_NTH;
6682     }
6683     if( __kmp_dflt_team_nth > __kmp_sys_max_nth ) {
6684         __kmp_dflt_team_nth = __kmp_sys_max_nth;
6685     }
6686 
6687     //
6688     // There's no harm in continuing if the following check fails,
6689     // but it indicates an error in the previous logic.
6690     //
6691     KMP_DEBUG_ASSERT( __kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub );
6692 
6693     if ( __kmp_dflt_team_nth != prev_dflt_team_nth ) {
6694         //
6695         // Run through the __kmp_threads array and set the num threads icv
6696         // for each root thread that is currently registered with the RTL
6697         // (which has not already explicitly set its nthreads-var with a
6698         // call to omp_set_num_threads()).
6699         //
6700         for ( i = 0; i < __kmp_threads_capacity; i++ ) {
6701             kmp_info_t *thread = __kmp_threads[ i ];
6702             if ( thread == NULL ) continue;
6703             if ( thread->th.th_current_task->td_icvs.nproc != 0 ) continue;
6704 
6705             set__nproc( __kmp_threads[ i ], __kmp_dflt_team_nth );
6706         }
6707     }
6708     KA_TRACE( 20, ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
6709       __kmp_dflt_team_nth) );
6710 
6711 #ifdef KMP_ADJUST_BLOCKTIME
6712     /* Adjust blocktime to zero if necessary */
6713     /* now that __kmp_avail_proc is set      */
6714     if ( !__kmp_env_blocktime && ( __kmp_avail_proc > 0 ) ) {
6715         KMP_DEBUG_ASSERT( __kmp_avail_proc > 0 );
6716         if ( __kmp_nth > __kmp_avail_proc ) {
6717             __kmp_zero_bt = TRUE;
6718         }
6719     }
6720 #endif /* KMP_ADJUST_BLOCKTIME */
6721 
6722     /* we have finished middle initialization */
6723     TCW_SYNC_4(__kmp_init_middle, TRUE);
6724 
6725     KA_TRACE( 10, ("__kmp_do_middle_initialize: exit\n" ) );
6726 }
6727 
6728 void
6729 __kmp_middle_initialize( void )
6730 {
6731     if ( __kmp_init_middle ) {
6732         return;
6733     }
6734     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6735     if ( __kmp_init_middle ) {
6736         __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6737         return;
6738     }
6739     __kmp_do_middle_initialize();
6740     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6741 }
6742 
6743 void
6744 __kmp_parallel_initialize( void )
6745 {
6746     int gtid = __kmp_entry_gtid();      // this might be a new root
6747 
6748     /* synchronize parallel initialization (for sibling) */
6749     if( TCR_4(__kmp_init_parallel) ) return;
6750     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
6751     if( TCR_4(__kmp_init_parallel) ) { __kmp_release_bootstrap_lock( &__kmp_initz_lock ); return; }
6752 
6753     /* TODO reinitialization after we have already shut down */
6754     if( TCR_4(__kmp_global.g.g_done) ) {
6755         KA_TRACE( 10, ("__kmp_parallel_initialize: attempt to init while shutting down\n" ) );
6756         __kmp_infinite_loop();
6757     }
6758 
6759     /* jc: The lock __kmp_initz_lock is already held, so calling __kmp_serial_initialize
6760            would cause a deadlock.  So we call __kmp_do_serial_initialize directly.
6761     */
6762     if( !__kmp_init_middle ) {
6763         __kmp_do_middle_initialize();
6764     }
6765 
6766     /* begin initialization */
6767     KA_TRACE( 10, ("__kmp_parallel_initialize: enter\n" ) );
6768     KMP_ASSERT( KMP_UBER_GTID( gtid ) );
6769 
6770 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6771     //
6772     // Save the FP control regs.
6773     // Worker threads will set theirs to these values at thread startup.
6774     //
6775     __kmp_store_x87_fpu_control_word( &__kmp_init_x87_fpu_control_word );
6776     __kmp_store_mxcsr( &__kmp_init_mxcsr );
6777     __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
6778 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
6779 
6780 #if KMP_OS_UNIX
6781 # if KMP_HANDLE_SIGNALS
6782     /*  must be after __kmp_serial_initialize  */
6783     __kmp_install_signals( TRUE );
6784 # endif
6785 #endif
6786 
6787     __kmp_suspend_initialize();
6788 
6789 #if defined(USE_LOAD_BALANCE)
6790     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6791         __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
6792     }
6793 #else
6794     if ( __kmp_global.g.g_dynamic_mode == dynamic_default ) {
6795         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
6796     }
6797 #endif
6798 
6799     if ( __kmp_version ) {
6800         __kmp_print_version_2();
6801     }
6802 
6803     /* we have finished parallel initialization */
6804     TCW_SYNC_4(__kmp_init_parallel, TRUE);
6805 
6806     KMP_MB();
6807     KA_TRACE( 10, ("__kmp_parallel_initialize: exit\n" ) );
6808 
6809     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
6810 }
6811 
6812 
6813 /* ------------------------------------------------------------------------ */
6814 
6815 void
6816 __kmp_run_before_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6817   kmp_team_t *team )
6818 {
6819     kmp_disp_t *dispatch;
6820 
6821     KMP_MB();
6822 
6823     /* none of the threads have encountered any constructs, yet. */
6824     this_thr->th.th_local.this_construct = 0;
6825 #if KMP_CACHE_MANAGE
6826     KMP_CACHE_PREFETCH( &this_thr->th.th_bar[ bs_forkjoin_barrier ].bb.b_arrived );
6827 #endif /* KMP_CACHE_MANAGE */
6828     dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
6829     KMP_DEBUG_ASSERT( dispatch );
6830     KMP_DEBUG_ASSERT( team->t.t_dispatch );
6831     //KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ this_thr->th.th_info.ds.ds_tid ] );
6832 
6833     dispatch->th_disp_index = 0;    /* reset the dispatch buffer counter */
6834 #if OMP_45_ENABLED
6835     dispatch->th_doacross_buf_idx = 0; /* reset the doacross dispatch buffer counter */
6836 #endif
6837     if( __kmp_env_consistency_check )
6838         __kmp_push_parallel( gtid, team->t.t_ident );
6839 
6840     KMP_MB();       /* Flush all pending memory write invalidates.  */
6841 }
6842 
6843 void
6844 __kmp_run_after_invoked_task( int gtid, int tid, kmp_info_t *this_thr,
6845   kmp_team_t *team )
6846 {
6847     if( __kmp_env_consistency_check )
6848         __kmp_pop_parallel( gtid, team->t.t_ident );
6849 
6850     __kmp_finish_implicit_task(this_thr);
6851 }
6852 
6853 int
6854 __kmp_invoke_task_func( int gtid )
6855 {
6856     int          rc;
6857     int          tid      = __kmp_tid_from_gtid( gtid );
6858     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6859     kmp_team_t  *team     = this_thr->th.th_team;
6860 
6861     __kmp_run_before_invoked_task( gtid, tid, this_thr, team );
6862 #if USE_ITT_BUILD
6863     if ( __itt_stack_caller_create_ptr ) {
6864         __kmp_itt_stack_callee_enter( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about entering user's code
6865     }
6866 #endif /* USE_ITT_BUILD */
6867 #if INCLUDE_SSC_MARKS
6868     SSC_MARK_INVOKING();
6869 #endif
6870 
6871 #if OMPT_SUPPORT
6872     void *dummy;
6873     void **exit_runtime_p;
6874     ompt_task_id_t my_task_id;
6875     ompt_parallel_id_t my_parallel_id;
6876 
6877     if (ompt_enabled) {
6878         exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid].
6879             ompt_task_info.frame.exit_runtime_frame);
6880     } else {
6881         exit_runtime_p = &dummy;
6882     }
6883 
6884 #if OMPT_TRACE
6885     my_task_id = team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_id;
6886     my_parallel_id = team->t.ompt_team_info.parallel_id;
6887     if (ompt_enabled &&
6888         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)) {
6889         ompt_callbacks.ompt_callback(ompt_event_implicit_task_begin)(
6890             my_parallel_id, my_task_id);
6891     }
6892 #endif
6893 #endif
6894 
6895     {
6896         KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
6897         KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
6898         rc = __kmp_invoke_microtask( (microtask_t) TCR_SYNC_PTR(team->t.t_pkfn),
6899                                      gtid, tid, (int) team->t.t_argc, (void **) team->t.t_argv
6900 #if OMPT_SUPPORT
6901                                      , exit_runtime_p
6902 #endif
6903                                      );
6904 #if OMPT_SUPPORT
6905         *exit_runtime_p = NULL;
6906 #endif
6907     }
6908 
6909 #if USE_ITT_BUILD
6910     if ( __itt_stack_caller_create_ptr ) {
6911         __kmp_itt_stack_callee_leave( (__itt_caller)team->t.t_stack_id ); // inform ittnotify about leaving user's code
6912     }
6913 #endif /* USE_ITT_BUILD */
6914     __kmp_run_after_invoked_task( gtid, tid, this_thr, team );
6915 
6916     return rc;
6917 }
6918 
6919 #if OMP_40_ENABLED
6920 void
6921 __kmp_teams_master( int gtid )
6922 {
6923     // This routine is called by all master threads in teams construct
6924     kmp_info_t *thr = __kmp_threads[ gtid ];
6925     kmp_team_t *team = thr->th.th_team;
6926     ident_t     *loc =  team->t.t_ident;
6927     thr->th.th_set_nproc = thr->th.th_teams_size.nth;
6928     KMP_DEBUG_ASSERT( thr->th.th_teams_microtask );
6929     KMP_DEBUG_ASSERT( thr->th.th_set_nproc );
6930     KA_TRACE( 20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n",
6931                    gtid, __kmp_tid_from_gtid( gtid ), thr->th.th_teams_microtask ) );
6932     // Launch the league of teams now, but do not let the workers execute
6933     // (they hang on the fork barrier until the next parallel region)
6934 #if INCLUDE_SSC_MARKS
6935     SSC_MARK_FORKING();
6936 #endif
6937     __kmp_fork_call( loc, gtid, fork_context_intel,
6938             team->t.t_argc,
6939 #if OMPT_SUPPORT
6940             (void *)thr->th.th_teams_microtask,      // "unwrapped" task
6941 #endif
6942             (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
6943             VOLATILE_CAST(launch_t) __kmp_invoke_task_func,
6944             NULL );
6945 #if INCLUDE_SSC_MARKS
6946     SSC_MARK_JOINING();
6947 #endif
6948 
6949     // AC: the last parameter "1" eliminates the join barrier, which would not work because
6950     // the worker threads are sitting in the fork barrier waiting for more parallel regions
6951     __kmp_join_call( loc, gtid
6952 #if OMPT_SUPPORT
6953         , fork_context_intel
6954 #endif
6955         , 1 );
6956 }
6957 
6958 int
6959 __kmp_invoke_teams_master( int gtid )
6960 {
6961     kmp_info_t  *this_thr = __kmp_threads[ gtid ];
6962     kmp_team_t  *team     = this_thr->th.th_team;
6963     #if KMP_DEBUG
6964     if ( !__kmp_threads[gtid]-> th.th_team->t.t_serialized )
6965         KMP_DEBUG_ASSERT( (void*)__kmp_threads[gtid]-> th.th_team->t.t_pkfn == (void*)__kmp_teams_master );
6966     #endif
6967     __kmp_run_before_invoked_task( gtid, 0, this_thr, team );
6968     __kmp_teams_master( gtid );
6969     __kmp_run_after_invoked_task( gtid, 0, this_thr, team );
6970     return 1;
6971 }
6972 #endif /* OMP_40_ENABLED */
6973 
6974 /* This sets the requested number of threads for the next parallel region
6975  * encountered by this team. */
6976 /* Since this should be enclosed in the fork/join critical section, it
6977  * should avoid race conditions with asymmetrical nested parallelism. */
6978 
6979 void
6980 __kmp_push_num_threads( ident_t *id, int gtid, int num_threads )
6981 {
6982     kmp_info_t *thr = __kmp_threads[gtid];
6983 
6984     if( num_threads > 0 )
6985         thr->th.th_set_nproc = num_threads;
6986 }
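
/* Note: a non-positive num_threads request is ignored above, leaving th_set_nproc unchanged,
   so the next fork falls back to the thread's nthreads-var ICV (td_icvs.nproc).  This reading
   of the fallback is descriptive; only the guard itself is defined here. */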
6987 
6988 #if OMP_40_ENABLED
6989 
6990 /* this sets the requested number of teams for the teams region and/or
6991  * the number of threads for the next parallel region encountered  */
6992 void
6993 __kmp_push_num_teams( ident_t *id, int gtid, int num_teams, int num_threads )
6994 {
6995     kmp_info_t *thr = __kmp_threads[gtid];
6996     KMP_DEBUG_ASSERT(num_teams >= 0);
6997     KMP_DEBUG_ASSERT(num_threads >= 0);
6998 
6999     if( num_teams == 0 )
7000         num_teams = 1;    // default number of teams is 1.
7001     if( num_teams > __kmp_max_nth ) { // too many teams requested?
7002         if ( !__kmp_reserve_warn ) {
7003             __kmp_reserve_warn = 1;
7004             __kmp_msg(
7005                 kmp_ms_warning,
7006                 KMP_MSG( CantFormThrTeam, num_teams, __kmp_max_nth ),
7007                 KMP_HNT( Unset_ALL_THREADS ),
7008                 __kmp_msg_null
7009             );
7010         }
7011         num_teams = __kmp_max_nth;
7012     }
7013     // Set number of teams (number of threads in the outer "parallel" of the teams)
7014     thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7015 
7016     // Remember the number of threads for inner parallel regions
7017     if( num_threads == 0 ) {
7018         if( !TCR_4(__kmp_init_middle) )
7019             __kmp_middle_initialize();  // get __kmp_avail_proc calculated
7020         num_threads = __kmp_avail_proc / num_teams;
7021         if( num_teams * num_threads > __kmp_max_nth ) {
7022             // adjust num_threads w/o warning as it is not a user setting
7023             num_threads = __kmp_max_nth / num_teams;
7024         }
7025     } else {
7026         if( num_teams * num_threads > __kmp_max_nth ) {
7027             int new_threads = __kmp_max_nth / num_teams;
7028             if ( !__kmp_reserve_warn ) { // user asked for too many threads
7029                 __kmp_reserve_warn = 1;  // that conflicts with OMP_THREAD_LIMIT
7030                 __kmp_msg(
7031                     kmp_ms_warning,
7032                     KMP_MSG( CantFormThrTeam, num_threads, new_threads ),
7033                     KMP_HNT( Unset_ALL_THREADS ),
7034                     __kmp_msg_null
7035                 );
7036             }
7037             num_threads = new_threads;
7038         }
7039     }
7040     thr->th.th_teams_size.nth = num_threads;
7041 }
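
/* Worked example for the sizing logic above (all numbers hypothetical): with
   __kmp_avail_proc = 16, __kmp_max_nth = 64, num_teams = 4 and num_threads = 0, the default
   per-team size becomes 16 / 4 = 4 and 4 * 4 = 16 <= 64, so th_teams_size ends up as
   { nteams = 4, nth = 4 }.  If the user instead requested num_threads = 32, then
   4 * 32 = 128 > 64, so num_threads is clamped to 64 / 4 = 16 and a one-time
   CantFormThrTeam warning is issued. */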
7042 
7043 
7044 //
7045 // Set the proc_bind var to use in the following parallel region.
7046 //
7047 void
7048 __kmp_push_proc_bind( ident_t *id, int gtid, kmp_proc_bind_t proc_bind )
7049 {
7050     kmp_info_t *thr = __kmp_threads[gtid];
7051     thr->th.th_set_proc_bind = proc_bind;
7052 }
7053 
7054 #endif /* OMP_40_ENABLED */
7055 
7056 /* Launch the worker threads into the microtask. */
7057 
7058 void
7059 __kmp_internal_fork( ident_t *id, int gtid, kmp_team_t *team )
7060 {
7061     kmp_info_t *this_thr = __kmp_threads[gtid];
7062 
7063 #ifdef KMP_DEBUG
7064     int f;
7065 #endif /* KMP_DEBUG */
7066 
7067     KMP_DEBUG_ASSERT( team );
7068     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7069     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7070     KMP_MB();       /* Flush all pending memory write invalidates.  */
7071 
7072     team->t.t_construct = 0;          /* no single directives seen yet */
7073     team->t.t_ordered.dt.t_value = 0; /* thread 0 enters the ordered section first */
7074 
7075     /* Reset the identifiers on the dispatch buffer */
7076     KMP_DEBUG_ASSERT( team->t.t_disp_buffer );
7077     if ( team->t.t_max_nproc > 1 ) {
7078         int i;
7079         for (i = 0; i <  __kmp_dispatch_num_buffers; ++i) {
7080             team->t.t_disp_buffer[ i ].buffer_index = i;
7081 #if OMP_45_ENABLED
7082             team->t.t_disp_buffer[i].doacross_buf_idx = i;
7083 #endif
7084         }
7085     } else {
7086         team->t.t_disp_buffer[ 0 ].buffer_index = 0;
7087 #if OMP_45_ENABLED
7088         team->t.t_disp_buffer[0].doacross_buf_idx = 0;
7089 #endif
7090     }
7091 
7092     KMP_MB();       /* Flush all pending memory write invalidates.  */
7093     KMP_ASSERT( this_thr->th.th_team  ==  team );
7094 
7095 #ifdef KMP_DEBUG
7096     for( f=0 ; f<team->t.t_nproc ; f++ ) {
7097         KMP_DEBUG_ASSERT( team->t.t_threads[f] &&
7098                           team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc );
7099     }
7100 #endif /* KMP_DEBUG */
7101 
7102     /* release the worker threads so they may begin working */
7103     __kmp_fork_barrier( gtid, 0 );
7104 }
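
/* Descriptive note on the buffer_index reset above: each of the __kmp_dispatch_num_buffers
   dispatch buffers is stamped with its own index so that successive worksharing constructs in
   the upcoming region can rotate through distinct buffers instead of reusing one; the exact
   hand-off protocol lives in the dispatch code, not here. */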
7105 
7106 
7107 void
7108 __kmp_internal_join( ident_t *id, int gtid, kmp_team_t *team )
7109 {
7110     kmp_info_t *this_thr = __kmp_threads[gtid];
7111 
7112     KMP_DEBUG_ASSERT( team );
7113     KMP_DEBUG_ASSERT( this_thr->th.th_team  ==  team );
7114     KMP_ASSERT(       KMP_MASTER_GTID(gtid) );
7115     KMP_MB();       /* Flush all pending memory write invalidates.  */
7116 
7117     /* Join barrier after fork */
7118 
7119 #ifdef KMP_DEBUG
7120     if (__kmp_threads[gtid] && __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc ) {
7121         __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n",gtid, gtid, __kmp_threads[gtid]);
7122         __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, team->t.t_nproc=%d\n",
7123                      gtid, __kmp_threads[gtid]->th.th_team_nproc, team, team->t.t_nproc);
7124         __kmp_print_structure();
7125     }
7126     KMP_DEBUG_ASSERT( __kmp_threads[gtid] &&
7127                      __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc );
7128 #endif /* KMP_DEBUG */
7129 
7130     __kmp_join_barrier( gtid );  /* wait for everyone */
7131 
7132     KMP_MB();       /* Flush all pending memory write invalidates.  */
7133     KMP_ASSERT( this_thr->th.th_team  ==  team );
7134 }
7135 
7136 
7137 /* ------------------------------------------------------------------------ */
7138 /* ------------------------------------------------------------------------ */
7139 
7140 #ifdef USE_LOAD_BALANCE
7141 
7142 //
7143 // Return the number of worker threads actively spinning in the hot team, if we
7144 // are at the outermost level of parallelism.  Otherwise, return 0.
7145 //
7146 static int
7147 __kmp_active_hot_team_nproc( kmp_root_t *root )
7148 {
7149     int i;
7150     int retval;
7151     kmp_team_t *hot_team;
7152 
7153     if ( root->r.r_active ) {
7154         return 0;
7155     }
7156     hot_team = root->r.r_hot_team;
7157     if ( __kmp_dflt_blocktime == KMP_MAX_BLOCKTIME ) {
7158         return hot_team->t.t_nproc - 1;  // Don't count master thread
7159     }
7160 
7161     //
7162     // Skip the master thread - it is accounted for elsewhere.
7163     //
7164     retval = 0;
7165     for ( i = 1; i < hot_team->t.t_nproc; i++ ) {
7166         if ( hot_team->t.t_threads[i]->th.th_active ) {
7167             retval++;
7168         }
7169     }
7170     return retval;
7171 }
7172 
7173 //
7174 // Perform an automatic adjustment to the number of
7175 // threads used by the next parallel region.
7176 //
7177 static int
7178 __kmp_load_balance_nproc( kmp_root_t *root, int set_nproc )
7179 {
7180     int retval;
7181     int pool_active;
7182     int hot_team_active;
7183     int team_curr_active;
7184     int system_active;
7185 
7186     KB_TRACE( 20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n",
7187                 root, set_nproc ) );
7188     KMP_DEBUG_ASSERT( root );
7189     KMP_DEBUG_ASSERT( root->r.r_root_team->t.t_threads[0]->th.th_current_task->td_icvs.dynamic == TRUE );
7190     KMP_DEBUG_ASSERT( set_nproc > 1 );
7191 
7192     if ( set_nproc == 1) {
7193         KB_TRACE( 20, ("__kmp_load_balance_nproc: serial execution.\n" ) );
7194         return 1;
7195     }
7196 
7197     //
7198     // Threads that are active in the thread pool, active in the hot team
7199     // for this particular root (if we are at the outer par level), and
7200     // the currently executing thread (to become the master) are available
7201     // to add to the new team, but are currently contributing to the system
7202     // load, and must be accounted for.
7203     //
7204     pool_active = TCR_4(__kmp_thread_pool_active_nth);
7205     hot_team_active = __kmp_active_hot_team_nproc( root );
7206     team_curr_active = pool_active + hot_team_active + 1;
7207 
7208     //
7209     // Check the system load.
7210     //
7211     system_active = __kmp_get_load_balance( __kmp_avail_proc + team_curr_active );
7212     KB_TRACE( 30, ("__kmp_load_balance_nproc: system active = %d pool active = %d hot team active = %d\n",
7213       system_active, pool_active, hot_team_active ) );
7214 
7215     if ( system_active < 0 ) {
7216         //
7217         // There was an error reading the necessary info from /proc,
7218         // so use the thread limit algorithm instead.  Once we set
7219         // __kmp_global.g.g_dynamic_mode = dynamic_thread_limit,
7220         // we shouldn't wind up getting back here.
7221         //
7222         __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7223         KMP_WARNING( CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit" );
7224 
7225         //
7226         // Make this call behave like the thread limit algorithm.
7227         //
7228         retval = __kmp_avail_proc - __kmp_nth + (root->r.r_active ? 1
7229           : root->r.r_hot_team->t.t_nproc);
7230         if ( retval > set_nproc ) {
7231             retval = set_nproc;
7232         }
7233         if ( retval < KMP_MIN_NTH ) {
7234             retval = KMP_MIN_NTH;
7235         }
7236 
7237         KB_TRACE( 20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", retval ) );
7238         return retval;
7239     }
7240 
7241     //
7242     // There is a slight delay in the load balance algorithm in detecting
7243     // new running procs.  The real system load at this instant should be
7244     // at least as large as the number of active OMP threads that are available to
7245     // add to the team.
7246     //
7247     if ( system_active < team_curr_active ) {
7248         system_active = team_curr_active;
7249     }
7250     retval = __kmp_avail_proc - system_active + team_curr_active;
7251     if ( retval > set_nproc ) {
7252         retval = set_nproc;
7253     }
7254     if ( retval < KMP_MIN_NTH ) {
7255         retval = KMP_MIN_NTH;
7256     }
7257 
7258     KB_TRACE( 20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval ) );
7259     return retval;
7260 } // __kmp_load_balance_nproc()
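
//
// Worked example for the estimate above (numbers are hypothetical): with __kmp_avail_proc = 8,
// pool_active = 2 and hot_team_active = 0, team_curr_active = 3; a measured system_active = 6
// gives retval = 8 - 6 + 3 = 5, which is then clamped into [KMP_MIN_NTH, set_nproc], e.g. down
// to 4 when set_nproc = 4.
//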
7261 
7262 #endif /* USE_LOAD_BALANCE */
7263 
7264 /* ------------------------------------------------------------------------ */
7265 /* ------------------------------------------------------------------------ */
7266 
7267 /* NOTE: this is called with the __kmp_init_lock held */
7268 void
7269 __kmp_cleanup( void )
7270 {
7271     int f;
7272 
7273     KA_TRACE( 10, ("__kmp_cleanup: enter\n" ) );
7274 
7275     if (TCR_4(__kmp_init_parallel)) {
7276 #if KMP_HANDLE_SIGNALS
7277         __kmp_remove_signals();
7278 #endif
7279         TCW_4(__kmp_init_parallel, FALSE);
7280     }
7281 
7282     if (TCR_4(__kmp_init_middle)) {
7283 #if KMP_AFFINITY_SUPPORTED
7284         __kmp_affinity_uninitialize();
7285 #endif /* KMP_AFFINITY_SUPPORTED */
7286         __kmp_cleanup_hierarchy();
7287         TCW_4(__kmp_init_middle, FALSE);
7288     }
7289 
7290     KA_TRACE( 10, ("__kmp_cleanup: go serial cleanup\n" ) );
7291 
7292     if (__kmp_init_serial) {
7293         __kmp_runtime_destroy();
7294         __kmp_init_serial = FALSE;
7295     }
7296 
7297     for ( f = 0; f < __kmp_threads_capacity; f++ ) {
7298         if ( __kmp_root[ f ] != NULL ) {
7299             __kmp_free( __kmp_root[ f ] );
7300             __kmp_root[ f ] = NULL;
7301         }
7302     }
7303     __kmp_free( __kmp_threads );
7304     // __kmp_threads and __kmp_root were allocated at once, as a single block, so there is no need
7305     // to free __kmp_root separately.
7306     __kmp_threads = NULL;
7307     __kmp_root    = NULL;
7308     __kmp_threads_capacity = 0;
7309 
7310 #if KMP_USE_DYNAMIC_LOCK
7311     __kmp_cleanup_indirect_user_locks();
7312 #else
7313     __kmp_cleanup_user_locks();
7314 #endif
7315 
7316     #if KMP_AFFINITY_SUPPORTED
7317         KMP_INTERNAL_FREE( (void *) __kmp_cpuinfo_file );
7318         __kmp_cpuinfo_file = NULL;
7319     #endif /* KMP_AFFINITY_SUPPORTED */
7320 
7321     #if KMP_USE_ADAPTIVE_LOCKS
7322     #if KMP_DEBUG_ADAPTIVE_LOCKS
7323         __kmp_print_speculative_stats();
7324     #endif
7325     #endif
7326     KMP_INTERNAL_FREE( __kmp_nested_nth.nth );
7327     __kmp_nested_nth.nth = NULL;
7328     __kmp_nested_nth.size = 0;
7329     __kmp_nested_nth.used = 0;
7330     KMP_INTERNAL_FREE( __kmp_nested_proc_bind.bind_types );
7331     __kmp_nested_proc_bind.bind_types = NULL;
7332     __kmp_nested_proc_bind.size = 0;
7333     __kmp_nested_proc_bind.used = 0;
7334 
7335     __kmp_i18n_catclose();
7336 
7337 #if KMP_STATS_ENABLED
7338     __kmp_stats_fini();
7339 #endif
7340 
7341     KA_TRACE( 10, ("__kmp_cleanup: exit\n" ) );
7342 }
7343 
7344 /* ------------------------------------------------------------------------ */
7345 /* ------------------------------------------------------------------------ */
7346 
7347 int
7348 __kmp_ignore_mppbeg( void )
7349 {
7350     char *env;
7351 
7352     if ((env = getenv( "KMP_IGNORE_MPPBEG" )) != NULL) {
7353         if (__kmp_str_match_false( env ))
7354             return FALSE;
7355     }
7356     // By default __kmpc_begin() is a no-op.
7357     return TRUE;
7358 }
7359 
7360 int
7361 __kmp_ignore_mppend( void )
7362 {
7363     char *env;
7364 
7365     if ((env = getenv( "KMP_IGNORE_MPPEND" )) != NULL) {
7366         if (__kmp_str_match_false( env ))
7367             return FALSE;
7368     }
7369     // By default __kmpc_end() is a no-op.
7370     return TRUE;
7371 }
7372 
7373 void
7374 __kmp_internal_begin( void )
7375 {
7376     int gtid;
7377     kmp_root_t *root;
7378 
7379     /* this is a very important step as it will register new sibling threads
7380      * and assign these new uber threads a new gtid */
7381     gtid = __kmp_entry_gtid();
7382     root = __kmp_threads[ gtid ]->th.th_root;
7383     KMP_ASSERT( KMP_UBER_GTID( gtid ));
7384 
7385     if( root->r.r_begin ) return;
7386     __kmp_acquire_lock( &root->r.r_begin_lock, gtid );
7387     if( root->r.r_begin ) {
7388         __kmp_release_lock( & root->r.r_begin_lock, gtid );
7389         return;
7390     }
7391 
7392     root->r.r_begin = TRUE;
7393 
7394     __kmp_release_lock( & root->r.r_begin_lock, gtid );
7395 }
7396 
7397 
7398 /* ------------------------------------------------------------------------ */
7399 /* ------------------------------------------------------------------------ */
7400 
7401 void
7402 __kmp_user_set_library (enum library_type arg)
7403 {
7404     int gtid;
7405     kmp_root_t *root;
7406     kmp_info_t *thread;
7407 
7408     /* first, make sure we are initialized so we can get our gtid */
7409 
7410     gtid = __kmp_entry_gtid();
7411     thread = __kmp_threads[ gtid ];
7412 
7413     root = thread->th.th_root;
7414 
7415     KA_TRACE( 20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, library_serial ));
7416     if (root->r.r_in_parallel) { /* Must be called in serial section of top-level thread */
7417         KMP_WARNING( SetLibraryIncorrectCall );
7418         return;
7419     }
7420 
7421     switch ( arg ) {
7422     case library_serial :
7423         thread->th.th_set_nproc = 0;
7424         set__nproc( thread, 1 );
7425         break;
7426     case library_turnaround :
7427         thread->th.th_set_nproc = 0;
7428         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7429         break;
7430     case library_throughput :
7431         thread->th.th_set_nproc = 0;
7432         set__nproc( thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth : __kmp_dflt_team_nth_ub );
7433         break;
7434     default:
7435         KMP_FATAL( UnknownLibraryType, arg );
7436     }
7437 
7438     __kmp_aux_set_library ( arg );
7439 }
7440 
7441 void
7442 __kmp_aux_set_stacksize( size_t arg )
7443 {
7444     if (! __kmp_init_serial)
7445         __kmp_serial_initialize();
7446 
7447 #if KMP_OS_DARWIN
7448     if (arg & (0x1000 - 1)) {
7449         arg &= ~(0x1000 - 1);
7450         if(arg + 0x1000) /* check for overflow if we round up */
7451             arg += 0x1000;
7452     }
7453 #endif
7454     __kmp_acquire_bootstrap_lock( &__kmp_initz_lock );
7455 
7456     /* only change the default stacksize before the first parallel region */
7457     if (! TCR_4(__kmp_init_parallel)) {
7458         size_t value = arg;       /* argument is in bytes */
7459 
7460         if (value < __kmp_sys_min_stksize )
7461             value = __kmp_sys_min_stksize ;
7462         else if (value > KMP_MAX_STKSIZE)
7463             value = KMP_MAX_STKSIZE;
7464 
7465         __kmp_stksize = value;
7466 
7467         __kmp_env_stksize = TRUE;    /* was KMP_STACKSIZE specified? */
7468     }
7469 
7470     __kmp_release_bootstrap_lock( &__kmp_initz_lock );
7471 }
7472 
7473 /* set the behaviour of the runtime library */
7474 /* TODO this can cause some odd behaviour with sibling parallelism... */
7475 void
7476 __kmp_aux_set_library (enum library_type arg)
7477 {
7478     __kmp_library = arg;
7479 
7480     switch ( __kmp_library ) {
7481     case library_serial :
7482         {
7483             KMP_INFORM( LibraryIsSerial );
7484             (void) __kmp_change_library( TRUE );
7485         }
7486         break;
7487     case library_turnaround :
7488         (void) __kmp_change_library( TRUE );
7489         break;
7490     case library_throughput :
7491         (void) __kmp_change_library( FALSE );
7492         break;
7493     default:
7494         KMP_FATAL( UnknownLibraryType, arg );
7495     }
7496 }
7497 
7498 /* ------------------------------------------------------------------------ */
7499 /* ------------------------------------------------------------------------ */
7500 
7501 void
7502 __kmp_aux_set_blocktime (int arg, kmp_info_t *thread, int tid)
7503 {
7504     int blocktime = arg;        /* argument is in milliseconds */
7505 #if KMP_USE_MONITOR
7506     int bt_intervals;
7507 #endif
7508     int bt_set;
7509 
7510     __kmp_save_internal_controls( thread );
7511 
7512     /* Normalize and set blocktime for the teams */
7513     if (blocktime < KMP_MIN_BLOCKTIME)
7514         blocktime = KMP_MIN_BLOCKTIME;
7515     else if (blocktime > KMP_MAX_BLOCKTIME)
7516         blocktime = KMP_MAX_BLOCKTIME;
7517 
7518     set__blocktime_team( thread->th.th_team, tid, blocktime );
7519     set__blocktime_team( thread->th.th_serial_team, 0, blocktime );
7520 
7521 #if KMP_USE_MONITOR
7522     /* Calculate and set blocktime intervals for the teams */
7523     bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);
7524 
7525     set__bt_intervals_team( thread->th.th_team, tid, bt_intervals );
7526     set__bt_intervals_team( thread->th.th_serial_team, 0, bt_intervals );
7527 #endif
7528 
7529     /* Record that the blocktime has been explicitly set */
7530     bt_set = TRUE;
7531 
7532     set__bt_set_team( thread->th.th_team, tid, bt_set );
7533     set__bt_set_team( thread->th.th_serial_team, 0, bt_set );
7534 #if KMP_USE_MONITOR
7535     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
7536                   "bt_intervals=%d, monitor_updates=%d\n",
7537                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7538                   thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
7539                   __kmp_monitor_wakeups));
7540 #else
7541     KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
7542                   __kmp_gtid_from_tid(tid, thread->th.th_team),
7543                   thread->th.th_team->t.t_id, tid, blocktime));
7544 #endif
7545 }
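
/* Usage note: this is the backing routine for the kmp_set_blocktime() extension entry point
   (argument in milliseconds), e.g. kmp_set_blocktime( 0 ) so idle workers stop spinning and
   sleep right after a parallel region.  The entry-point name is cited from the Intel extension
   API and is defined elsewhere; only the clamping and per-team bookkeeping above live here. */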
7546 
7547 void
7548 __kmp_aux_set_defaults(
7549     char const * str,
7550     int          len
7551 ) {
7552     if ( ! __kmp_init_serial ) {
7553         __kmp_serial_initialize();
7554     };
7555     __kmp_env_initialize( str );
7556 
7557     if (__kmp_settings
7558 #if OMP_40_ENABLED
7559         || __kmp_display_env || __kmp_display_env_verbose
7560 #endif // OMP_40_ENABLED
7561         ) {
7562         __kmp_env_print();
7563     }
7564 } // __kmp_aux_set_defaults
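
// Usage note: this backs the kmp_set_defaults() extension, which lets a program inject
// KMP_/OMP_-style settings at run time, e.g. kmp_set_defaults( "KMP_BLOCKTIME=0" ).  The
// entry-point name and the example string are cited as an illustration of the extension API;
// the parsing itself is done by __kmp_env_initialize() above.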
7565 
7566 /* ------------------------------------------------------------------------ */
7567 
7568 /*
7569  * internal fast reduction routines
7570  */
7571 
7572 PACKED_REDUCTION_METHOD_T
7573 __kmp_determine_reduction_method( ident_t *loc, kmp_int32 global_tid,
7574         kmp_int32 num_vars, size_t reduce_size, void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
7575         kmp_critical_name *lck )
7576 {
7577 
7578     // Default reduction method: critical construct ( lck != NULL, like in current PAROPT )
7579     // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method can be selected by RTL
7580     // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method can be selected by RTL
7581     // Finally, it's up to the OpenMP RTL to decide which method to select among those generated by PAROPT.
7582 
7583     PACKED_REDUCTION_METHOD_T retval;
7584 
7585     int team_size;
7586 
7587     KMP_DEBUG_ASSERT( loc );    // it would be nice to test ( loc != 0 )
7588     KMP_DEBUG_ASSERT( lck );    // it would be nice to test ( lck != 0 )
7589 
7590     #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED ( ( loc->flags & ( KMP_IDENT_ATOMIC_REDUCE ) ) == ( KMP_IDENT_ATOMIC_REDUCE ) )
7591     #define FAST_REDUCTION_TREE_METHOD_GENERATED   ( ( reduce_data ) && ( reduce_func ) )
7592 
7593     retval = critical_reduce_block;
7594 
7595     team_size = __kmp_get_team_num_threads( global_tid ); // another way of getting the team size ( with 1 dynamic dereference ) is slower
7596 
7597     if( team_size == 1 ) {
7598 
7599         retval = empty_reduce_block;
7600 
7601     } else {
7602 
7603         int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7604         int tree_available   = FAST_REDUCTION_TREE_METHOD_GENERATED;
7605 
7606         #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64
7607 
7608             #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7609 
7610                 int teamsize_cutoff = 4;
7611 
7612                 #if KMP_ARCH_X86_64 && (KMP_OS_LINUX || KMP_OS_WINDOWS)
7613                 if( __kmp_mic_type != non_mic ) {
7614                     teamsize_cutoff = 8;
7615                 }
7616                 #endif
7617                 if( tree_available ) {
7618                     if( team_size <= teamsize_cutoff ) {
7619                         if ( atomic_available ) {
7620                             retval = atomic_reduce_block;
7621                         }
7622                     } else {
7623                         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7624                     }
7625                 } else if ( atomic_available ) {
7626                     retval = atomic_reduce_block;
7627                 }
7628             #else
7629                 #error "Unknown or unsupported OS"
7630             #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN
7631 
7632         #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
7633 
7634             #if KMP_OS_LINUX || KMP_OS_WINDOWS
7635 
7636                 // basic tuning
7637 
7638                 if( atomic_available ) {
7639                     if( num_vars <= 2 ) { // && ( team_size <= 8 ) due to false-sharing ???
7640                         retval = atomic_reduce_block;
7641                     }
7642                 } // otherwise: use critical section
7643 
7644             #elif KMP_OS_DARWIN
7645 
7646                 if( atomic_available && ( num_vars <= 3 ) ) {
7647                     retval = atomic_reduce_block;
7648                 } else if( tree_available ) {
7649                     if( ( reduce_size > ( 9 * sizeof( kmp_real64 ) ) ) && ( reduce_size < ( 2000 * sizeof( kmp_real64 ) ) ) ) {
7650                         retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
7651                     }
7652                 } // otherwise: use critical section
7653 
7654             #else
7655                 #error "Unknown or unsupported OS"
7656             #endif
7657 
7658         #else
7659             #error "Unknown or unsupported architecture"
7660         #endif
7661 
7662     }
7663 
7664     // KMP_FORCE_REDUCTION
7665 
7666     // If the team is serialized (team_size == 1), ignore the forced reduction
7667     // method and stay with the unsynchronized method (empty_reduce_block)
7668     if( __kmp_force_reduction_method != reduction_method_not_defined && team_size != 1) {
7669 
7670         PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;
7671 
7672         int atomic_available, tree_available;
7673 
7674         switch( ( forced_retval = __kmp_force_reduction_method ) )
7675         {
7676             case critical_reduce_block:
7677                 KMP_ASSERT( lck );              // lck should be != 0
7678                 break;
7679 
7680             case atomic_reduce_block:
7681                 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
7682                 if( ! atomic_available ) {
7683                     KMP_WARNING(RedMethodNotSupported, "atomic");
7684                     forced_retval = critical_reduce_block;
7685                 }
7686                 break;
7687 
7688             case tree_reduce_block:
7689                 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
7690                 if( ! tree_available ) {
7691                     KMP_WARNING(RedMethodNotSupported, "tree");
7692                     forced_retval = critical_reduce_block;
7693                 } else {
7694                     #if KMP_FAST_REDUCTION_BARRIER
7695                     forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
7696                     #endif
7697                 }
7698                 break;
7699 
7700             default:
7701                 KMP_ASSERT( 0 ); // "unsupported method specified"
7702         }
7703 
7704         retval = forced_retval;
7705     }
7706 
7707     KA_TRACE(10, ( "reduction method selected=%08x\n", retval ) );
7708 
7709     #undef FAST_REDUCTION_TREE_METHOD_GENERATED
7710     #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED
7711 
7712     return ( retval );
7713 }
7714 
7715 // this function is for testing set/get/determine reduce method
7716 kmp_int32
7717 __kmp_get_reduce_method( void ) {
7718     return ( ( __kmp_entry_thread()->th.th_local.packed_reduction_method ) >> 8 );
7719 }
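
// Note: the packed value combines the chosen reduction method with the barrier type used by the
// tree methods; the right shift by 8 above strips the barrier-type bits so that only the method
// part is returned.  The exact bit layout is defined by the packing macros in kmp.h, not here.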
7720 
7721 /* ------------------------------------------------------------------------ */
7722