/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}
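
/* Usage sketch (illustrative only, not a call site in this file): entry points
   that can be reached by threads the runtime has never seen resolve their gtid
   with the registering variant above, roughly:

     int gtid = __kmp_get_global_thread_id_reg(); // registers a new root
                                                  // thread if no gtid exists
     kmp_info_t *this_thr = __kmp_threads[gtid];

   Call sites typically go through helpers such as __kmp_entry_gtid(). */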
"initial" : "actual"); 288 } 289 } 290 291 /* No point in checking ubermaster threads since they use refinement and 292 * cannot overlap */ 293 gtid = __kmp_gtid_from_thread(th); 294 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 295 KA_TRACE(10, 296 ("__kmp_check_stack_overlap: performing extensive checking\n")); 297 if (stack_beg == NULL) { 298 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 299 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 300 } 301 302 for (f = 0; f < __kmp_threads_capacity; f++) { 303 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 304 305 if (f_th && f_th != th) { 306 char *other_stack_end = 307 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 308 char *other_stack_beg = 309 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 310 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 311 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 312 313 /* Print the other stack values before the abort */ 314 if (__kmp_storage_map) 315 __kmp_print_storage_map_gtid( 316 -1, other_stack_beg, other_stack_end, 317 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 318 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 319 320 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 321 __kmp_msg_null); 322 } 323 } 324 } 325 } 326 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 327 } 328 329 /* ------------------------------------------------------------------------ */ 330 331 void __kmp_infinite_loop(void) { 332 static int done = FALSE; 333 334 while (!done) { 335 KMP_YIELD(TRUE); 336 } 337 } 338 339 #define MAX_MESSAGE 512 340 341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 342 char const *format, ...) { 343 char buffer[MAX_MESSAGE]; 344 va_list ap; 345 346 va_start(ap, format); 347 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 348 p2, (unsigned long)size, format); 349 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 350 __kmp_vprintf(kmp_err, buffer, ap); 351 #if KMP_PRINT_DATA_PLACEMENT 352 int node; 353 if (gtid >= 0) { 354 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 355 if (__kmp_storage_map_verbose) { 356 node = __kmp_get_host_node(p1); 357 if (node < 0) /* doesn't work, so don't try this next time */ 358 __kmp_storage_map_verbose = FALSE; 359 else { 360 char *last; 361 int lastNode; 362 int localProc = __kmp_get_cpu_from_gtid(gtid); 363 364 const int page_size = KMP_GET_PAGE_SIZE(); 365 366 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 367 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 368 if (localProc >= 0) 369 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 370 localProc >> 1); 371 else 372 __kmp_printf_no_lock(" GTID %d\n", gtid); 373 #if KMP_USE_PRCTL 374 /* The more elaborate format is disabled for now because of the prctl 375 * hanging bug. */ 376 do { 377 last = p1; 378 lastNode = node; 379 /* This loop collates adjacent pages with the same host node. 
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS by default abort() causes pop-up error box, which stalls
       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for the DLL, but it is a problem
       for the static OpenMP RTL). SetErrorMode (and so, the timelimit utility)
       does not help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    __kmp_unregister_library();
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() { __kmp_init_memkind(); }
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
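
/* Pairing sketch (illustrative only): compiler-facing entry points bracket a
   SINGLE region with these two calls, roughly:

     if (__kmp_enter_single(gtid, loc, TRUE)) {
       // body executed by exactly one thread of the team
       __kmp_exit_single(gtid);
     }

   Any barrier implied by the construct is emitted separately by the caller. */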

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nthreads is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked earlier
   within the forkjoin critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // (i.e. during the parallel region that we are exiting).
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration
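
/* Pairing note (informal): propagateFPControl() is expected to run on the
   master at fork time so the team records the x87/MXCSR state that workers
   should adopt, while updateHWFPControl() runs at join time to restore any
   registers the region changed; both collapse to no-ops on non-x86 targets
   (see the #else branch above). */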

/* Run a parallel region that has been serialized, so it runs only in a team of
   the single master thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  ompt_data_t *implicit_task_data;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1174 global_tid, serial_team)); 1175 1176 /* TODO the above breaks the requirement that if we run out of resources, 1177 then we can still guarantee that serialized teams are ok, since we may 1178 need to allocate a new one */ 1179 } else { 1180 KF_TRACE( 1181 10, 1182 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1183 global_tid, serial_team)); 1184 } 1185 1186 /* we have to initialize this serial team */ 1187 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1188 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1189 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1190 serial_team->t.t_ident = loc; 1191 serial_team->t.t_serialized = 1; 1192 serial_team->t.t_nproc = 1; 1193 serial_team->t.t_parent = this_thr->th.th_team; 1194 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1195 this_thr->th.th_team = serial_team; 1196 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1197 1198 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1199 this_thr->th.th_current_task)); 1200 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1201 this_thr->th.th_current_task->td_flags.executing = 0; 1202 1203 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1204 1205 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1206 implicit task for each serialized task represented by 1207 team->t.t_serialized? */ 1208 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1209 &this_thr->th.th_current_task->td_parent->td_icvs); 1210 1211 // Thread value exists in the nested nthreads array for the next nested 1212 // level 1213 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1214 this_thr->th.th_current_task->td_icvs.nproc = 1215 __kmp_nested_nth.nth[level + 1]; 1216 } 1217 1218 if (__kmp_nested_proc_bind.used && 1219 (level + 1 < __kmp_nested_proc_bind.used)) { 1220 this_thr->th.th_current_task->td_icvs.proc_bind = 1221 __kmp_nested_proc_bind.bind_types[level + 1]; 1222 } 1223 1224 #if USE_DEBUGGER 1225 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. Content was swapped.

    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        int flags = OMPT_INVOKER(call_context) |
                    ((microtask == (microtask_t)__kmp_teams_master)
                         ? ompt_parallel_league
                         : ompt_parallel_team);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
            return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is start of parallel that is nested inside teams construct.
      // The team is actual (hot), all workers are ready at the fork barrier.
      // No lock needed to initialize the team a bit, then free workers.
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but not increase the serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

        if (call_context == fork_context_gnu) {
          // AC: need to decrement t_serialized for enquiry functions to work
          // correctly, will restore at join time
          parent_team->t.t_serialized--;
          return TRUE;
        }

#if OMPT_SUPPORT
        void *dummy;
        void **exit_frame_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. Content was swapped.

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_frame_p = &dummy;
        }
#endif
        // AC: need to decrement t_serialized for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_frame_p
#endif
                                 );
        }

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          *exit_frame_p = NULL;
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }
          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
          __ompt_lw_taskteam_unlink(master_th);
          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context) | ompt_parallel_team,
                return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_lw_taskteam_t lw_taskteam;
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                &ompt_parallel_data, return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
      }
#endif

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: only can reduce number of threads dynamically, can't increase
          kmp_info_t **other_threads = parent_team->t.t_threads;
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
            other_threads[i]->th.th_team_nproc = master_set_numthreads;
          }
          // Keep extra threads hot in the team for possible next parallels
        }
        master_th->th.th_set_nproc = 0;
      }

#if USE_DEBUGGER
      if (__kmp_debugging) { // Let debugger override number of threads.
        int nth = __kmp_omp_num_threads(loc);
        if (nth > 0) { // 0 means debugger doesn't want to change num threads
          master_set_numthreads = nth;
        }
      }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
           KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode == 3 &&
          parent_team->t.t_active_level == 1 // only report frames at level 1
          && master_th->th.th_teams_size.nteams == 1) {
        kmp_uint64 tmp_time = __itt_get_timestamp();
        master_th->th.th_frame_time = tmp_time;
        parent_team->t.t_region_time = tmp_time;
      }
      if (__itt_stack_caller_create_ptr) {
        // create new stack stitching id before entering fork barrier
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));
      __kmp_internal_fork(loc, gtid, parent_team);
      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));

      if (call_context == fork_context_gnu)
        return TRUE;

      /* Invoke microtask for MASTER thread */
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));

      if (!parent_team->t.t_invoke(gtid)) {
        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
      }
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));
      KMP_MB(); /* Flush all pending memory write invalidates. */

      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));

      return TRUE;
    } // Parallel closely nested in teams construct

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
    }
#endif

    if (parent_team->t.t_active_level >=
        master_th->th.th_current_task->td_icvs.max_active_levels) {
      nthreads = 1;
    } else {
      int enter_teams = ((ap == NULL && active_level == 0) ||
                         (ap && teams_level > 0 && teams_level == level));
      nthreads =
          master_set_numthreads
              ? master_set_numthreads
              : get__nproc_2(
                    parent_team,
                    master_tid); // TODO: get nproc directly from current task
(no need for serialized 1636 // parallel out of teams construct). This code moved here from 1637 // __kmp_reserve_threads() to speedup nested serialized parallels. 1638 if (nthreads > 1) { 1639 if ((get__max_active_levels(master_th) == 1 && 1640 (root->r.r_in_parallel && !enter_teams)) || 1641 (__kmp_library == library_serial)) { 1642 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1643 " threads\n", 1644 gtid, nthreads)); 1645 nthreads = 1; 1646 } 1647 } 1648 if (nthreads > 1) { 1649 /* determine how many new threads we can use */ 1650 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1651 /* AC: If we execute teams from parallel region (on host), then teams 1652 should be created but each can only have 1 thread if nesting is 1653 disabled. If teams called from serial region, then teams and their 1654 threads should be created regardless of the nesting setting. */ 1655 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1656 nthreads, enter_teams); 1657 if (nthreads == 1) { 1658 // Free lock for single thread execution here; for multi-thread 1659 // execution it will be freed later after team of threads created 1660 // and initialized 1661 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1662 } 1663 } 1664 } 1665 KMP_DEBUG_ASSERT(nthreads > 0); 1666 1667 // If we temporarily changed the set number of threads then restore it now 1668 master_th->th.th_set_nproc = 0; 1669 1670 /* create a serialized parallel region? */ 1671 if (nthreads == 1) { 1672 /* josh todo: hypothetical question: what do we do for OS X*? */ 1673 #if KMP_OS_LINUX && \ 1674 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1675 void *args[argc]; 1676 #else 1677 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1678 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1679 KMP_ARCH_AARCH64) */ 1680 1681 KA_TRACE(20, 1682 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1683 1684 __kmpc_serialized_parallel(loc, gtid); 1685 1686 if (call_context == fork_context_intel) { 1687 /* TODO this sucks, use the compiler itself to pass args! :) */ 1688 master_th->th.th_serial_team->t.t_ident = loc; 1689 if (!ap) { 1690 // revert change made in __kmpc_serialized_parallel() 1691 master_th->th.th_serial_team->t.t_level--; 1692 // Get args from parent team for teams construct 1693 1694 #if OMPT_SUPPORT 1695 void *dummy; 1696 void **exit_frame_p; 1697 ompt_task_info_t *task_info; 1698 1699 ompt_lw_taskteam_t lw_taskteam; 1700 1701 if (ompt_enabled.enabled) { 1702 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1703 &ompt_parallel_data, return_address); 1704 1705 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1706 // don't use lw_taskteam after linking. 
content was swaped 1707 1708 task_info = OMPT_CUR_TASK_INFO(master_th); 1709 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1710 if (ompt_enabled.ompt_callback_implicit_task) { 1711 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1712 __kmp_tid_from_gtid(gtid); 1713 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1714 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1715 &(task_info->task_data), 1, 1716 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1717 ompt_task_implicit); 1718 } 1719 1720 /* OMPT state */ 1721 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1722 } else { 1723 exit_frame_p = &dummy; 1724 } 1725 #endif 1726 1727 { 1728 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1729 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1730 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1731 parent_team->t.t_argv 1732 #if OMPT_SUPPORT 1733 , 1734 exit_frame_p 1735 #endif 1736 ); 1737 } 1738 1739 #if OMPT_SUPPORT 1740 if (ompt_enabled.enabled) { 1741 *exit_frame_p = NULL; 1742 if (ompt_enabled.ompt_callback_implicit_task) { 1743 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1744 ompt_scope_end, NULL, &(task_info->task_data), 1, 1745 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1746 ompt_task_implicit); 1747 } 1748 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1749 __ompt_lw_taskteam_unlink(master_th); 1750 if (ompt_enabled.ompt_callback_parallel_end) { 1751 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1752 &ompt_parallel_data, parent_task_data, 1753 OMPT_INVOKER(call_context) | ompt_parallel_team, 1754 return_address); 1755 } 1756 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1757 } 1758 #endif 1759 } else if (microtask == (microtask_t)__kmp_teams_master) { 1760 KMP_DEBUG_ASSERT(master_th->th.th_team == 1761 master_th->th.th_serial_team); 1762 team = master_th->th.th_team; 1763 // team->t.t_pkfn = microtask; 1764 team->t.t_invoke = invoker; 1765 __kmp_alloc_argv_entries(argc, team, TRUE); 1766 team->t.t_argc = argc; 1767 argv = (void **)team->t.t_argv; 1768 if (ap) { 1769 for (i = argc - 1; i >= 0; --i) 1770 *argv++ = va_arg(kmp_va_deref(ap), void *); 1771 } else { 1772 for (i = 0; i < argc; ++i) 1773 // Get args from parent team for teams construct 1774 argv[i] = parent_team->t.t_argv[i]; 1775 } 1776 // AC: revert change made in __kmpc_serialized_parallel() 1777 // because initial code in teams should have level=0 1778 team->t.t_level--; 1779 // AC: call special invoker for outer "parallel" of teams construct 1780 invoker(gtid); 1781 #if OMPT_SUPPORT 1782 if (ompt_enabled.enabled) { 1783 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1784 if (ompt_enabled.ompt_callback_implicit_task) { 1785 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1786 ompt_scope_end, NULL, &(task_info->task_data), 0, 1787 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1788 } 1789 if (ompt_enabled.ompt_callback_parallel_end) { 1790 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1791 &ompt_parallel_data, parent_task_data, 1792 OMPT_INVOKER(call_context) | ompt_parallel_league, 1793 return_address); 1794 } 1795 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1796 } 1797 #endif 1798 } else { 1799 argv = args; 1800 for (i = argc - 1; i >= 0; --i) 1801 *argv++ = va_arg(kmp_va_deref(ap), void *); 1802 KMP_MB(); 1803 1804 #if OMPT_SUPPORT 1805 void *dummy; 1806 void **exit_frame_p; 1807 ompt_task_info_t *task_info; 1808 1809 ompt_lw_taskteam_t lw_taskteam; 1810 1811 if (ompt_enabled.enabled) { 1812 
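          // Note: no real team is formed for this serialized region, so the
          // OMPT bookkeeping below goes through a lightweight task team linked
          // onto the master thread; tools still observe the implicit-task
          // begin/end and parallel-end events for it with a team size of 1.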
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1813 &ompt_parallel_data, return_address); 1814 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1815 // don't use lw_taskteam after linking. content was swaped 1816 task_info = OMPT_CUR_TASK_INFO(master_th); 1817 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1818 1819 /* OMPT implicit task begin */ 1820 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1821 if (ompt_enabled.ompt_callback_implicit_task) { 1822 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1823 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1824 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1825 ompt_task_implicit); 1826 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1827 __kmp_tid_from_gtid(gtid); 1828 } 1829 1830 /* OMPT state */ 1831 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1832 } else { 1833 exit_frame_p = &dummy; 1834 } 1835 #endif 1836 1837 { 1838 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1839 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1840 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1841 #if OMPT_SUPPORT 1842 , 1843 exit_frame_p 1844 #endif 1845 ); 1846 } 1847 1848 #if OMPT_SUPPORT 1849 if (ompt_enabled.enabled) { 1850 *exit_frame_p = NULL; 1851 if (ompt_enabled.ompt_callback_implicit_task) { 1852 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1853 ompt_scope_end, NULL, &(task_info->task_data), 1, 1854 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1855 ompt_task_implicit); 1856 } 1857 1858 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1859 __ompt_lw_taskteam_unlink(master_th); 1860 if (ompt_enabled.ompt_callback_parallel_end) { 1861 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1862 &ompt_parallel_data, parent_task_data, 1863 OMPT_INVOKER(call_context) | ompt_parallel_team, 1864 return_address); 1865 } 1866 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1867 } 1868 #endif 1869 } 1870 } else if (call_context == fork_context_gnu) { 1871 #if OMPT_SUPPORT 1872 ompt_lw_taskteam_t lwt; 1873 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1874 return_address); 1875 1876 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1877 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1878 // don't use lw_taskteam after linking. 
      // content was swapped
#endif

      // we were called from GNU native code
      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
      return FALSE;
    } else {
      KMP_ASSERT2(call_context < fork_context_last,
                  "__kmp_fork_call: unknown fork_context parameter");
    }

    KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
    KMP_MB();
    return FALSE;
  } // if (nthreads == 1)

  // GEH: only modify the executing flag in the case when not serialized
  // serialized case is handled in kmpc_serialized_parallel
  KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
                "curtask=%p, curtask_max_aclevel=%d\n",
                parent_team->t.t_active_level, master_th,
                master_th->th.th_current_task,
                master_th->th.th_current_task->td_icvs.max_active_levels));
  // TODO: GEH - cannot do this assertion because root thread not set up as
  // executing
  // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
  master_th->th.th_current_task->td_flags.executing = 0;

  if (!master_th->th.th_teams_microtask || level > teams_level) {
    /* Increment our nested depth level */
    KMP_ATOMIC_INC(&root->r.r_in_parallel);
  }

  // See if we need to make a copy of the ICVs.
  int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
  if ((level + 1 < __kmp_nested_nth.used) &&
      (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
    nthreads_icv = __kmp_nested_nth.nth[level + 1];
  } else {
    nthreads_icv = 0; // don't update
  }

  // Figure out the proc_bind_policy for the new team.
  kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
  kmp_proc_bind_t proc_bind_icv =
      proc_bind_default; // proc_bind_default means don't update
  if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else {
    if (proc_bind == proc_bind_default) {
      // No proc_bind clause specified; use current proc-bind-var for this
      // parallel region
      proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
    }
    /* else: The proc_bind policy was specified explicitly on parallel clause.
       This overrides proc-bind-var for this parallel region, but does not
       change proc-bind-var. */
    // Figure the value of proc-bind-var for the child threads.
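    // Illustrative example (user-level sketch, not part of the runtime): with
    //   OMP_PROC_BIND=spread,close
    // the enclosing region's proc-bind-var is 'spread', and
    // __kmp_nested_proc_bind.bind_types[level + 1] supplies 'close' for the
    // child threads, which is recorded in proc_bind_icv below.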
1936 if ((level + 1 < __kmp_nested_proc_bind.used) && 1937 (__kmp_nested_proc_bind.bind_types[level + 1] != 1938 master_th->th.th_current_task->td_icvs.proc_bind)) { 1939 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1940 } 1941 } 1942 1943 // Reset for next parallel region 1944 master_th->th.th_set_proc_bind = proc_bind_default; 1945 1946 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1947 kmp_internal_control_t new_icvs; 1948 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1949 new_icvs.next = NULL; 1950 if (nthreads_icv > 0) { 1951 new_icvs.nproc = nthreads_icv; 1952 } 1953 if (proc_bind_icv != proc_bind_default) { 1954 new_icvs.proc_bind = proc_bind_icv; 1955 } 1956 1957 /* allocate a new parallel team */ 1958 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1959 team = __kmp_allocate_team(root, nthreads, nthreads, 1960 #if OMPT_SUPPORT 1961 ompt_parallel_data, 1962 #endif 1963 proc_bind, &new_icvs, 1964 argc USE_NESTED_HOT_ARG(master_th)); 1965 } else { 1966 /* allocate a new parallel team */ 1967 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1968 team = __kmp_allocate_team(root, nthreads, nthreads, 1969 #if OMPT_SUPPORT 1970 ompt_parallel_data, 1971 #endif 1972 proc_bind, 1973 &master_th->th.th_current_task->td_icvs, 1974 argc USE_NESTED_HOT_ARG(master_th)); 1975 } 1976 KF_TRACE( 1977 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 1978 1979 /* setup the new team */ 1980 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 1981 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 1982 KMP_CHECK_UPDATE(team->t.t_ident, loc); 1983 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 1984 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 1985 #if OMPT_SUPPORT 1986 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 1987 return_address); 1988 #endif 1989 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 1990 // TODO: parent_team->t.t_level == INT_MAX ??? 1991 if (!master_th->th.th_teams_microtask || level > teams_level) { 1992 int new_level = parent_team->t.t_level + 1; 1993 KMP_CHECK_UPDATE(team->t.t_level, new_level); 1994 new_level = parent_team->t.t_active_level + 1; 1995 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 1996 } else { 1997 // AC: Do not increase parallel level at start of the teams construct 1998 int new_level = parent_team->t.t_level; 1999 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2000 new_level = parent_team->t.t_active_level; 2001 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2002 } 2003 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2004 // set master's schedule as new run-time schedule 2005 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2006 2007 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2008 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2009 2010 // Update the floating point rounding in the team if required. 2011 propagateFPControl(team); 2012 2013 if (__kmp_tasking_mode != tskm_immediate_exec) { 2014 // Set master's task team to team's task team. Unless this is hot team, it 2015 // should be NULL. 
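    // Note: before switching to the new team the master saves its current
    // th_task_state on th_task_state_memo_stack (doubling the stack when it is
    // full); the matching pop that restores the state happens at join time in
    // __kmp_join_call().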
2016 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2017 parent_team->t.t_task_team[master_th->th.th_task_state]); 2018 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2019 "%p, new task_team %p / team %p\n", 2020 __kmp_gtid_from_thread(master_th), 2021 master_th->th.th_task_team, parent_team, 2022 team->t.t_task_team[master_th->th.th_task_state], team)); 2023 2024 if (active_level || master_th->th.th_task_team) { 2025 // Take a memo of master's task_state 2026 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2027 if (master_th->th.th_task_state_top >= 2028 master_th->th.th_task_state_stack_sz) { // increase size 2029 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2030 kmp_uint8 *old_stack, *new_stack; 2031 kmp_uint32 i; 2032 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2033 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2034 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2035 } 2036 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2037 ++i) { // zero-init rest of stack 2038 new_stack[i] = 0; 2039 } 2040 old_stack = master_th->th.th_task_state_memo_stack; 2041 master_th->th.th_task_state_memo_stack = new_stack; 2042 master_th->th.th_task_state_stack_sz = new_size; 2043 __kmp_free(old_stack); 2044 } 2045 // Store master's task_state on stack 2046 master_th->th 2047 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2048 master_th->th.th_task_state; 2049 master_th->th.th_task_state_top++; 2050 #if KMP_NESTED_HOT_TEAMS 2051 if (master_th->th.th_hot_teams && 2052 active_level < __kmp_hot_teams_max_level && 2053 team == master_th->th.th_hot_teams[active_level].hot_team) { 2054 // Restore master's nested state if nested hot team 2055 master_th->th.th_task_state = 2056 master_th->th 2057 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2058 } else { 2059 #endif 2060 master_th->th.th_task_state = 0; 2061 #if KMP_NESTED_HOT_TEAMS 2062 } 2063 #endif 2064 } 2065 #if !KMP_NESTED_HOT_TEAMS 2066 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2067 (team == root->r.r_hot_team)); 2068 #endif 2069 } 2070 2071 KA_TRACE( 2072 20, 2073 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2074 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2075 team->t.t_nproc)); 2076 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2077 (team->t.t_master_tid == 0 && 2078 (team->t.t_parent == root->r.r_root_team || 2079 team->t.t_parent->t.t_serialized))); 2080 KMP_MB(); 2081 2082 /* now, setup the arguments */ 2083 argv = (void **)team->t.t_argv; 2084 if (ap) { 2085 for (i = argc - 1; i >= 0; --i) { 2086 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2087 KMP_CHECK_UPDATE(*argv, new_argv); 2088 argv++; 2089 } 2090 } else { 2091 for (i = 0; i < argc; ++i) { 2092 // Get args from parent team for teams construct 2093 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2094 } 2095 } 2096 2097 /* now actually fork the threads */ 2098 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2099 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2100 root->r.r_active = TRUE; 2101 2102 __kmp_fork_team_threads(root, team, master_th, gtid); 2103 __kmp_setup_icv_copy(team, nthreads, 2104 &master_th->th.th_current_task->td_icvs, loc); 2105 2106 #if OMPT_SUPPORT 2107 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2108 #endif 2109 2110 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2111 2112 #if USE_ITT_BUILD 2113 if 
(team->t.t_active_level == 1 // only report frames at level 1 2114 && !master_th->th.th_teams_microtask) { // not in teams construct 2115 #if USE_ITT_NOTIFY 2116 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2117 (__kmp_forkjoin_frames_mode == 3 || 2118 __kmp_forkjoin_frames_mode == 1)) { 2119 kmp_uint64 tmp_time = 0; 2120 if (__itt_get_timestamp_ptr) 2121 tmp_time = __itt_get_timestamp(); 2122 // Internal fork - report frame begin 2123 master_th->th.th_frame_time = tmp_time; 2124 if (__kmp_forkjoin_frames_mode == 3) 2125 team->t.t_region_time = tmp_time; 2126 } else 2127 // only one notification scheme (either "submit" or "forking/joined", not both) 2128 #endif /* USE_ITT_NOTIFY */ 2129 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2130 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2131 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2132 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2133 } 2134 } 2135 #endif /* USE_ITT_BUILD */ 2136 2137 /* now go on and do the work */ 2138 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2139 KMP_MB(); 2140 KF_TRACE(10, 2141 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2142 root, team, master_th, gtid)); 2143 2144 #if USE_ITT_BUILD 2145 if (__itt_stack_caller_create_ptr) { 2146 team->t.t_stack_id = 2147 __kmp_itt_stack_caller_create(); // create new stack stitching id 2148 // before entering fork barrier 2149 } 2150 #endif /* USE_ITT_BUILD */ 2151 2152 // AC: skip __kmp_internal_fork at teams construct, let only master 2153 // threads execute 2154 if (ap) { 2155 __kmp_internal_fork(loc, gtid, team); 2156 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2157 "master_th=%p, gtid=%d\n", 2158 root, team, master_th, gtid)); 2159 } 2160 2161 if (call_context == fork_context_gnu) { 2162 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2163 return TRUE; 2164 } 2165 2166 /* Invoke microtask for MASTER thread */ 2167 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2168 team->t.t_id, team->t.t_pkfn)); 2169 } // END of timer KMP_fork_call block 2170 2171 #if KMP_STATS_ENABLED 2172 // If beginning a teams construct, then change thread state 2173 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2174 if (!ap) { 2175 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2176 } 2177 #endif 2178 2179 if (!team->t.t_invoke(gtid)) { 2180 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2181 } 2182 2183 #if KMP_STATS_ENABLED 2184 // If was beginning of a teams construct, then reset thread state 2185 if (!ap) { 2186 KMP_SET_THREAD_STATE(previous_state); 2187 } 2188 #endif 2189 2190 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2191 team->t.t_id, team->t.t_pkfn)); 2192 KMP_MB(); /* Flush all pending memory write invalidates. */ 2193 2194 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2195 2196 #if OMPT_SUPPORT 2197 if (ompt_enabled.enabled) { 2198 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2199 } 2200 #endif 2201 2202 return TRUE; 2203 } 2204 2205 #if OMPT_SUPPORT 2206 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2207 kmp_team_t *team) { 2208 // restore state outside the region 2209 thread->th.ompt_thread_info.state = 2210 ((team->t.t_serialized) ? 
ompt_state_work_serial 2211 : ompt_state_work_parallel); 2212 } 2213 2214 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2215 kmp_team_t *team, ompt_data_t *parallel_data, 2216 int flags, void *codeptr) { 2217 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2218 if (ompt_enabled.ompt_callback_parallel_end) { 2219 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2220 parallel_data, &(task_info->task_data), flags, codeptr); 2221 } 2222 2223 task_info->frame.enter_frame = ompt_data_none; 2224 __kmp_join_restore_state(thread, team); 2225 } 2226 #endif 2227 2228 void __kmp_join_call(ident_t *loc, int gtid 2229 #if OMPT_SUPPORT 2230 , 2231 enum fork_context_e fork_context 2232 #endif 2233 , 2234 int exit_teams) { 2235 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2236 kmp_team_t *team; 2237 kmp_team_t *parent_team; 2238 kmp_info_t *master_th; 2239 kmp_root_t *root; 2240 int master_active; 2241 2242 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2243 2244 /* setup current data */ 2245 master_th = __kmp_threads[gtid]; 2246 root = master_th->th.th_root; 2247 team = master_th->th.th_team; 2248 parent_team = team->t.t_parent; 2249 2250 master_th->th.th_ident = loc; 2251 2252 #if OMPT_SUPPORT 2253 void *team_microtask = (void *)team->t.t_pkfn; 2254 // For GOMP interface with serialized parallel, need the 2255 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2256 // and end-parallel events. 2257 if (ompt_enabled.enabled && 2258 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2259 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2260 } 2261 #endif 2262 2263 #if KMP_DEBUG 2264 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2265 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2266 "th_task_team = %p\n", 2267 __kmp_gtid_from_thread(master_th), team, 2268 team->t.t_task_team[master_th->th.th_task_state], 2269 master_th->th.th_task_team)); 2270 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2271 team->t.t_task_team[master_th->th.th_task_state]); 2272 } 2273 #endif 2274 2275 if (team->t.t_serialized) { 2276 if (master_th->th.th_teams_microtask) { 2277 // We are in teams construct 2278 int level = team->t.t_level; 2279 int tlevel = master_th->th.th_teams_level; 2280 if (level == tlevel) { 2281 // AC: we haven't incremented it earlier at start of teams construct, 2282 // so do it here - at the end of teams construct 2283 team->t.t_level++; 2284 } else if (level == tlevel + 1) { 2285 // AC: we are exiting parallel inside teams, need to increment 2286 // serialization in order to restore it in the next call to 2287 // __kmpc_end_serialized_parallel 2288 team->t.t_serialized++; 2289 } 2290 } 2291 __kmpc_end_serialized_parallel(loc, gtid); 2292 2293 #if OMPT_SUPPORT 2294 if (ompt_enabled.enabled) { 2295 __kmp_join_restore_state(master_th, parent_team); 2296 } 2297 #endif 2298 2299 return; 2300 } 2301 2302 master_active = team->t.t_master_active; 2303 2304 if (!exit_teams) { 2305 // AC: No barrier for internal teams at exit from teams construct. 2306 // But there is barrier for external team (league). 
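    // Illustrative user-level sketch (not part of the runtime):
    //   #pragma omp teams num_teams(2)
    //   #pragma omp parallel
    //   { ... }
    // Joining each inner 'parallel' takes the __kmp_internal_join() path below
    // (exit_teams == 0); leaving the 'teams' construct itself is signalled by
    // exit_teams != 0 and skips this barrier, as the comment above describes.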
2307 __kmp_internal_join(loc, gtid, team); 2308 } else { 2309 master_th->th.th_task_state = 2310 0; // AC: no tasking in teams (out of any parallel) 2311 } 2312 2313 KMP_MB(); 2314 2315 #if OMPT_SUPPORT 2316 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2317 void *codeptr = team->t.ompt_team_info.master_return_address; 2318 #endif 2319 2320 #if USE_ITT_BUILD 2321 if (__itt_stack_caller_create_ptr) { 2322 // destroy the stack stitching id after join barrier 2323 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2324 } 2325 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2326 if (team->t.t_active_level == 1 && 2327 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2328 master_th->th.th_teams_size.nteams == 1)) { 2329 master_th->th.th_ident = loc; 2330 // only one notification scheme (either "submit" or "forking/joined", not 2331 // both) 2332 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2333 __kmp_forkjoin_frames_mode == 3) 2334 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2335 master_th->th.th_frame_time, 0, loc, 2336 master_th->th.th_team_nproc, 1); 2337 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2338 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2339 __kmp_itt_region_joined(gtid); 2340 } // active_level == 1 2341 #endif /* USE_ITT_BUILD */ 2342 2343 if (master_th->th.th_teams_microtask && !exit_teams && 2344 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2345 team->t.t_level == master_th->th.th_teams_level + 1) { 2346 // AC: We need to leave the team structure intact at the end of parallel 2347 // inside the teams construct, so that at the next parallel same (hot) team 2348 // works, only adjust nesting levels 2349 #if OMPT_SUPPORT 2350 ompt_data_t ompt_parallel_data = ompt_data_none; 2351 if (ompt_enabled.enabled) { 2352 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2353 if (ompt_enabled.ompt_callback_implicit_task) { 2354 int ompt_team_size = team->t.t_nproc; 2355 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2356 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2357 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2358 } 2359 task_info->frame.exit_frame = ompt_data_none; 2360 task_info->task_data = ompt_data_none; 2361 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2362 __ompt_lw_taskteam_unlink(master_th); 2363 } 2364 #endif 2365 /* Decrement our nested depth level */ 2366 team->t.t_level--; 2367 team->t.t_active_level--; 2368 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2369 2370 // Restore number of threads in the team if needed. This code relies on 2371 // the proper adjustment of th_teams_size.nth after the fork in 2372 // __kmp_teams_master on each teams master in the case that 2373 // __kmp_reserve_threads reduced it. 2374 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2375 int old_num = master_th->th.th_team_nproc; 2376 int new_num = master_th->th.th_teams_size.nth; 2377 kmp_info_t **other_threads = team->t.t_threads; 2378 team->t.t_nproc = new_num; 2379 for (int i = 0; i < old_num; ++i) { 2380 other_threads[i]->th.th_team_nproc = new_num; 2381 } 2382 // Adjust states of non-used threads of the team 2383 for (int i = old_num; i < new_num; ++i) { 2384 // Re-initialize thread's barrier data. 
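        // These threads sat out the preceding region, so their per-barrier
        // b_arrived counters are resynchronized with the team's current
        // barrier state before they are counted as team members again.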
2385 KMP_DEBUG_ASSERT(other_threads[i]); 2386 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2387 for (int b = 0; b < bs_last_barrier; ++b) { 2388 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2389 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2390 #if USE_DEBUGGER 2391 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2392 #endif 2393 } 2394 if (__kmp_tasking_mode != tskm_immediate_exec) { 2395 // Synchronize thread's task state 2396 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2397 } 2398 } 2399 } 2400 2401 #if OMPT_SUPPORT 2402 if (ompt_enabled.enabled) { 2403 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2404 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2405 } 2406 #endif 2407 2408 return; 2409 } 2410 2411 /* do cleanup and restore the parent team */ 2412 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2413 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2414 2415 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2416 2417 /* jc: The following lock has instructions with REL and ACQ semantics, 2418 separating the parallel user code called in this parallel region 2419 from the serial user code called after this function returns. */ 2420 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2421 2422 if (!master_th->th.th_teams_microtask || 2423 team->t.t_level > master_th->th.th_teams_level) { 2424 /* Decrement our nested depth level */ 2425 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2426 } 2427 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2428 2429 #if OMPT_SUPPORT 2430 if (ompt_enabled.enabled) { 2431 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2432 if (ompt_enabled.ompt_callback_implicit_task) { 2433 int flags = (team_microtask == (void *)__kmp_teams_master) 2434 ? ompt_task_initial 2435 : ompt_task_implicit; 2436 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2437 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2438 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2439 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2440 } 2441 task_info->frame.exit_frame = ompt_data_none; 2442 task_info->task_data = ompt_data_none; 2443 } 2444 #endif 2445 2446 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2447 master_th, team)); 2448 __kmp_pop_current_task_from_thread(master_th); 2449 2450 #if KMP_AFFINITY_SUPPORTED 2451 // Restore master thread's partition. 2452 master_th->th.th_first_place = team->t.t_first_place; 2453 master_th->th.th_last_place = team->t.t_last_place; 2454 #endif // KMP_AFFINITY_SUPPORTED 2455 master_th->th.th_def_allocator = team->t.t_def_allocator; 2456 2457 updateHWFPControl(team); 2458 2459 if (root->r.r_active != master_active) 2460 root->r.r_active = master_active; 2461 2462 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2463 master_th)); // this will free worker threads 2464 2465 /* this race was fun to find. make sure the following is in the critical 2466 region otherwise assertions may fail occasionally since the old team may be 2467 reallocated and the hierarchy appears inconsistent. it is actually safe to 2468 run and won't cause any bugs, but will cause those assertion failures. 
it's 2469 only one deref&assign so might as well put this in the critical region */ 2470 master_th->th.th_team = parent_team; 2471 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2472 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2473 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2474 2475 /* restore serialized team, if need be */ 2476 if (parent_team->t.t_serialized && 2477 parent_team != master_th->th.th_serial_team && 2478 parent_team != root->r.r_root_team) { 2479 __kmp_free_team(root, 2480 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2481 master_th->th.th_serial_team = parent_team; 2482 } 2483 2484 if (__kmp_tasking_mode != tskm_immediate_exec) { 2485 if (master_th->th.th_task_state_top > 2486 0) { // Restore task state from memo stack 2487 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2488 // Remember master's state if we re-use this nested hot team 2489 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2490 master_th->th.th_task_state; 2491 --master_th->th.th_task_state_top; // pop 2492 // Now restore state at this level 2493 master_th->th.th_task_state = 2494 master_th->th 2495 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2496 } 2497 // Copy the task team from the parent team to the master thread 2498 master_th->th.th_task_team = 2499 parent_team->t.t_task_team[master_th->th.th_task_state]; 2500 KA_TRACE(20, 2501 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2502 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2503 parent_team)); 2504 } 2505 2506 // TODO: GEH - cannot do this assertion because root thread not set up as 2507 // executing 2508 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2509 master_th->th.th_current_task->td_flags.executing = 1; 2510 2511 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2512 2513 #if OMPT_SUPPORT 2514 int flags = 2515 OMPT_INVOKER(fork_context) | 2516 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2517 : ompt_parallel_team); 2518 if (ompt_enabled.enabled) { 2519 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2520 codeptr); 2521 } 2522 #endif 2523 2524 KMP_MB(); 2525 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2526 } 2527 2528 /* Check whether we should push an internal control record onto the 2529 serial team stack. If so, do it. 
*/ 2530 void __kmp_save_internal_controls(kmp_info_t *thread) { 2531 2532 if (thread->th.th_team != thread->th.th_serial_team) { 2533 return; 2534 } 2535 if (thread->th.th_team->t.t_serialized > 1) { 2536 int push = 0; 2537 2538 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2539 push = 1; 2540 } else { 2541 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2542 thread->th.th_team->t.t_serialized) { 2543 push = 1; 2544 } 2545 } 2546 if (push) { /* push a record on the serial team's stack */ 2547 kmp_internal_control_t *control = 2548 (kmp_internal_control_t *)__kmp_allocate( 2549 sizeof(kmp_internal_control_t)); 2550 2551 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2552 2553 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2554 2555 control->next = thread->th.th_team->t.t_control_stack_top; 2556 thread->th.th_team->t.t_control_stack_top = control; 2557 } 2558 } 2559 } 2560 2561 /* Changes set_nproc */ 2562 void __kmp_set_num_threads(int new_nth, int gtid) { 2563 kmp_info_t *thread; 2564 kmp_root_t *root; 2565 2566 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2567 KMP_DEBUG_ASSERT(__kmp_init_serial); 2568 2569 if (new_nth < 1) 2570 new_nth = 1; 2571 else if (new_nth > __kmp_max_nth) 2572 new_nth = __kmp_max_nth; 2573 2574 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2575 thread = __kmp_threads[gtid]; 2576 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2577 return; // nothing to do 2578 2579 __kmp_save_internal_controls(thread); 2580 2581 set__nproc(thread, new_nth); 2582 2583 // If this omp_set_num_threads() call will cause the hot team size to be 2584 // reduced (in the absence of a num_threads clause), then reduce it now, 2585 // rather than waiting for the next parallel region. 2586 root = thread->th.th_root; 2587 if (__kmp_init_parallel && (!root->r.r_active) && 2588 (root->r.r_hot_team->t.t_nproc > new_nth) 2589 #if KMP_NESTED_HOT_TEAMS 2590 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2591 #endif 2592 ) { 2593 kmp_team_t *hot_team = root->r.r_hot_team; 2594 int f; 2595 2596 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2597 2598 // Release the extra threads we don't need any more. 2599 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2600 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2601 if (__kmp_tasking_mode != tskm_immediate_exec) { 2602 // When decreasing team size, threads no longer in the team should unref 2603 // task team. 2604 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2605 } 2606 __kmp_free_thread(hot_team->t.t_threads[f]); 2607 hot_team->t.t_threads[f] = NULL; 2608 } 2609 hot_team->t.t_nproc = new_nth; 2610 #if KMP_NESTED_HOT_TEAMS 2611 if (thread->th.th_hot_teams) { 2612 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2613 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2614 } 2615 #endif 2616 2617 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2618 2619 // Update the t_nproc field in the threads that are still active. 
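    // Illustrative example (user code, not part of the runtime): after a
    // region has run on the hot team with, say, 8 threads, a subsequent
    //   omp_set_num_threads(2);
    // reaches this point with new_nth == 2: the surplus hot-team threads were
    // released above, and the survivors' th_team_nproc is updated below so the
    // next parallel region starts with the smaller team.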
2620 for (f = 0; f < new_nth; f++) { 2621 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2622 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2623 } 2624 // Special flag in case omp_set_num_threads() call 2625 hot_team->t.t_size_changed = -1; 2626 } 2627 } 2628 2629 /* Changes max_active_levels */ 2630 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2631 kmp_info_t *thread; 2632 2633 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2634 "%d = (%d)\n", 2635 gtid, max_active_levels)); 2636 KMP_DEBUG_ASSERT(__kmp_init_serial); 2637 2638 // validate max_active_levels 2639 if (max_active_levels < 0) { 2640 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2641 // We ignore this call if the user has specified a negative value. 2642 // The current setting won't be changed. The last valid setting will be 2643 // used. A warning will be issued (if warnings are allowed as controlled by 2644 // the KMP_WARNINGS env var). 2645 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2646 "max_active_levels for thread %d = (%d)\n", 2647 gtid, max_active_levels)); 2648 return; 2649 } 2650 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2651 // it's OK, the max_active_levels is within the valid range: [ 0; 2652 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2653 // We allow a zero value. (implementation defined behavior) 2654 } else { 2655 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2656 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2657 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2658 // Current upper limit is MAX_INT. (implementation defined behavior) 2659 // If the input exceeds the upper limit, we correct the input to be the 2660 // upper limit. (implementation defined behavior) 2661 // Actually, the flow should never get here until we use MAX_INT limit. 
2662 } 2663 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2664 "max_active_levels for thread %d = (%d)\n", 2665 gtid, max_active_levels)); 2666 2667 thread = __kmp_threads[gtid]; 2668 2669 __kmp_save_internal_controls(thread); 2670 2671 set__max_active_levels(thread, max_active_levels); 2672 } 2673 2674 /* Gets max_active_levels */ 2675 int __kmp_get_max_active_levels(int gtid) { 2676 kmp_info_t *thread; 2677 2678 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2679 KMP_DEBUG_ASSERT(__kmp_init_serial); 2680 2681 thread = __kmp_threads[gtid]; 2682 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2683 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2684 "curtask_maxaclevel=%d\n", 2685 gtid, thread->th.th_current_task, 2686 thread->th.th_current_task->td_icvs.max_active_levels)); 2687 return thread->th.th_current_task->td_icvs.max_active_levels; 2688 } 2689 2690 // nteams-var per-device ICV 2691 void __kmp_set_num_teams(int num_teams) { 2692 if (num_teams > 0) 2693 __kmp_nteams = num_teams; 2694 } 2695 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2696 // teams-thread-limit-var per-device ICV 2697 void __kmp_set_teams_thread_limit(int limit) { 2698 if (limit > 0) 2699 __kmp_teams_thread_limit = limit; 2700 } 2701 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2702 2703 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2704 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2705 2706 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2707 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2708 kmp_info_t *thread; 2709 kmp_sched_t orig_kind; 2710 // kmp_team_t *team; 2711 2712 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2713 gtid, (int)kind, chunk)); 2714 KMP_DEBUG_ASSERT(__kmp_init_serial); 2715 2716 // Check if the kind parameter is valid, correct if needed. 2717 // Valid parameters should fit in one of two intervals - standard or extended: 2718 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2719 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2720 orig_kind = kind; 2721 kind = __kmp_sched_without_mods(kind); 2722 2723 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2724 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2725 // TODO: Hint needs attention in case we change the default schedule. 2726 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2727 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2728 __kmp_msg_null); 2729 kind = kmp_sched_default; 2730 chunk = 0; // ignore chunk value in case of bad kind 2731 } 2732 2733 thread = __kmp_threads[gtid]; 2734 2735 __kmp_save_internal_controls(thread); 2736 2737 if (kind < kmp_sched_upper_std) { 2738 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2739 // differ static chunked vs. 
unchunked: chunk should be invalid to 2740 // indicate unchunked schedule (which is the default) 2741 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2742 } else { 2743 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2744 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2745 } 2746 } else { 2747 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2748 // kmp_sched_lower - 2 ]; 2749 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2750 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2751 kmp_sched_lower - 2]; 2752 } 2753 __kmp_sched_apply_mods_intkind( 2754 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2755 if (kind == kmp_sched_auto || chunk < 1) { 2756 // ignore parameter chunk for schedule auto 2757 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2758 } else { 2759 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2760 } 2761 } 2762 2763 /* Gets def_sched_var ICV values */ 2764 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2765 kmp_info_t *thread; 2766 enum sched_type th_type; 2767 2768 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2769 KMP_DEBUG_ASSERT(__kmp_init_serial); 2770 2771 thread = __kmp_threads[gtid]; 2772 2773 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2774 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2775 case kmp_sch_static: 2776 case kmp_sch_static_greedy: 2777 case kmp_sch_static_balanced: 2778 *kind = kmp_sched_static; 2779 __kmp_sched_apply_mods_stdkind(kind, th_type); 2780 *chunk = 0; // chunk was not set, try to show this fact via zero value 2781 return; 2782 case kmp_sch_static_chunked: 2783 *kind = kmp_sched_static; 2784 break; 2785 case kmp_sch_dynamic_chunked: 2786 *kind = kmp_sched_dynamic; 2787 break; 2788 case kmp_sch_guided_chunked: 2789 case kmp_sch_guided_iterative_chunked: 2790 case kmp_sch_guided_analytical_chunked: 2791 *kind = kmp_sched_guided; 2792 break; 2793 case kmp_sch_auto: 2794 *kind = kmp_sched_auto; 2795 break; 2796 case kmp_sch_trapezoidal: 2797 *kind = kmp_sched_trapezoidal; 2798 break; 2799 #if KMP_STATIC_STEAL_ENABLED 2800 case kmp_sch_static_steal: 2801 *kind = kmp_sched_static_steal; 2802 break; 2803 #endif 2804 default: 2805 KMP_FATAL(UnknownSchedulingType, th_type); 2806 } 2807 2808 __kmp_sched_apply_mods_stdkind(kind, th_type); 2809 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2810 } 2811 2812 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2813 2814 int ii, dd; 2815 kmp_team_t *team; 2816 kmp_info_t *thr; 2817 2818 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2819 KMP_DEBUG_ASSERT(__kmp_init_serial); 2820 2821 // validate level 2822 if (level == 0) 2823 return 0; 2824 if (level < 0) 2825 return -1; 2826 thr = __kmp_threads[gtid]; 2827 team = thr->th.th_team; 2828 ii = team->t.t_level; 2829 if (level > ii) 2830 return -1; 2831 2832 if (thr->th.th_teams_microtask) { 2833 // AC: we are in teams region where multiple nested teams have same level 2834 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2835 if (level <= 2836 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2837 KMP_DEBUG_ASSERT(ii >= tlevel); 2838 // AC: As we need to pass by the teams league, we need to artificially 2839 // increase ii 2840 if (ii == tlevel) { 2841 ii += 2; // three teams have same level 2842 } else { 2843 ii++; // two teams have same level 2844 } 2845 } 2846 } 2847 2848 if (ii == 
level) 2849 return __kmp_tid_from_gtid(gtid); 2850 2851 dd = team->t.t_serialized; 2852 level++; 2853 while (ii > level) { 2854 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2855 } 2856 if ((team->t.t_serialized) && (!dd)) { 2857 team = team->t.t_parent; 2858 continue; 2859 } 2860 if (ii > level) { 2861 team = team->t.t_parent; 2862 dd = team->t.t_serialized; 2863 ii--; 2864 } 2865 } 2866 2867 return (dd > 1) ? (0) : (team->t.t_master_tid); 2868 } 2869 2870 int __kmp_get_team_size(int gtid, int level) { 2871 2872 int ii, dd; 2873 kmp_team_t *team; 2874 kmp_info_t *thr; 2875 2876 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2877 KMP_DEBUG_ASSERT(__kmp_init_serial); 2878 2879 // validate level 2880 if (level == 0) 2881 return 1; 2882 if (level < 0) 2883 return -1; 2884 thr = __kmp_threads[gtid]; 2885 team = thr->th.th_team; 2886 ii = team->t.t_level; 2887 if (level > ii) 2888 return -1; 2889 2890 if (thr->th.th_teams_microtask) { 2891 // AC: we are in teams region where multiple nested teams have same level 2892 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2893 if (level <= 2894 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2895 KMP_DEBUG_ASSERT(ii >= tlevel); 2896 // AC: As we need to pass by the teams league, we need to artificially 2897 // increase ii 2898 if (ii == tlevel) { 2899 ii += 2; // three teams have same level 2900 } else { 2901 ii++; // two teams have same level 2902 } 2903 } 2904 } 2905 2906 while (ii > level) { 2907 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2908 } 2909 if (team->t.t_serialized && (!dd)) { 2910 team = team->t.t_parent; 2911 continue; 2912 } 2913 if (ii > level) { 2914 team = team->t.t_parent; 2915 ii--; 2916 } 2917 } 2918 2919 return team->t.t_nproc; 2920 } 2921 2922 kmp_r_sched_t __kmp_get_schedule_global() { 2923 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2924 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2925 // independently. So one can get the updated schedule here. 2926 2927 kmp_r_sched_t r_sched; 2928 2929 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2930 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2931 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2932 // different roots (even in OMP 2.5) 2933 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2934 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2935 if (s == kmp_sch_static) { 2936 // replace STATIC with more detailed schedule (balanced or greedy) 2937 r_sched.r_sched_type = __kmp_static; 2938 } else if (s == kmp_sch_guided_chunked) { 2939 // replace GUIDED with more detailed schedule (iterative or analytical) 2940 r_sched.r_sched_type = __kmp_guided; 2941 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2942 r_sched.r_sched_type = __kmp_sched; 2943 } 2944 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2945 2946 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2947 // __kmp_chunk may be wrong here (if it was not ever set) 2948 r_sched.chunk = KMP_DEFAULT_CHUNK; 2949 } else { 2950 r_sched.chunk = __kmp_chunk; 2951 } 2952 2953 return r_sched; 2954 } 2955 2956 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2957 at least argc number of *t_argv entries for the requested team. 
*/ 2958 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2959 2960 KMP_DEBUG_ASSERT(team); 2961 if (!realloc || argc > team->t.t_max_argc) { 2962 2963 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 2964 "current entries=%d\n", 2965 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 2966 /* if previously allocated heap space for args, free them */ 2967 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 2968 __kmp_free((void *)team->t.t_argv); 2969 2970 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 2971 /* use unused space in the cache line for arguments */ 2972 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 2973 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 2974 "argv entries\n", 2975 team->t.t_id, team->t.t_max_argc)); 2976 team->t.t_argv = &team->t.t_inline_argv[0]; 2977 if (__kmp_storage_map) { 2978 __kmp_print_storage_map_gtid( 2979 -1, &team->t.t_inline_argv[0], 2980 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 2981 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 2982 team->t.t_id); 2983 } 2984 } else { 2985 /* allocate space for arguments in the heap */ 2986 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 2987 ? KMP_MIN_MALLOC_ARGV_ENTRIES 2988 : 2 * argc; 2989 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 2990 "argv entries\n", 2991 team->t.t_id, team->t.t_max_argc)); 2992 team->t.t_argv = 2993 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 2994 if (__kmp_storage_map) { 2995 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 2996 &team->t.t_argv[team->t.t_max_argc], 2997 sizeof(void *) * team->t.t_max_argc, 2998 "team_%d.t_argv", team->t.t_id); 2999 } 3000 } 3001 } 3002 } 3003 3004 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3005 int i; 3006 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3007 team->t.t_threads = 3008 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3009 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3010 sizeof(dispatch_shared_info_t) * num_disp_buff); 3011 team->t.t_dispatch = 3012 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3013 team->t.t_implicit_task_taskdata = 3014 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3015 team->t.t_max_nproc = max_nth; 3016 3017 /* setup dispatch buffers */ 3018 for (i = 0; i < num_disp_buff; ++i) { 3019 team->t.t_disp_buffer[i].buffer_index = i; 3020 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3021 } 3022 } 3023 3024 static void __kmp_free_team_arrays(kmp_team_t *team) { 3025 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3026 int i; 3027 for (i = 0; i < team->t.t_max_nproc; ++i) { 3028 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3029 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3030 team->t.t_dispatch[i].th_disp_buffer = NULL; 3031 } 3032 } 3033 #if KMP_USE_HIER_SCHED 3034 __kmp_dispatch_free_hierarchies(team); 3035 #endif 3036 __kmp_free(team->t.t_threads); 3037 __kmp_free(team->t.t_disp_buffer); 3038 __kmp_free(team->t.t_dispatch); 3039 __kmp_free(team->t.t_implicit_task_taskdata); 3040 team->t.t_threads = NULL; 3041 team->t.t_disp_buffer = NULL; 3042 team->t.t_dispatch = NULL; 3043 team->t.t_implicit_task_taskdata = 0; 3044 } 3045 3046 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3047 kmp_info_t **oldThreads = team->t.t_threads; 3048 3049 __kmp_free(team->t.t_disp_buffer); 3050 __kmp_free(team->t.t_dispatch); 3051 __kmp_free(team->t.t_implicit_task_taskdata); 3052 __kmp_allocate_team_arrays(team, max_nth); 3053 3054 KMP_MEMCPY(team->t.t_threads, oldThreads, 3055 team->t.t_nproc * sizeof(kmp_info_t *)); 3056 3057 __kmp_free(oldThreads); 3058 } 3059 3060 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3061 3062 kmp_r_sched_t r_sched = 3063 __kmp_get_schedule_global(); // get current state of scheduling globals 3064 3065 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3066 3067 kmp_internal_control_t g_icvs = { 3068 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3069 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3070 // adjustment of threads (per thread) 3071 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3072 // whether blocktime is explicitly set 3073 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3074 #if KMP_USE_MONITOR 3075 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3076 // intervals 3077 #endif 3078 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3079 // next parallel region (per thread) 3080 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3081 __kmp_cg_max_nth, // int thread_limit; 3082 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3083 // for max_active_levels 3084 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3085 // {sched,chunk} pair 3086 __kmp_nested_proc_bind.bind_types[0], 3087 __kmp_default_device, 3088 NULL // struct kmp_internal_control *next; 3089 }; 3090 3091 return g_icvs; 3092 } 3093 3094 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3095 3096 kmp_internal_control_t gx_icvs; 3097 gx_icvs.serial_nesting_level = 3098 0; // probably =team->t.t_serial 
like in save_inter_controls 3099 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3100 gx_icvs.next = NULL; 3101 3102 return gx_icvs; 3103 } 3104 3105 static void __kmp_initialize_root(kmp_root_t *root) { 3106 int f; 3107 kmp_team_t *root_team; 3108 kmp_team_t *hot_team; 3109 int hot_team_max_nth; 3110 kmp_r_sched_t r_sched = 3111 __kmp_get_schedule_global(); // get current state of scheduling globals 3112 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3113 KMP_DEBUG_ASSERT(root); 3114 KMP_ASSERT(!root->r.r_begin); 3115 3116 /* setup the root state structure */ 3117 __kmp_init_lock(&root->r.r_begin_lock); 3118 root->r.r_begin = FALSE; 3119 root->r.r_active = FALSE; 3120 root->r.r_in_parallel = 0; 3121 root->r.r_blocktime = __kmp_dflt_blocktime; 3122 3123 /* setup the root team for this task */ 3124 /* allocate the root team structure */ 3125 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3126 3127 root_team = 3128 __kmp_allocate_team(root, 3129 1, // new_nproc 3130 1, // max_nproc 3131 #if OMPT_SUPPORT 3132 ompt_data_none, // root parallel id 3133 #endif 3134 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3135 0 // argc 3136 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3137 ); 3138 #if USE_DEBUGGER 3139 // Non-NULL value should be assigned to make the debugger display the root 3140 // team. 3141 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3142 #endif 3143 3144 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3145 3146 root->r.r_root_team = root_team; 3147 root_team->t.t_control_stack_top = NULL; 3148 3149 /* initialize root team */ 3150 root_team->t.t_threads[0] = NULL; 3151 root_team->t.t_nproc = 1; 3152 root_team->t.t_serialized = 1; 3153 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3154 root_team->t.t_sched.sched = r_sched.sched; 3155 KA_TRACE( 3156 20, 3157 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3158 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3159 3160 /* setup the hot team for this task */ 3161 /* allocate the hot team structure */ 3162 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3163 3164 hot_team = 3165 __kmp_allocate_team(root, 3166 1, // new_nproc 3167 __kmp_dflt_team_nth_ub * 2, // max_nproc 3168 #if OMPT_SUPPORT 3169 ompt_data_none, // root parallel id 3170 #endif 3171 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3172 0 // argc 3173 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3174 ); 3175 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3176 3177 root->r.r_hot_team = hot_team; 3178 root_team->t.t_control_stack_top = NULL; 3179 3180 /* first-time initialization */ 3181 hot_team->t.t_parent = root_team; 3182 3183 /* initialize hot team */ 3184 hot_team_max_nth = hot_team->t.t_max_nproc; 3185 for (f = 0; f < hot_team_max_nth; ++f) { 3186 hot_team->t.t_threads[f] = NULL; 3187 } 3188 hot_team->t.t_nproc = 1; 3189 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3190 hot_team->t.t_sched.sched = r_sched.sched; 3191 hot_team->t.t_size_changed = 0; 3192 } 3193 3194 #ifdef KMP_DEBUG 3195 3196 typedef struct kmp_team_list_item { 3197 kmp_team_p const *entry; 3198 struct kmp_team_list_item *next; 3199 } kmp_team_list_item_t; 3200 typedef kmp_team_list_item_t *kmp_team_list_t; 3201 3202 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3203 kmp_team_list_t list, // List of teams. 
3204 kmp_team_p const *team // Team to add. 3205 ) { 3206 3207 // List must terminate with item where both entry and next are NULL. 3208 // Team is added to the list only once. 3209 // List is sorted in ascending order by team id. 3210 // Team id is *not* a key. 3211 3212 kmp_team_list_t l; 3213 3214 KMP_DEBUG_ASSERT(list != NULL); 3215 if (team == NULL) { 3216 return; 3217 } 3218 3219 __kmp_print_structure_team_accum(list, team->t.t_parent); 3220 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3221 3222 // Search list for the team. 3223 l = list; 3224 while (l->next != NULL && l->entry != team) { 3225 l = l->next; 3226 } 3227 if (l->next != NULL) { 3228 return; // Team has been added before, exit. 3229 } 3230 3231 // Team is not found. Search list again for insertion point. 3232 l = list; 3233 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3234 l = l->next; 3235 } 3236 3237 // Insert team. 3238 { 3239 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3240 sizeof(kmp_team_list_item_t)); 3241 *item = *l; 3242 l->entry = team; 3243 l->next = item; 3244 } 3245 } 3246 3247 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3248 3249 ) { 3250 __kmp_printf("%s", title); 3251 if (team != NULL) { 3252 __kmp_printf("%2x %p\n", team->t.t_id, team); 3253 } else { 3254 __kmp_printf(" - (nil)\n"); 3255 } 3256 } 3257 3258 static void __kmp_print_structure_thread(char const *title, 3259 kmp_info_p const *thread) { 3260 __kmp_printf("%s", title); 3261 if (thread != NULL) { 3262 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3263 } else { 3264 __kmp_printf(" - (nil)\n"); 3265 } 3266 } 3267 3268 void __kmp_print_structure(void) { 3269 3270 kmp_team_list_t list; 3271 3272 // Initialize list of teams. 3273 list = 3274 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3275 list->entry = NULL; 3276 list->next = NULL; 3277 3278 __kmp_printf("\n------------------------------\nGlobal Thread " 3279 "Table\n------------------------------\n"); 3280 { 3281 int gtid; 3282 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3283 __kmp_printf("%2d", gtid); 3284 if (__kmp_threads != NULL) { 3285 __kmp_printf(" %p", __kmp_threads[gtid]); 3286 } 3287 if (__kmp_root != NULL) { 3288 __kmp_printf(" %p", __kmp_root[gtid]); 3289 } 3290 __kmp_printf("\n"); 3291 } 3292 } 3293 3294 // Print out __kmp_threads array. 
3295 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3296 "----------\n"); 3297 if (__kmp_threads != NULL) { 3298 int gtid; 3299 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3300 kmp_info_t const *thread = __kmp_threads[gtid]; 3301 if (thread != NULL) { 3302 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3303 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3304 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3305 __kmp_print_structure_team(" Serial Team: ", 3306 thread->th.th_serial_team); 3307 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3308 __kmp_print_structure_thread(" Master: ", 3309 thread->th.th_team_master); 3310 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3311 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3312 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3313 __kmp_print_structure_thread(" Next in pool: ", 3314 thread->th.th_next_pool); 3315 __kmp_printf("\n"); 3316 __kmp_print_structure_team_accum(list, thread->th.th_team); 3317 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3318 } 3319 } 3320 } else { 3321 __kmp_printf("Threads array is not allocated.\n"); 3322 } 3323 3324 // Print out __kmp_root array. 3325 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3326 "--------\n"); 3327 if (__kmp_root != NULL) { 3328 int gtid; 3329 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3330 kmp_root_t const *root = __kmp_root[gtid]; 3331 if (root != NULL) { 3332 __kmp_printf("GTID %2d %p:\n", gtid, root); 3333 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3334 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3335 __kmp_print_structure_thread(" Uber Thread: ", 3336 root->r.r_uber_thread); 3337 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3338 __kmp_printf(" In Parallel: %2d\n", 3339 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3340 __kmp_printf("\n"); 3341 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3342 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3343 } 3344 } 3345 } else { 3346 __kmp_printf("Ubers array is not allocated.\n"); 3347 } 3348 3349 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3350 "--------\n"); 3351 while (list->next != NULL) { 3352 kmp_team_p const *team = list->entry; 3353 int i; 3354 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3355 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3356 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3357 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3358 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3359 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3360 for (i = 0; i < team->t.t_nproc; ++i) { 3361 __kmp_printf(" Thread %2d: ", i); 3362 __kmp_print_structure_thread("", team->t.t_threads[i]); 3363 } 3364 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3365 __kmp_printf("\n"); 3366 list = list->next; 3367 } 3368 3369 // Print out __kmp_thread_pool and __kmp_team_pool. 3370 __kmp_printf("\n------------------------------\nPools\n----------------------" 3371 "--------\n"); 3372 __kmp_print_structure_thread("Thread pool: ", 3373 CCAST(kmp_info_t *, __kmp_thread_pool)); 3374 __kmp_print_structure_team("Team pool: ", 3375 CCAST(kmp_team_t *, __kmp_team_pool)); 3376 __kmp_printf("\n"); 3377 3378 // Free team list. 
3379 while (list != NULL) { 3380 kmp_team_list_item_t *item = list; 3381 list = list->next; 3382 KMP_INTERNAL_FREE(item); 3383 } 3384 } 3385 3386 #endif 3387 3388 //--------------------------------------------------------------------------- 3389 // Stuff for per-thread fast random number generator 3390 // Table of primes 3391 static const unsigned __kmp_primes[] = { 3392 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3393 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3394 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3395 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3396 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3397 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3398 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3399 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3400 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3401 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3402 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3403 3404 //--------------------------------------------------------------------------- 3405 // __kmp_get_random: Get a random number using a linear congruential method. 3406 unsigned short __kmp_get_random(kmp_info_t *thread) { 3407 unsigned x = thread->th.th_x; 3408 unsigned short r = (unsigned short)(x >> 16); 3409 3410 thread->th.th_x = x * thread->th.th_a + 1; 3411 3412 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3413 thread->th.th_info.ds.ds_tid, r)); 3414 3415 return r; 3416 } 3417 //-------------------------------------------------------- 3418 // __kmp_init_random: Initialize a random number generator 3419 void __kmp_init_random(kmp_info_t *thread) { 3420 unsigned seed = thread->th.th_info.ds.ds_tid; 3421 3422 thread->th.th_a = 3423 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3424 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3425 KA_TRACE(30, 3426 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3427 } 3428 3429 #if KMP_OS_WINDOWS 3430 /* reclaim array entries for root threads that are already dead, returns number 3431 * reclaimed */ 3432 static int __kmp_reclaim_dead_roots(void) { 3433 int i, r = 0; 3434 3435 for (i = 0; i < __kmp_threads_capacity; ++i) { 3436 if (KMP_UBER_GTID(i) && 3437 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3438 !__kmp_root[i] 3439 ->r.r_active) { // AC: reclaim only roots died in non-active state 3440 r += __kmp_unregister_root_other_thread(i); 3441 } 3442 } 3443 return r; 3444 } 3445 #endif 3446 3447 /* This function attempts to create free entries in __kmp_threads and 3448 __kmp_root, and returns the number of free entries generated. 3449 3450 For Windows* OS static library, the first mechanism used is to reclaim array 3451 entries for root threads that are already dead. 3452 3453 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3454 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3455 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3456 threadprivate cache array has been created. Synchronization with 3457 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
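Worked example (numbers invented purely for illustration): with __kmp_threads_capacity == 64, __kmp_sys_max_nth == 1000 and nNeed == 100, the minimum required capacity is 64 + 100 = 164, so the doubling loop in the body below grows 64 -> 128 -> 256 and stops; whenever doubling would push past __kmp_sys_max_nth, the capacity is clamped to __kmp_sys_max_nth instead of being doubled.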
3458 3459 After any dead root reclamation, if the clipping value allows array expansion 3460 to result in the generation of a total of nNeed free slots, the function does 3461 that expansion. If not, nothing is done beyond the possible initial root 3462 thread reclamation. 3463 3464 If any argument is negative, the behavior is undefined. */ 3465 static int __kmp_expand_threads(int nNeed) { 3466 int added = 0; 3467 int minimumRequiredCapacity; 3468 int newCapacity; 3469 kmp_info_t **newThreads; 3470 kmp_root_t **newRoot; 3471 3472 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3473 // resizing __kmp_threads does not need additional protection if foreign 3474 // threads are present 3475 3476 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3477 /* only for Windows static library */ 3478 /* reclaim array entries for root threads that are already dead */ 3479 added = __kmp_reclaim_dead_roots(); 3480 3481 if (nNeed) { 3482 nNeed -= added; 3483 if (nNeed < 0) 3484 nNeed = 0; 3485 } 3486 #endif 3487 if (nNeed <= 0) 3488 return added; 3489 3490 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3491 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3492 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3493 // > __kmp_max_nth in one of two ways: 3494 // 3495 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3496 // may not be reused by another thread, so we may need to increase 3497 // __kmp_threads_capacity to __kmp_max_nth + 1. 3498 // 3499 // 2) New foreign root(s) are encountered. We always register new foreign 3500 // roots. This may cause a smaller # of threads to be allocated at 3501 // subsequent parallel regions, but the worker threads hang around (and 3502 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3503 // 3504 // Anyway, that is the reason for moving the check to see if 3505 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3506 // instead of having it performed here. -BB 3507 3508 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3509 3510 /* compute expansion headroom to check if we can expand */ 3511 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3512 /* possible expansion too small -- give up */ 3513 return added; 3514 } 3515 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3516 3517 newCapacity = __kmp_threads_capacity; 3518 do { 3519 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3520 : __kmp_sys_max_nth; 3521 } while (newCapacity < minimumRequiredCapacity); 3522 newThreads = (kmp_info_t **)__kmp_allocate( 3523 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3524 newRoot = 3525 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3526 KMP_MEMCPY(newThreads, __kmp_threads, 3527 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3528 KMP_MEMCPY(newRoot, __kmp_root, 3529 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3530 3531 kmp_info_t **temp_threads = __kmp_threads; 3532 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3533 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3534 __kmp_free(temp_threads); 3535 added += newCapacity - __kmp_threads_capacity; 3536 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3537 3538 if (newCapacity > __kmp_tp_capacity) { 3539 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3540 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3541 __kmp_threadprivate_resize_cache(newCapacity); 3542 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3543 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3544 } 3545 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3546 } 3547 3548 return added; 3549 } 3550 3551 /* Register the current thread as a root thread and obtain our gtid. We must 3552 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3553 thread that calls from __kmp_do_serial_initialize() */ 3554 int __kmp_register_root(int initial_thread) { 3555 kmp_info_t *root_thread; 3556 kmp_root_t *root; 3557 int gtid; 3558 int capacity; 3559 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3560 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3561 KMP_MB(); 3562 3563 /* 2007-03-02: 3564 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3565 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3566 work as expected -- it may return false (that means there is at least one 3567 empty slot in __kmp_threads array), but it is possible the only free slot 3568 is #0, which is reserved for initial thread and so cannot be used for this 3569 one. Following code workarounds this bug. 3570 3571 However, right solution seems to be not reserving slot #0 for initial 3572 thread because: 3573 (1) there is no magic in slot #0, 3574 (2) we cannot detect initial thread reliably (the first thread which does 3575 serial initialization may be not a real initial thread). 3576 */ 3577 capacity = __kmp_threads_capacity; 3578 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3579 --capacity; 3580 } 3581 3582 /* see if there are too many threads */ 3583 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3584 if (__kmp_tp_cached) { 3585 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3586 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3587 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3588 } else { 3589 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3590 __kmp_msg_null); 3591 } 3592 } 3593 3594 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3595 // 0: initial thread, also a regular OpenMP thread. 3596 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3597 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3598 // regular OpenMP threads. 3599 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3600 // Find an available thread slot for hidden helper thread. 
Slots for hidden 3601 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3602 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3603 gtid <= __kmp_hidden_helper_threads_num; 3604 gtid++) 3605 ; 3606 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3607 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3608 "hidden helper thread: T#%d\n", 3609 gtid)); 3610 } else { 3611 /* find an available thread slot */ 3612 // Don't reassign the zero slot since we need that to only be used by 3613 // initial thread. Slots for hidden helper threads should also be skipped. 3614 if (initial_thread && __kmp_threads[0] == NULL) { 3615 gtid = 0; 3616 } else { 3617 for (gtid = __kmp_hidden_helper_threads_num + 1; 3618 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3619 ; 3620 } 3621 KA_TRACE( 3622 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3623 KMP_ASSERT(gtid < __kmp_threads_capacity); 3624 } 3625 3626 /* update global accounting */ 3627 __kmp_all_nth++; 3628 TCW_4(__kmp_nth, __kmp_nth + 1); 3629 3630 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3631 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3632 if (__kmp_adjust_gtid_mode) { 3633 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3634 if (TCR_4(__kmp_gtid_mode) != 2) { 3635 TCW_4(__kmp_gtid_mode, 2); 3636 } 3637 } else { 3638 if (TCR_4(__kmp_gtid_mode) != 1) { 3639 TCW_4(__kmp_gtid_mode, 1); 3640 } 3641 } 3642 } 3643 3644 #ifdef KMP_ADJUST_BLOCKTIME 3645 /* Adjust blocktime to zero if necessary */ 3646 /* Middle initialization might not have occurred yet */ 3647 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3648 if (__kmp_nth > __kmp_avail_proc) { 3649 __kmp_zero_bt = TRUE; 3650 } 3651 } 3652 #endif /* KMP_ADJUST_BLOCKTIME */ 3653 3654 /* setup this new hierarchy */ 3655 if (!(root = __kmp_root[gtid])) { 3656 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3657 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3658 } 3659 3660 #if KMP_STATS_ENABLED 3661 // Initialize stats as soon as possible (right after gtid assignment). 
3662 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3663 __kmp_stats_thread_ptr->startLife(); 3664 KMP_SET_THREAD_STATE(SERIAL_REGION); 3665 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3666 #endif 3667 __kmp_initialize_root(root); 3668 3669 /* setup new root thread structure */ 3670 if (root->r.r_uber_thread) { 3671 root_thread = root->r.r_uber_thread; 3672 } else { 3673 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3674 if (__kmp_storage_map) { 3675 __kmp_print_thread_storage_map(root_thread, gtid); 3676 } 3677 root_thread->th.th_info.ds.ds_gtid = gtid; 3678 #if OMPT_SUPPORT 3679 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3680 #endif 3681 root_thread->th.th_root = root; 3682 if (__kmp_env_consistency_check) { 3683 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3684 } 3685 #if USE_FAST_MEMORY 3686 __kmp_initialize_fast_memory(root_thread); 3687 #endif /* USE_FAST_MEMORY */ 3688 3689 #if KMP_USE_BGET 3690 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3691 __kmp_initialize_bget(root_thread); 3692 #endif 3693 __kmp_init_random(root_thread); // Initialize random number generator 3694 } 3695 3696 /* setup the serial team held in reserve by the root thread */ 3697 if (!root_thread->th.th_serial_team) { 3698 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3699 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3700 root_thread->th.th_serial_team = __kmp_allocate_team( 3701 root, 1, 1, 3702 #if OMPT_SUPPORT 3703 ompt_data_none, // root parallel id 3704 #endif 3705 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3706 } 3707 KMP_ASSERT(root_thread->th.th_serial_team); 3708 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3709 root_thread->th.th_serial_team)); 3710 3711 /* drop root_thread into place */ 3712 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3713 3714 root->r.r_root_team->t.t_threads[0] = root_thread; 3715 root->r.r_hot_team->t.t_threads[0] = root_thread; 3716 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3717 // AC: the team created in reserve, not for execution (it is unused for now). 3718 root_thread->th.th_serial_team->t.t_serialized = 0; 3719 root->r.r_uber_thread = root_thread; 3720 3721 /* initialize the thread, get it ready to go */ 3722 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3723 TCW_4(__kmp_init_gtid, TRUE); 3724 3725 /* prepare the master thread for get_gtid() */ 3726 __kmp_gtid_set_specific(gtid); 3727 3728 #if USE_ITT_BUILD 3729 __kmp_itt_thread_name(gtid); 3730 #endif /* USE_ITT_BUILD */ 3731 3732 #ifdef KMP_TDATA_GTID 3733 __kmp_gtid = gtid; 3734 #endif 3735 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3736 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3737 3738 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3739 "plain=%u\n", 3740 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3741 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3742 KMP_INIT_BARRIER_STATE)); 3743 { // Initialize barrier data. 
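// Note: bs_last_barrier bounds the per-thread barrier kinds the runtime tracks
// (plain and fork/join, plus reduction in builds with fast reduction barriers);
// each b_arrived counter starts from KMP_INIT_BARRIER_STATE so it matches the
// freshly allocated root and hot teams.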
3744 int b; 3745 for (b = 0; b < bs_last_barrier; ++b) { 3746 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3747 #if USE_DEBUGGER 3748 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3749 #endif 3750 } 3751 } 3752 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3753 KMP_INIT_BARRIER_STATE); 3754 3755 #if KMP_AFFINITY_SUPPORTED 3756 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3757 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3758 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3759 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3760 if (TCR_4(__kmp_init_middle)) { 3761 __kmp_affinity_set_init_mask(gtid, TRUE); 3762 } 3763 #endif /* KMP_AFFINITY_SUPPORTED */ 3764 root_thread->th.th_def_allocator = __kmp_def_allocator; 3765 root_thread->th.th_prev_level = 0; 3766 root_thread->th.th_prev_num_threads = 1; 3767 3768 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3769 tmp->cg_root = root_thread; 3770 tmp->cg_thread_limit = __kmp_cg_max_nth; 3771 tmp->cg_nthreads = 1; 3772 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3773 " cg_nthreads init to 1\n", 3774 root_thread, tmp)); 3775 tmp->up = NULL; 3776 root_thread->th.th_cg_roots = tmp; 3777 3778 __kmp_root_counter++; 3779 3780 #if OMPT_SUPPORT 3781 if (!initial_thread && ompt_enabled.enabled) { 3782 3783 kmp_info_t *root_thread = ompt_get_thread(); 3784 3785 ompt_set_thread_state(root_thread, ompt_state_overhead); 3786 3787 if (ompt_enabled.ompt_callback_thread_begin) { 3788 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3789 ompt_thread_initial, __ompt_get_thread_data_internal()); 3790 } 3791 ompt_data_t *task_data; 3792 ompt_data_t *parallel_data; 3793 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3794 NULL); 3795 if (ompt_enabled.ompt_callback_implicit_task) { 3796 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3797 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3798 } 3799 3800 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3801 } 3802 #endif 3803 3804 KMP_MB(); 3805 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3806 3807 return gtid; 3808 } 3809 3810 #if KMP_NESTED_HOT_TEAMS 3811 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3812 const int max_level) { 3813 int i, n, nth; 3814 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3815 if (!hot_teams || !hot_teams[level].hot_team) { 3816 return 0; 3817 } 3818 KMP_DEBUG_ASSERT(level < max_level); 3819 kmp_team_t *team = hot_teams[level].hot_team; 3820 nth = hot_teams[level].hot_team_nth; 3821 n = nth - 1; // master is not freed 3822 if (level < max_level - 1) { 3823 for (i = 0; i < nth; ++i) { 3824 kmp_info_t *th = team->t.t_threads[i]; 3825 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3826 if (i > 0 && th->th.th_hot_teams) { 3827 __kmp_free(th->th.th_hot_teams); 3828 th->th.th_hot_teams = NULL; 3829 } 3830 } 3831 } 3832 __kmp_free_team(root, team, NULL); 3833 return n; 3834 } 3835 #endif 3836 3837 // Resets a root thread and clear its root and hot teams. 3838 // Returns the number of __kmp_threads entries directly and indirectly freed. 
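// The return value matters only on the Windows static-library path:
// __kmp_unregister_root_other_thread() passes it back so that
// __kmp_reclaim_dead_roots() can report to __kmp_expand_threads() how many
// __kmp_threads slots became reusable; __kmp_unregister_root_current_thread()
// simply discards it.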
3839 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3840 kmp_team_t *root_team = root->r.r_root_team; 3841 kmp_team_t *hot_team = root->r.r_hot_team; 3842 int n = hot_team->t.t_nproc; 3843 int i; 3844 3845 KMP_DEBUG_ASSERT(!root->r.r_active); 3846 3847 root->r.r_root_team = NULL; 3848 root->r.r_hot_team = NULL; 3849 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3850 // before call to __kmp_free_team(). 3851 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3852 #if KMP_NESTED_HOT_TEAMS 3853 if (__kmp_hot_teams_max_level > 3854 0) { // need to free nested hot teams and their threads if any 3855 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3856 kmp_info_t *th = hot_team->t.t_threads[i]; 3857 if (__kmp_hot_teams_max_level > 1) { 3858 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3859 } 3860 if (th->th.th_hot_teams) { 3861 __kmp_free(th->th.th_hot_teams); 3862 th->th.th_hot_teams = NULL; 3863 } 3864 } 3865 } 3866 #endif 3867 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3868 3869 // Before we can reap the thread, we need to make certain that all other 3870 // threads in the teams that had this root as ancestor have stopped trying to 3871 // steal tasks. 3872 if (__kmp_tasking_mode != tskm_immediate_exec) { 3873 __kmp_wait_to_unref_task_teams(); 3874 } 3875 3876 #if KMP_OS_WINDOWS 3877 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3878 KA_TRACE( 3879 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3880 "\n", 3881 (LPVOID) & (root->r.r_uber_thread->th), 3882 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3883 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3884 #endif /* KMP_OS_WINDOWS */ 3885 3886 #if OMPT_SUPPORT 3887 ompt_data_t *task_data; 3888 ompt_data_t *parallel_data; 3889 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3890 NULL); 3891 if (ompt_enabled.ompt_callback_implicit_task) { 3892 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3893 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3894 } 3895 if (ompt_enabled.ompt_callback_thread_end) { 3896 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3897 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3898 } 3899 #endif 3900 3901 TCW_4(__kmp_nth, 3902 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3903 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3904 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3905 " to %d\n", 3906 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3907 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3908 if (i == 1) { 3909 // need to free contention group structure 3910 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3911 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3912 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3913 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3914 root->r.r_uber_thread->th.th_cg_roots = NULL; 3915 } 3916 __kmp_reap_thread(root->r.r_uber_thread, 1); 3917 3918 // We canot put root thread to __kmp_thread_pool, so we have to reap it 3919 // instead of freeing. 
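// (The uber thread is an application thread that entered the runtime on its
// own; since the library does not own it, it cannot be recycled as a pool
// worker the way runtime-created threads are, so only its runtime bookkeeping
// is torn down here.)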
3920 root->r.r_uber_thread = NULL; 3921 /* mark root as no longer in use */ 3922 root->r.r_begin = FALSE; 3923 3924 return n; 3925 } 3926 3927 void __kmp_unregister_root_current_thread(int gtid) { 3928 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3929 /* this lock should be ok, since unregister_root_current_thread is never 3930 called during an abort, only during a normal close. furthermore, if you 3931 have the forkjoin lock, you should never try to get the initz lock */ 3932 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3933 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3934 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3935 "exiting T#%d\n", 3936 gtid)); 3937 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3938 return; 3939 } 3940 kmp_root_t *root = __kmp_root[gtid]; 3941 3942 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3943 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3944 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3945 KMP_ASSERT(root->r.r_active == FALSE); 3946 3947 KMP_MB(); 3948 3949 kmp_info_t *thread = __kmp_threads[gtid]; 3950 kmp_team_t *team = thread->th.th_team; 3951 kmp_task_team_t *task_team = thread->th.th_task_team; 3952 3953 // we need to wait for the proxy tasks before finishing the thread 3954 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3955 #if OMPT_SUPPORT 3956 // the runtime is shutting down so we won't report any events 3957 thread->th.ompt_thread_info.state = ompt_state_undefined; 3958 #endif 3959 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3960 } 3961 3962 __kmp_reset_root(gtid, root); 3963 3964 KMP_MB(); 3965 KC_TRACE(10, 3966 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3967 3968 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3969 } 3970 3971 #if KMP_OS_WINDOWS 3972 /* __kmp_forkjoin_lock must be already held 3973 Unregisters a root thread that is not the current thread. Returns the number 3974 of __kmp_threads entries freed as a result. 
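Unlike __kmp_unregister_root_current_thread() above, this path does not drain proxy tasks first; its caller here, __kmp_reclaim_dead_roots(), only hands it roots whose OS thread has already terminated (see the __kmp_still_running() check there).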
*/ 3975 static int __kmp_unregister_root_other_thread(int gtid) { 3976 kmp_root_t *root = __kmp_root[gtid]; 3977 int r; 3978 3979 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 3980 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3981 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3982 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3983 KMP_ASSERT(root->r.r_active == FALSE); 3984 3985 r = __kmp_reset_root(gtid, root); 3986 KC_TRACE(10, 3987 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 3988 return r; 3989 } 3990 #endif 3991 3992 #if KMP_DEBUG 3993 void __kmp_task_info() { 3994 3995 kmp_int32 gtid = __kmp_entry_gtid(); 3996 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 3997 kmp_info_t *this_thr = __kmp_threads[gtid]; 3998 kmp_team_t *steam = this_thr->th.th_serial_team; 3999 kmp_team_t *team = this_thr->th.th_team; 4000 4001 __kmp_printf( 4002 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4003 "ptask=%p\n", 4004 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4005 team->t.t_implicit_task_taskdata[tid].td_parent); 4006 } 4007 #endif // KMP_DEBUG 4008 4009 /* TODO optimize with one big memclr, take out what isn't needed, split 4010 responsibility to workers as much as possible, and delay initialization of 4011 features as much as possible */ 4012 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4013 int tid, int gtid) { 4014 /* this_thr->th.th_info.ds.ds_gtid is setup in 4015 kmp_allocate_thread/create_worker. 4016 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4017 kmp_info_t *master = team->t.t_threads[0]; 4018 KMP_DEBUG_ASSERT(this_thr != NULL); 4019 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4020 KMP_DEBUG_ASSERT(team); 4021 KMP_DEBUG_ASSERT(team->t.t_threads); 4022 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4023 KMP_DEBUG_ASSERT(master); 4024 KMP_DEBUG_ASSERT(master->th.th_root); 4025 4026 KMP_MB(); 4027 4028 TCW_SYNC_PTR(this_thr->th.th_team, team); 4029 4030 this_thr->th.th_info.ds.ds_tid = tid; 4031 this_thr->th.th_set_nproc = 0; 4032 if (__kmp_tasking_mode != tskm_immediate_exec) 4033 // When tasking is possible, threads are not safe to reap until they are 4034 // done tasking; this will be set when tasking code is exited in wait 4035 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4036 else // no tasking --> always safe to reap 4037 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4038 this_thr->th.th_set_proc_bind = proc_bind_default; 4039 #if KMP_AFFINITY_SUPPORTED 4040 this_thr->th.th_new_place = this_thr->th.th_current_place; 4041 #endif 4042 this_thr->th.th_root = master->th.th_root; 4043 4044 /* setup the thread's cache of the team structure */ 4045 this_thr->th.th_team_nproc = team->t.t_nproc; 4046 this_thr->th.th_team_master = master; 4047 this_thr->th.th_team_serialized = team->t.t_serialized; 4048 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4049 4050 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4051 4052 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4053 tid, gtid, this_thr, this_thr->th.th_current_task)); 4054 4055 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4056 team, tid, TRUE); 4057 4058 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4059 tid, gtid, this_thr, this_thr->th.th_current_task)); 4060 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4061 // __kmp_initialize_team()? 
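// What follows wires up the per-thread state that depends on the team slot:
// the cached dispatch pointer and its private dispatch buffers (a single
// dispatch_private_info_t for a 1-thread team, __kmp_dispatch_num_buffers of
// them otherwise), the threadprivate common table, the contention-group root
// inherited from the master, and the task-state memo stack.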
4062 4063 /* TODO no worksharing in speculative threads */ 4064 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4065 4066 this_thr->th.th_local.this_construct = 0; 4067 4068 if (!this_thr->th.th_pri_common) { 4069 this_thr->th.th_pri_common = 4070 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4071 if (__kmp_storage_map) { 4072 __kmp_print_storage_map_gtid( 4073 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4074 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4075 } 4076 this_thr->th.th_pri_head = NULL; 4077 } 4078 4079 if (this_thr != master && // Master's CG root is initialized elsewhere 4080 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4081 // Make new thread's CG root same as master's 4082 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4083 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4084 if (tmp) { 4085 // worker changes CG, need to check if old CG should be freed 4086 int i = tmp->cg_nthreads--; 4087 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4088 " on node %p of thread %p to %d\n", 4089 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4090 if (i == 1) { 4091 __kmp_free(tmp); // last thread left CG --> free it 4092 } 4093 } 4094 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4095 // Increment new thread's CG root's counter to add the new thread 4096 this_thr->th.th_cg_roots->cg_nthreads++; 4097 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4098 " node %p of thread %p to %d\n", 4099 this_thr, this_thr->th.th_cg_roots, 4100 this_thr->th.th_cg_roots->cg_root, 4101 this_thr->th.th_cg_roots->cg_nthreads)); 4102 this_thr->th.th_current_task->td_icvs.thread_limit = 4103 this_thr->th.th_cg_roots->cg_thread_limit; 4104 } 4105 4106 /* Initialize dynamic dispatch */ 4107 { 4108 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4109 // Use team max_nproc since this will never change for the team. 4110 size_t disp_size = 4111 sizeof(dispatch_private_info_t) * 4112 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4113 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4114 team->t.t_max_nproc)); 4115 KMP_ASSERT(dispatch); 4116 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4117 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4118 4119 dispatch->th_disp_index = 0; 4120 dispatch->th_doacross_buf_idx = 0; 4121 if (!dispatch->th_disp_buffer) { 4122 dispatch->th_disp_buffer = 4123 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4124 4125 if (__kmp_storage_map) { 4126 __kmp_print_storage_map_gtid( 4127 gtid, &dispatch->th_disp_buffer[0], 4128 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4129 ? 
1 4130 : __kmp_dispatch_num_buffers], 4131 disp_size, 4132 "th_%d.th_dispatch.th_disp_buffer " 4133 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4134 gtid, team->t.t_id, gtid); 4135 } 4136 } else { 4137 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4138 } 4139 4140 dispatch->th_dispatch_pr_current = 0; 4141 dispatch->th_dispatch_sh_current = 0; 4142 4143 dispatch->th_deo_fcn = 0; /* ORDERED */ 4144 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4145 } 4146 4147 this_thr->th.th_next_pool = NULL; 4148 4149 if (!this_thr->th.th_task_state_memo_stack) { 4150 size_t i; 4151 this_thr->th.th_task_state_memo_stack = 4152 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4153 this_thr->th.th_task_state_top = 0; 4154 this_thr->th.th_task_state_stack_sz = 4; 4155 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4156 ++i) // zero init the stack 4157 this_thr->th.th_task_state_memo_stack[i] = 0; 4158 } 4159 4160 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4161 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4162 4163 KMP_MB(); 4164 } 4165 4166 /* allocate a new thread for the requesting team. this is only called from 4167 within a forkjoin critical section. we will first try to get an available 4168 thread from the thread pool. if none is available, we will fork a new one 4169 assuming we are able to create a new one. this should be assured, as the 4170 caller should check on this first. */ 4171 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4172 int new_tid) { 4173 kmp_team_t *serial_team; 4174 kmp_info_t *new_thr; 4175 int new_gtid; 4176 4177 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4178 KMP_DEBUG_ASSERT(root && team); 4179 #if !KMP_NESTED_HOT_TEAMS 4180 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4181 #endif 4182 KMP_MB(); 4183 4184 /* first, try to get one from the thread pool */ 4185 if (__kmp_thread_pool) { 4186 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4187 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4188 if (new_thr == __kmp_thread_pool_insert_pt) { 4189 __kmp_thread_pool_insert_pt = NULL; 4190 } 4191 TCW_4(new_thr->th.th_in_pool, FALSE); 4192 __kmp_suspend_initialize_thread(new_thr); 4193 __kmp_lock_suspend_mx(new_thr); 4194 if (new_thr->th.th_active_in_pool == TRUE) { 4195 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4196 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4197 new_thr->th.th_active_in_pool = FALSE; 4198 } 4199 __kmp_unlock_suspend_mx(new_thr); 4200 4201 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4202 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4203 KMP_ASSERT(!new_thr->th.th_team); 4204 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4205 4206 /* setup the thread structure */ 4207 __kmp_initialize_info(new_thr, team, new_tid, 4208 new_thr->th.th_info.ds.ds_gtid); 4209 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4210 4211 TCW_4(__kmp_nth, __kmp_nth + 1); 4212 4213 new_thr->th.th_task_state = 0; 4214 new_thr->th.th_task_state_top = 0; 4215 new_thr->th.th_task_state_stack_sz = 4; 4216 4217 #ifdef KMP_ADJUST_BLOCKTIME 4218 /* Adjust blocktime back to zero if necessary */ 4219 /* Middle initialization might not have occurred yet */ 4220 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4221 if (__kmp_nth > __kmp_avail_proc) { 4222 __kmp_zero_bt = TRUE; 4223 } 4224 } 4225 #endif /* KMP_ADJUST_BLOCKTIME */ 4226 4227 #if KMP_DEBUG 4228 // If thread entered pool via __kmp_free_thread, wait_flag should != 4229 // KMP_BARRIER_PARENT_FLAG. 
4230 int b; 4231 kmp_balign_t *balign = new_thr->th.th_bar; 4232 for (b = 0; b < bs_last_barrier; ++b) 4233 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4234 #endif 4235 4236 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4237 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4238 4239 KMP_MB(); 4240 return new_thr; 4241 } 4242 4243 /* no, well fork a new one */ 4244 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4245 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4246 4247 #if KMP_USE_MONITOR 4248 // If this is the first worker thread the RTL is creating, then also 4249 // launch the monitor thread. We try to do this as early as possible. 4250 if (!TCR_4(__kmp_init_monitor)) { 4251 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4252 if (!TCR_4(__kmp_init_monitor)) { 4253 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4254 TCW_4(__kmp_init_monitor, 1); 4255 __kmp_create_monitor(&__kmp_monitor); 4256 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4257 #if KMP_OS_WINDOWS 4258 // AC: wait until monitor has started. This is a fix for CQ232808. 4259 // The reason is that if the library is loaded/unloaded in a loop with 4260 // small (parallel) work in between, then there is high probability that 4261 // monitor thread started after the library shutdown. At shutdown it is 4262 // too late to cope with the problem, because when the master is in 4263 // DllMain (process detach) the monitor has no chances to start (it is 4264 // blocked), and master has no means to inform the monitor that the 4265 // library has gone, because all the memory which the monitor can access 4266 // is going to be released/reset. 4267 while (TCR_4(__kmp_init_monitor) < 2) { 4268 KMP_YIELD(TRUE); 4269 } 4270 KF_TRACE(10, ("after monitor thread has started\n")); 4271 #endif 4272 } 4273 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4274 } 4275 #endif 4276 4277 KMP_MB(); 4278 4279 { 4280 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4281 ? 1 4282 : __kmp_hidden_helper_threads_num + 1; 4283 4284 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4285 ++new_gtid) { 4286 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4287 } 4288 4289 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4290 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4291 } 4292 } 4293 4294 /* allocate space for it. 
*/ 4295 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4296 4297 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4298 4299 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4300 // suppress race conditions detection on synchronization flags in debug mode 4301 // this helps to analyze library internals eliminating false positives 4302 __itt_suppress_mark_range( 4303 __itt_suppress_range, __itt_suppress_threading_errors, 4304 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4305 __itt_suppress_mark_range( 4306 __itt_suppress_range, __itt_suppress_threading_errors, 4307 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4308 #if KMP_OS_WINDOWS 4309 __itt_suppress_mark_range( 4310 __itt_suppress_range, __itt_suppress_threading_errors, 4311 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4312 #else 4313 __itt_suppress_mark_range(__itt_suppress_range, 4314 __itt_suppress_threading_errors, 4315 &new_thr->th.th_suspend_init_count, 4316 sizeof(new_thr->th.th_suspend_init_count)); 4317 #endif 4318 // TODO: check if we need to also suppress b_arrived flags 4319 __itt_suppress_mark_range(__itt_suppress_range, 4320 __itt_suppress_threading_errors, 4321 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4322 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4323 __itt_suppress_mark_range(__itt_suppress_range, 4324 __itt_suppress_threading_errors, 4325 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4326 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4327 __itt_suppress_mark_range(__itt_suppress_range, 4328 __itt_suppress_threading_errors, 4329 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4330 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4331 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4332 if (__kmp_storage_map) { 4333 __kmp_print_thread_storage_map(new_thr, new_gtid); 4334 } 4335 4336 // add the reserve serialized team, initialized from the team's master thread 4337 { 4338 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4339 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4340 new_thr->th.th_serial_team = serial_team = 4341 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4342 #if OMPT_SUPPORT 4343 ompt_data_none, // root parallel id 4344 #endif 4345 proc_bind_default, &r_icvs, 4346 0 USE_NESTED_HOT_ARG(NULL)); 4347 } 4348 KMP_ASSERT(serial_team); 4349 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4350 // execution (it is unused for now). 
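// The serial team allocated just above is held in reserve for serialized
// parallel regions this thread may hit later; t_serialized == 0 marks it as
// not currently in use.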
4351 serial_team->t.t_threads[0] = new_thr; 4352 KF_TRACE(10, 4353 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4354 new_thr)); 4355 4356 /* setup the thread structures */ 4357 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4358 4359 #if USE_FAST_MEMORY 4360 __kmp_initialize_fast_memory(new_thr); 4361 #endif /* USE_FAST_MEMORY */ 4362 4363 #if KMP_USE_BGET 4364 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4365 __kmp_initialize_bget(new_thr); 4366 #endif 4367 4368 __kmp_init_random(new_thr); // Initialize random number generator 4369 4370 /* Initialize these only once when thread is grabbed for a team allocation */ 4371 KA_TRACE(20, 4372 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4373 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4374 4375 int b; 4376 kmp_balign_t *balign = new_thr->th.th_bar; 4377 for (b = 0; b < bs_last_barrier; ++b) { 4378 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4379 balign[b].bb.team = NULL; 4380 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4381 balign[b].bb.use_oncore_barrier = 0; 4382 } 4383 4384 new_thr->th.th_spin_here = FALSE; 4385 new_thr->th.th_next_waiting = 0; 4386 #if KMP_OS_UNIX 4387 new_thr->th.th_blocking = false; 4388 #endif 4389 4390 #if KMP_AFFINITY_SUPPORTED 4391 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4392 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4393 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4394 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4395 #endif 4396 new_thr->th.th_def_allocator = __kmp_def_allocator; 4397 new_thr->th.th_prev_level = 0; 4398 new_thr->th.th_prev_num_threads = 1; 4399 4400 TCW_4(new_thr->th.th_in_pool, FALSE); 4401 new_thr->th.th_active_in_pool = FALSE; 4402 TCW_4(new_thr->th.th_active, TRUE); 4403 4404 /* adjust the global counters */ 4405 __kmp_all_nth++; 4406 __kmp_nth++; 4407 4408 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4409 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4410 if (__kmp_adjust_gtid_mode) { 4411 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4412 if (TCR_4(__kmp_gtid_mode) != 2) { 4413 TCW_4(__kmp_gtid_mode, 2); 4414 } 4415 } else { 4416 if (TCR_4(__kmp_gtid_mode) != 1) { 4417 TCW_4(__kmp_gtid_mode, 1); 4418 } 4419 } 4420 } 4421 4422 #ifdef KMP_ADJUST_BLOCKTIME 4423 /* Adjust blocktime back to zero if necessary */ 4424 /* Middle initialization might not have occurred yet */ 4425 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4426 if (__kmp_nth > __kmp_avail_proc) { 4427 __kmp_zero_bt = TRUE; 4428 } 4429 } 4430 #endif /* KMP_ADJUST_BLOCKTIME */ 4431 4432 /* actually fork it and create the new worker thread */ 4433 KF_TRACE( 4434 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4435 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4436 KF_TRACE(10, 4437 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4438 4439 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4440 new_gtid)); 4441 KMP_MB(); 4442 return new_thr; 4443 } 4444 4445 /* Reinitialize team for reuse. 4446 The hot team code calls this case at every fork barrier, so EPCC barrier 4447 test are extremely sensitive to changes in it, esp. writes to the team 4448 struct, which cause a cache invalidation in all threads. 4449 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
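That is also why the body relies on KMP_CHECK_UPDATE(), which expands to roughly "if ((a) != (b)) (a) = (b)": skipping stores whose value is unchanged avoids dirtying team-struct cache lines that every worker already has cached.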
*/ 4450 static void __kmp_reinitialize_team(kmp_team_t *team, 4451 kmp_internal_control_t *new_icvs, 4452 ident_t *loc) { 4453 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4454 team->t.t_threads[0], team)); 4455 KMP_DEBUG_ASSERT(team && new_icvs); 4456 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4457 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4458 4459 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4460 // Copy ICVs to the master thread's implicit taskdata 4461 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4462 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4463 4464 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4465 team->t.t_threads[0], team)); 4466 } 4467 4468 /* Initialize the team data structure. 4469 This assumes the t_threads and t_max_nproc are already set. 4470 Also, we don't touch the arguments */ 4471 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4472 kmp_internal_control_t *new_icvs, 4473 ident_t *loc) { 4474 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4475 4476 /* verify */ 4477 KMP_DEBUG_ASSERT(team); 4478 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4479 KMP_DEBUG_ASSERT(team->t.t_threads); 4480 KMP_MB(); 4481 4482 team->t.t_master_tid = 0; /* not needed */ 4483 /* team->t.t_master_bar; not needed */ 4484 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4485 team->t.t_nproc = new_nproc; 4486 4487 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4488 team->t.t_next_pool = NULL; 4489 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4490 * up hot team */ 4491 4492 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4493 team->t.t_invoke = NULL; /* not needed */ 4494 4495 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4496 team->t.t_sched.sched = new_icvs->sched.sched; 4497 4498 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4499 team->t.t_fp_control_saved = FALSE; /* not needed */ 4500 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4501 team->t.t_mxcsr = 0; /* not needed */ 4502 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4503 4504 team->t.t_construct = 0; 4505 4506 team->t.t_ordered.dt.t_value = 0; 4507 team->t.t_master_active = FALSE; 4508 4509 #ifdef KMP_DEBUG 4510 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4511 #endif 4512 #if KMP_OS_WINDOWS 4513 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4514 #endif 4515 4516 team->t.t_control_stack_top = NULL; 4517 4518 __kmp_reinitialize_team(team, new_icvs, loc); 4519 4520 KMP_MB(); 4521 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4522 } 4523 4524 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4525 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4526 static void 4527 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4528 if (KMP_AFFINITY_CAPABLE()) { 4529 int status; 4530 if (old_mask != NULL) { 4531 status = __kmp_get_system_affinity(old_mask, TRUE); 4532 int error = errno; 4533 if (status != 0) { 4534 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4535 __kmp_msg_null); 4536 } 4537 } 4538 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4539 } 4540 } 4541 #endif 4542 4543 #if KMP_AFFINITY_SUPPORTED 4544 4545 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4546 // It calculates the worker + master thread's partition based upon the parent 4547 // thread's partition, and binds each worker to a thread in their partition. 4548 // The master thread's partition should already include its current binding. 4549 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4550 // Copy the master thread's place partition to the team struct 4551 kmp_info_t *master_th = team->t.t_threads[0]; 4552 KMP_DEBUG_ASSERT(master_th != NULL); 4553 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4554 int first_place = master_th->th.th_first_place; 4555 int last_place = master_th->th.th_last_place; 4556 int masters_place = master_th->th.th_current_place; 4557 team->t.t_first_place = first_place; 4558 team->t.t_last_place = last_place; 4559 4560 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4561 "bound to place %d partition = [%d,%d]\n", 4562 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4563 team->t.t_id, masters_place, first_place, last_place)); 4564 4565 switch (proc_bind) { 4566 4567 case proc_bind_default: 4568 // serial teams might have the proc_bind policy set to proc_bind_default. It 4569 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4570 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4571 break; 4572 4573 case proc_bind_master: { 4574 int f; 4575 int n_th = team->t.t_nproc; 4576 for (f = 1; f < n_th; f++) { 4577 kmp_info_t *th = team->t.t_threads[f]; 4578 KMP_DEBUG_ASSERT(th != NULL); 4579 th->th.th_first_place = first_place; 4580 th->th.th_last_place = last_place; 4581 th->th.th_new_place = masters_place; 4582 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4583 team->t.t_display_affinity != 1) { 4584 team->t.t_display_affinity = 1; 4585 } 4586 4587 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4588 "partition = [%d,%d]\n", 4589 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4590 f, masters_place, first_place, last_place)); 4591 } 4592 } break; 4593 4594 case proc_bind_close: { 4595 int f; 4596 int n_th = team->t.t_nproc; 4597 int n_places; 4598 if (first_place <= last_place) { 4599 n_places = last_place - first_place + 1; 4600 } else { 4601 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4602 } 4603 if (n_th <= n_places) { 4604 int place = masters_place; 4605 for (f = 1; f < n_th; f++) { 4606 kmp_info_t *th = team->t.t_threads[f]; 4607 KMP_DEBUG_ASSERT(th != NULL); 4608 4609 if (place == last_place) { 4610 place = first_place; 4611 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4612 place = 0; 4613 } else { 4614 place++; 4615 } 4616 th->th.th_first_place = first_place; 4617 th->th.th_last_place = last_place; 4618 th->th.th_new_place = place; 4619 if (__kmp_display_affinity && place != th->th.th_current_place && 4620 team->t.t_display_affinity != 1) { 4621 team->t.t_display_affinity = 1; 4622 } 4623 4624 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4625 "partition = [%d,%d]\n", 4626 __kmp_gtid_from_thread(team->t.t_threads[f]), 4627 team->t.t_id, f, place, first_place, last_place)); 4628 } 4629 } else { 4630 int S, rem, gap, s_count; 4631 S = n_th / n_places; 4632 s_count = 0; 4633 rem = n_th - (S * n_places); 4634 gap = rem > 0 ? 
n_places / rem : n_places; 4635 int place = masters_place; 4636 int gap_ct = gap; 4637 for (f = 0; f < n_th; f++) { 4638 kmp_info_t *th = team->t.t_threads[f]; 4639 KMP_DEBUG_ASSERT(th != NULL); 4640 4641 th->th.th_first_place = first_place; 4642 th->th.th_last_place = last_place; 4643 th->th.th_new_place = place; 4644 if (__kmp_display_affinity && place != th->th.th_current_place && 4645 team->t.t_display_affinity != 1) { 4646 team->t.t_display_affinity = 1; 4647 } 4648 s_count++; 4649 4650 if ((s_count == S) && rem && (gap_ct == gap)) { 4651 // do nothing, add an extra thread to place on next iteration 4652 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4653 // we added an extra thread to this place; move to next place 4654 if (place == last_place) { 4655 place = first_place; 4656 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4657 place = 0; 4658 } else { 4659 place++; 4660 } 4661 s_count = 0; 4662 gap_ct = 1; 4663 rem--; 4664 } else if (s_count == S) { // place full; don't add extra 4665 if (place == last_place) { 4666 place = first_place; 4667 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4668 place = 0; 4669 } else { 4670 place++; 4671 } 4672 gap_ct++; 4673 s_count = 0; 4674 } 4675 4676 KA_TRACE(100, 4677 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4678 "partition = [%d,%d]\n", 4679 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4680 th->th.th_new_place, first_place, last_place)); 4681 } 4682 KMP_DEBUG_ASSERT(place == masters_place); 4683 } 4684 } break; 4685 4686 case proc_bind_spread: { 4687 int f; 4688 int n_th = team->t.t_nproc; 4689 int n_places; 4690 int thidx; 4691 if (first_place <= last_place) { 4692 n_places = last_place - first_place + 1; 4693 } else { 4694 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4695 } 4696 if (n_th <= n_places) { 4697 int place = -1; 4698 4699 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4700 int S = n_places / n_th; 4701 int s_count, rem, gap, gap_ct; 4702 4703 place = masters_place; 4704 rem = n_places - n_th * S; 4705 gap = rem ? 
n_th / rem : 1; 4706 gap_ct = gap; 4707 thidx = n_th; 4708 if (update_master_only == 1) 4709 thidx = 1; 4710 for (f = 0; f < thidx; f++) { 4711 kmp_info_t *th = team->t.t_threads[f]; 4712 KMP_DEBUG_ASSERT(th != NULL); 4713 4714 th->th.th_first_place = place; 4715 th->th.th_new_place = place; 4716 if (__kmp_display_affinity && place != th->th.th_current_place && 4717 team->t.t_display_affinity != 1) { 4718 team->t.t_display_affinity = 1; 4719 } 4720 s_count = 1; 4721 while (s_count < S) { 4722 if (place == last_place) { 4723 place = first_place; 4724 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4725 place = 0; 4726 } else { 4727 place++; 4728 } 4729 s_count++; 4730 } 4731 if (rem && (gap_ct == gap)) { 4732 if (place == last_place) { 4733 place = first_place; 4734 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4735 place = 0; 4736 } else { 4737 place++; 4738 } 4739 rem--; 4740 gap_ct = 0; 4741 } 4742 th->th.th_last_place = place; 4743 gap_ct++; 4744 4745 if (place == last_place) { 4746 place = first_place; 4747 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4748 place = 0; 4749 } else { 4750 place++; 4751 } 4752 4753 KA_TRACE(100, 4754 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4755 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4756 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4757 f, th->th.th_new_place, th->th.th_first_place, 4758 th->th.th_last_place, __kmp_affinity_num_masks)); 4759 } 4760 } else { 4761 /* Having uniform space of available computation places I can create 4762 T partitions of round(P/T) size and put threads into the first 4763 place of each partition. */ 4764 double current = static_cast<double>(masters_place); 4765 double spacing = 4766 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4767 int first, last; 4768 kmp_info_t *th; 4769 4770 thidx = n_th + 1; 4771 if (update_master_only == 1) 4772 thidx = 1; 4773 for (f = 0; f < thidx; f++) { 4774 first = static_cast<int>(current); 4775 last = static_cast<int>(current + spacing) - 1; 4776 KMP_DEBUG_ASSERT(last >= first); 4777 if (first >= n_places) { 4778 if (masters_place) { 4779 first -= n_places; 4780 last -= n_places; 4781 if (first == (masters_place + 1)) { 4782 KMP_DEBUG_ASSERT(f == n_th); 4783 first--; 4784 } 4785 if (last == masters_place) { 4786 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4787 last--; 4788 } 4789 } else { 4790 KMP_DEBUG_ASSERT(f == n_th); 4791 first = 0; 4792 last = 0; 4793 } 4794 } 4795 if (last >= n_places) { 4796 last = (n_places - 1); 4797 } 4798 place = first; 4799 current += spacing; 4800 if (f < n_th) { 4801 KMP_DEBUG_ASSERT(0 <= first); 4802 KMP_DEBUG_ASSERT(n_places > first); 4803 KMP_DEBUG_ASSERT(0 <= last); 4804 KMP_DEBUG_ASSERT(n_places > last); 4805 KMP_DEBUG_ASSERT(last_place >= first_place); 4806 th = team->t.t_threads[f]; 4807 KMP_DEBUG_ASSERT(th); 4808 th->th.th_first_place = first; 4809 th->th.th_new_place = place; 4810 th->th.th_last_place = last; 4811 if (__kmp_display_affinity && place != th->th.th_current_place && 4812 team->t.t_display_affinity != 1) { 4813 team->t.t_display_affinity = 1; 4814 } 4815 KA_TRACE(100, 4816 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4817 "partition = [%d,%d], spacing = %.4f\n", 4818 __kmp_gtid_from_thread(team->t.t_threads[f]), 4819 team->t.t_id, f, th->th.th_new_place, 4820 th->th.th_first_place, th->th.th_last_place, spacing)); 4821 } 4822 } 4823 } 4824 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4825 } else { 4826 int S, rem, gap, 
s_count; 4827 S = n_th / n_places; 4828 s_count = 0; 4829 rem = n_th - (S * n_places); 4830 gap = rem > 0 ? n_places / rem : n_places; 4831 int place = masters_place; 4832 int gap_ct = gap; 4833 thidx = n_th; 4834 if (update_master_only == 1) 4835 thidx = 1; 4836 for (f = 0; f < thidx; f++) { 4837 kmp_info_t *th = team->t.t_threads[f]; 4838 KMP_DEBUG_ASSERT(th != NULL); 4839 4840 th->th.th_first_place = place; 4841 th->th.th_last_place = place; 4842 th->th.th_new_place = place; 4843 if (__kmp_display_affinity && place != th->th.th_current_place && 4844 team->t.t_display_affinity != 1) { 4845 team->t.t_display_affinity = 1; 4846 } 4847 s_count++; 4848 4849 if ((s_count == S) && rem && (gap_ct == gap)) { 4850 // do nothing, add an extra thread to place on next iteration 4851 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4852 // we added an extra thread to this place; move on to next place 4853 if (place == last_place) { 4854 place = first_place; 4855 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4856 place = 0; 4857 } else { 4858 place++; 4859 } 4860 s_count = 0; 4861 gap_ct = 1; 4862 rem--; 4863 } else if (s_count == S) { // place is full; don't add extra thread 4864 if (place == last_place) { 4865 place = first_place; 4866 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4867 place = 0; 4868 } else { 4869 place++; 4870 } 4871 gap_ct++; 4872 s_count = 0; 4873 } 4874 4875 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4876 "partition = [%d,%d]\n", 4877 __kmp_gtid_from_thread(team->t.t_threads[f]), 4878 team->t.t_id, f, th->th.th_new_place, 4879 th->th.th_first_place, th->th.th_last_place)); 4880 } 4881 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4882 } 4883 } break; 4884 4885 default: 4886 break; 4887 } 4888 4889 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4890 } 4891 4892 #endif // KMP_AFFINITY_SUPPORTED 4893 4894 /* allocate a new team data structure to use. take one off of the free pool if 4895 available */ 4896 kmp_team_t * 4897 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4898 #if OMPT_SUPPORT 4899 ompt_data_t ompt_parallel_data, 4900 #endif 4901 kmp_proc_bind_t new_proc_bind, 4902 kmp_internal_control_t *new_icvs, 4903 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4904 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4905 int f; 4906 kmp_team_t *team; 4907 int use_hot_team = !root->r.r_active; 4908 int level = 0; 4909 4910 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4911 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4912 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4913 KMP_MB(); 4914 4915 #if KMP_NESTED_HOT_TEAMS 4916 kmp_hot_team_ptr_t *hot_teams; 4917 if (master) { 4918 team = master->th.th_team; 4919 level = team->t.t_active_level; 4920 if (master->th.th_teams_microtask) { // in teams construct? 
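// For example (illustrative sketch): with
//   #pragma omp teams num_teams(2)
//   #pragma omp parallel
//   { ... }
// the level is bumped below either for the inner fork performed by
// __kmp_teams_master or for a nested parallel inside the teams region;
// with num_teams(1), or for the outer fork of the teams construct itself,
// the level is left unchanged.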
4921 if (master->th.th_teams_size.nteams > 1 && 4922 ( // #teams > 1 4923 team->t.t_pkfn == 4924 (microtask_t)__kmp_teams_master || // inner fork of the teams 4925 master->th.th_teams_level < 4926 team->t.t_level)) { // or nested parallel inside the teams 4927 ++level; // not increment if #teams==1, or for outer fork of the teams; 4928 // increment otherwise 4929 } 4930 } 4931 hot_teams = master->th.th_hot_teams; 4932 if (level < __kmp_hot_teams_max_level && hot_teams && 4933 hot_teams[level].hot_team) { 4934 // hot team has already been allocated for given level 4935 use_hot_team = 1; 4936 } else { 4937 use_hot_team = 0; 4938 } 4939 } else { 4940 // check we won't access uninitialized hot_teams, just in case 4941 KMP_DEBUG_ASSERT(new_nproc == 1); 4942 } 4943 #endif 4944 // Optimization to use a "hot" team 4945 if (use_hot_team && new_nproc > 1) { 4946 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4947 #if KMP_NESTED_HOT_TEAMS 4948 team = hot_teams[level].hot_team; 4949 #else 4950 team = root->r.r_hot_team; 4951 #endif 4952 #if KMP_DEBUG 4953 if (__kmp_tasking_mode != tskm_immediate_exec) { 4954 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4955 "task_team[1] = %p before reinit\n", 4956 team->t.t_task_team[0], team->t.t_task_team[1])); 4957 } 4958 #endif 4959 4960 // Has the number of threads changed? 4961 /* Let's assume the most common case is that the number of threads is 4962 unchanged, and put that case first. */ 4963 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4964 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4965 // This case can mean that omp_set_num_threads() was called and the hot 4966 // team size was already reduced, so we check the special flag 4967 if (team->t.t_size_changed == -1) { 4968 team->t.t_size_changed = 1; 4969 } else { 4970 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4971 } 4972 4973 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4974 kmp_r_sched_t new_sched = new_icvs->sched; 4975 // set master's schedule as new run-time schedule 4976 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 4977 4978 __kmp_reinitialize_team(team, new_icvs, 4979 root->r.r_uber_thread->th.th_ident); 4980 4981 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 4982 team->t.t_threads[0], team)); 4983 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4984 4985 #if KMP_AFFINITY_SUPPORTED 4986 if ((team->t.t_size_changed == 0) && 4987 (team->t.t_proc_bind == new_proc_bind)) { 4988 if (new_proc_bind == proc_bind_spread) { 4989 __kmp_partition_places( 4990 team, 1); // add flag to update only master for spread 4991 } 4992 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 4993 "proc_bind = %d, partition = [%d,%d]\n", 4994 team->t.t_id, new_proc_bind, team->t.t_first_place, 4995 team->t.t_last_place)); 4996 } else { 4997 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4998 __kmp_partition_places(team); 4999 } 5000 #else 5001 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5002 #endif /* KMP_AFFINITY_SUPPORTED */ 5003 } else if (team->t.t_nproc > new_nproc) { 5004 KA_TRACE(20, 5005 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5006 new_nproc)); 5007 5008 team->t.t_size_changed = 1; 5009 #if KMP_NESTED_HOT_TEAMS 5010 if (__kmp_hot_teams_mode == 0) { 5011 // AC: saved number of threads should correspond to team's value in this 5012 // mode, can be bigger in mode 1, when hot team has threads in reserve 5013 
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5014 hot_teams[level].hot_team_nth = new_nproc; 5015 #endif // KMP_NESTED_HOT_TEAMS 5016 /* release the extra threads we don't need any more */ 5017 for (f = new_nproc; f < team->t.t_nproc; f++) { 5018 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5019 if (__kmp_tasking_mode != tskm_immediate_exec) { 5020 // When decreasing team size, threads no longer in the team should 5021 // unref task team. 5022 team->t.t_threads[f]->th.th_task_team = NULL; 5023 } 5024 __kmp_free_thread(team->t.t_threads[f]); 5025 team->t.t_threads[f] = NULL; 5026 } 5027 #if KMP_NESTED_HOT_TEAMS 5028 } // (__kmp_hot_teams_mode == 0) 5029 else { 5030 // When keeping extra threads in team, switch threads to wait on own 5031 // b_go flag 5032 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5033 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5034 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5035 for (int b = 0; b < bs_last_barrier; ++b) { 5036 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5037 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5038 } 5039 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5040 } 5041 } 5042 } 5043 #endif // KMP_NESTED_HOT_TEAMS 5044 team->t.t_nproc = new_nproc; 5045 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5046 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5047 __kmp_reinitialize_team(team, new_icvs, 5048 root->r.r_uber_thread->th.th_ident); 5049 5050 // Update remaining threads 5051 for (f = 0; f < new_nproc; ++f) { 5052 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5053 } 5054 5055 // restore the current task state of the master thread: should be the 5056 // implicit task 5057 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5058 team->t.t_threads[0], team)); 5059 5060 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5061 5062 #ifdef KMP_DEBUG 5063 for (f = 0; f < team->t.t_nproc; f++) { 5064 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5065 team->t.t_threads[f]->th.th_team_nproc == 5066 team->t.t_nproc); 5067 } 5068 #endif 5069 5070 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5071 #if KMP_AFFINITY_SUPPORTED 5072 __kmp_partition_places(team); 5073 #endif 5074 } else { // team->t.t_nproc < new_nproc 5075 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5076 kmp_affin_mask_t *old_mask; 5077 if (KMP_AFFINITY_CAPABLE()) { 5078 KMP_CPU_ALLOC(old_mask); 5079 } 5080 #endif 5081 5082 KA_TRACE(20, 5083 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5084 new_nproc)); 5085 5086 team->t.t_size_changed = 1; 5087 5088 #if KMP_NESTED_HOT_TEAMS 5089 int avail_threads = hot_teams[level].hot_team_nth; 5090 if (new_nproc < avail_threads) 5091 avail_threads = new_nproc; 5092 kmp_info_t **other_threads = team->t.t_threads; 5093 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5094 // Adjust barrier data of reserved threads (if any) of the team 5095 // Other data will be set in __kmp_initialize_info() below. 
5096 int b; 5097 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5098 for (b = 0; b < bs_last_barrier; ++b) { 5099 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5100 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5101 #if USE_DEBUGGER 5102 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5103 #endif 5104 } 5105 } 5106 if (hot_teams[level].hot_team_nth >= new_nproc) { 5107 // we have all needed threads in reserve, no need to allocate any 5108 // this only possible in mode 1, cannot have reserved threads in mode 0 5109 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5110 team->t.t_nproc = new_nproc; // just get reserved threads involved 5111 } else { 5112 // we may have some threads in reserve, but not enough 5113 team->t.t_nproc = 5114 hot_teams[level] 5115 .hot_team_nth; // get reserved threads involved if any 5116 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5117 #endif // KMP_NESTED_HOT_TEAMS 5118 if (team->t.t_max_nproc < new_nproc) { 5119 /* reallocate larger arrays */ 5120 __kmp_reallocate_team_arrays(team, new_nproc); 5121 __kmp_reinitialize_team(team, new_icvs, NULL); 5122 } 5123 5124 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5125 /* Temporarily set full mask for master thread before creation of 5126 workers. The reason is that workers inherit the affinity from master, 5127 so if a lot of workers are created on the single core quickly, they 5128 don't get a chance to set their own affinity for a long time. */ 5129 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5130 #endif 5131 5132 /* allocate new threads for the hot team */ 5133 for (f = team->t.t_nproc; f < new_nproc; f++) { 5134 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5135 KMP_DEBUG_ASSERT(new_worker); 5136 team->t.t_threads[f] = new_worker; 5137 5138 KA_TRACE(20, 5139 ("__kmp_allocate_team: team %d init T#%d arrived: " 5140 "join=%llu, plain=%llu\n", 5141 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5142 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5143 team->t.t_bar[bs_plain_barrier].b_arrived)); 5144 5145 { // Initialize barrier data for new threads. 5146 int b; 5147 kmp_balign_t *balign = new_worker->th.th_bar; 5148 for (b = 0; b < bs_last_barrier; ++b) { 5149 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5150 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5151 KMP_BARRIER_PARENT_FLAG); 5152 #if USE_DEBUGGER 5153 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5154 #endif 5155 } 5156 } 5157 } 5158 5159 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5160 if (KMP_AFFINITY_CAPABLE()) { 5161 /* Restore initial master thread's affinity mask */ 5162 __kmp_set_system_affinity(old_mask, TRUE); 5163 KMP_CPU_FREE(old_mask); 5164 } 5165 #endif 5166 #if KMP_NESTED_HOT_TEAMS 5167 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5168 #endif // KMP_NESTED_HOT_TEAMS 5169 /* make sure everyone is syncronized */ 5170 int old_nproc = team->t.t_nproc; // save old value and use to update only 5171 // new threads below 5172 __kmp_initialize_team(team, new_nproc, new_icvs, 5173 root->r.r_uber_thread->th.th_ident); 5174 5175 /* reinitialize the threads */ 5176 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5177 for (f = 0; f < team->t.t_nproc; ++f) 5178 __kmp_initialize_info(team->t.t_threads[f], team, f, 5179 __kmp_gtid_from_tid(f, team)); 5180 5181 if (level) { // set th_task_state for new threads in nested hot team 5182 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5183 // only need to set the th_task_state for the new threads. th_task_state 5184 // for master thread will not be accurate until after this in 5185 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5186 // correct value. 5187 for (f = old_nproc; f < team->t.t_nproc; ++f) 5188 team->t.t_threads[f]->th.th_task_state = 5189 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5190 } else { // set th_task_state for new threads in non-nested hot team 5191 kmp_uint8 old_state = 5192 team->t.t_threads[0]->th.th_task_state; // copy master's state 5193 for (f = old_nproc; f < team->t.t_nproc; ++f) 5194 team->t.t_threads[f]->th.th_task_state = old_state; 5195 } 5196 5197 #ifdef KMP_DEBUG 5198 for (f = 0; f < team->t.t_nproc; ++f) { 5199 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5200 team->t.t_threads[f]->th.th_team_nproc == 5201 team->t.t_nproc); 5202 } 5203 #endif 5204 5205 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5206 #if KMP_AFFINITY_SUPPORTED 5207 __kmp_partition_places(team); 5208 #endif 5209 } // Check changes in number of threads 5210 5211 kmp_info_t *master = team->t.t_threads[0]; 5212 if (master->th.th_teams_microtask) { 5213 for (f = 1; f < new_nproc; ++f) { 5214 // propagate teams construct specific info to workers 5215 kmp_info_t *thr = team->t.t_threads[f]; 5216 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5217 thr->th.th_teams_level = master->th.th_teams_level; 5218 thr->th.th_teams_size = master->th.th_teams_size; 5219 } 5220 } 5221 #if KMP_NESTED_HOT_TEAMS 5222 if (level) { 5223 // Sync barrier state for nested hot teams, not needed for outermost hot 5224 // team. 5225 for (f = 1; f < new_nproc; ++f) { 5226 kmp_info_t *thr = team->t.t_threads[f]; 5227 int b; 5228 kmp_balign_t *balign = thr->th.th_bar; 5229 for (b = 0; b < bs_last_barrier; ++b) { 5230 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5231 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5232 #if USE_DEBUGGER 5233 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5234 #endif 5235 } 5236 } 5237 } 5238 #endif // KMP_NESTED_HOT_TEAMS 5239 5240 /* reallocate space for arguments if necessary */ 5241 __kmp_alloc_argv_entries(argc, team, TRUE); 5242 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5243 // The hot team re-uses the previous task team, 5244 // if untouched during the previous release->gather phase. 
5245 5246 KF_TRACE(10, (" hot_team = %p\n", team)); 5247 5248 #if KMP_DEBUG 5249 if (__kmp_tasking_mode != tskm_immediate_exec) { 5250 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5251 "task_team[1] = %p after reinit\n", 5252 team->t.t_task_team[0], team->t.t_task_team[1])); 5253 } 5254 #endif 5255 5256 #if OMPT_SUPPORT 5257 __ompt_team_assign_id(team, ompt_parallel_data); 5258 #endif 5259 5260 KMP_MB(); 5261 5262 return team; 5263 } 5264 5265 /* next, let's try to take one from the team pool */ 5266 KMP_MB(); 5267 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5268 /* TODO: consider resizing undersized teams instead of reaping them, now 5269 that we have a resizing mechanism */ 5270 if (team->t.t_max_nproc >= max_nproc) { 5271 /* take this team from the team pool */ 5272 __kmp_team_pool = team->t.t_next_pool; 5273 5274 /* setup the team for fresh use */ 5275 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5276 5277 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5278 "task_team[1] %p to NULL\n", 5279 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5280 team->t.t_task_team[0] = NULL; 5281 team->t.t_task_team[1] = NULL; 5282 5283 /* reallocate space for arguments if necessary */ 5284 __kmp_alloc_argv_entries(argc, team, TRUE); 5285 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5286 5287 KA_TRACE( 5288 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5289 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5290 { // Initialize barrier data. 5291 int b; 5292 for (b = 0; b < bs_last_barrier; ++b) { 5293 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5294 #if USE_DEBUGGER 5295 team->t.t_bar[b].b_master_arrived = 0; 5296 team->t.t_bar[b].b_team_arrived = 0; 5297 #endif 5298 } 5299 } 5300 5301 team->t.t_proc_bind = new_proc_bind; 5302 5303 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5304 team->t.t_id)); 5305 5306 #if OMPT_SUPPORT 5307 __ompt_team_assign_id(team, ompt_parallel_data); 5308 #endif 5309 5310 KMP_MB(); 5311 5312 return team; 5313 } 5314 5315 /* reap team if it is too small, then loop back and check the next one */ 5316 // not sure if this is wise, but, will be redone during the hot-teams 5317 // rewrite. 5318 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5319 team = __kmp_reap_team(team); 5320 __kmp_team_pool = team; 5321 } 5322 5323 /* nothing available in the pool, no matter, make a new team! 
*/ 5324 KMP_MB(); 5325 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5326 5327 /* and set it up */ 5328 team->t.t_max_nproc = max_nproc; 5329 /* NOTE well, for some reason allocating one big buffer and dividing it up 5330 seems to really hurt performance a lot on the P4, so, let's not use this */ 5331 __kmp_allocate_team_arrays(team, max_nproc); 5332 5333 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5334 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5335 5336 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5337 "%p to NULL\n", 5338 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5339 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5340 // memory, no need to duplicate 5341 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5342 // memory, no need to duplicate 5343 5344 if (__kmp_storage_map) { 5345 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5346 } 5347 5348 /* allocate space for arguments */ 5349 __kmp_alloc_argv_entries(argc, team, FALSE); 5350 team->t.t_argc = argc; 5351 5352 KA_TRACE(20, 5353 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5354 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5355 { // Initialize barrier data. 5356 int b; 5357 for (b = 0; b < bs_last_barrier; ++b) { 5358 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5359 #if USE_DEBUGGER 5360 team->t.t_bar[b].b_master_arrived = 0; 5361 team->t.t_bar[b].b_team_arrived = 0; 5362 #endif 5363 } 5364 } 5365 5366 team->t.t_proc_bind = new_proc_bind; 5367 5368 #if OMPT_SUPPORT 5369 __ompt_team_assign_id(team, ompt_parallel_data); 5370 team->t.ompt_serialized_team_info = NULL; 5371 #endif 5372 5373 KMP_MB(); 5374 5375 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5376 team->t.t_id)); 5377 5378 return team; 5379 } 5380 5381 /* TODO implement hot-teams at all levels */ 5382 /* TODO implement lazy thread release on demand (disband request) */ 5383 5384 /* free the team. return it to the team pool. release all the threads 5385 * associated with it */ 5386 void __kmp_free_team(kmp_root_t *root, 5387 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5388 int f; 5389 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5390 team->t.t_id)); 5391 5392 /* verify state */ 5393 KMP_DEBUG_ASSERT(root); 5394 KMP_DEBUG_ASSERT(team); 5395 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5396 KMP_DEBUG_ASSERT(team->t.t_threads); 5397 5398 int use_hot_team = team == root->r.r_hot_team; 5399 #if KMP_NESTED_HOT_TEAMS 5400 int level; 5401 kmp_hot_team_ptr_t *hot_teams; 5402 if (master) { 5403 level = team->t.t_active_level - 1; 5404 if (master->th.th_teams_microtask) { // in teams construct? 
5405 if (master->th.th_teams_size.nteams > 1) { 5406 ++level; // level was not increased in teams construct for 5407 // team_of_masters 5408 } 5409 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5410 master->th.th_teams_level == team->t.t_level) { 5411 ++level; // level was not increased in teams construct for 5412 // team_of_workers before the parallel 5413 } // team->t.t_level will be increased inside parallel 5414 } 5415 hot_teams = master->th.th_hot_teams; 5416 if (level < __kmp_hot_teams_max_level) { 5417 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5418 use_hot_team = 1; 5419 } 5420 } 5421 #endif // KMP_NESTED_HOT_TEAMS 5422 5423 /* team is done working */ 5424 TCW_SYNC_PTR(team->t.t_pkfn, 5425 NULL); // Important for Debugging Support Library. 5426 #if KMP_OS_WINDOWS 5427 team->t.t_copyin_counter = 0; // init counter for possible reuse 5428 #endif 5429 // Do not reset pointer to parent team to NULL for hot teams. 5430 5431 /* if we are non-hot team, release our threads */ 5432 if (!use_hot_team) { 5433 if (__kmp_tasking_mode != tskm_immediate_exec) { 5434 // Wait for threads to reach reapable state 5435 for (f = 1; f < team->t.t_nproc; ++f) { 5436 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5437 kmp_info_t *th = team->t.t_threads[f]; 5438 volatile kmp_uint32 *state = &th->th.th_reap_state; 5439 while (*state != KMP_SAFE_TO_REAP) { 5440 #if KMP_OS_WINDOWS 5441 // On Windows a thread can be killed at any time, check this 5442 DWORD ecode; 5443 if (!__kmp_is_thread_alive(th, &ecode)) { 5444 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5445 break; 5446 } 5447 #endif 5448 // first check if thread is sleeping 5449 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5450 if (fl.is_sleeping()) 5451 fl.resume(__kmp_gtid_from_thread(th)); 5452 KMP_CPU_PAUSE(); 5453 } 5454 } 5455 5456 // Delete task teams 5457 int tt_idx; 5458 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5459 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5460 if (task_team != NULL) { 5461 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5462 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5463 team->t.t_threads[f]->th.th_task_team = NULL; 5464 } 5465 KA_TRACE( 5466 20, 5467 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5468 __kmp_get_gtid(), task_team, team->t.t_id)); 5469 #if KMP_NESTED_HOT_TEAMS 5470 __kmp_free_task_team(master, task_team); 5471 #endif 5472 team->t.t_task_team[tt_idx] = NULL; 5473 } 5474 } 5475 } 5476 5477 // Reset pointer to parent team only for non-hot teams. 
5478 team->t.t_parent = NULL; 5479 team->t.t_level = 0; 5480 team->t.t_active_level = 0; 5481 5482 /* free the worker threads */ 5483 for (f = 1; f < team->t.t_nproc; ++f) { 5484 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5485 __kmp_free_thread(team->t.t_threads[f]); 5486 team->t.t_threads[f] = NULL; 5487 } 5488 5489 /* put the team back in the team pool */ 5490 /* TODO limit size of team pool, call reap_team if pool too large */ 5491 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5492 __kmp_team_pool = (volatile kmp_team_t *)team; 5493 } else { // Check if team was created for the masters in a teams construct 5494 // See if first worker is a CG root 5495 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5496 team->t.t_threads[1]->th.th_cg_roots); 5497 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5498 // Clean up the CG root nodes on workers so that this team can be re-used 5499 for (f = 1; f < team->t.t_nproc; ++f) { 5500 kmp_info_t *thr = team->t.t_threads[f]; 5501 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5502 thr->th.th_cg_roots->cg_root == thr); 5503 // Pop current CG root off list 5504 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5505 thr->th.th_cg_roots = tmp->up; 5506 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5507 " up to node %p. cg_nthreads was %d\n", 5508 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5509 int i = tmp->cg_nthreads--; 5510 if (i == 1) { 5511 __kmp_free(tmp); // free CG if we are the last thread in it 5512 } 5513 // Restore current task's thread_limit from CG root 5514 if (thr->th.th_cg_roots) 5515 thr->th.th_current_task->td_icvs.thread_limit = 5516 thr->th.th_cg_roots->cg_thread_limit; 5517 } 5518 } 5519 } 5520 5521 KMP_MB(); 5522 } 5523 5524 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5525 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5526 kmp_team_t *next_pool = team->t.t_next_pool; 5527 5528 KMP_DEBUG_ASSERT(team); 5529 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5530 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5531 KMP_DEBUG_ASSERT(team->t.t_threads); 5532 KMP_DEBUG_ASSERT(team->t.t_argv); 5533 5534 /* TODO clean the threads that are a part of this? */ 5535 5536 /* free stuff */ 5537 __kmp_free_team_arrays(team); 5538 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5539 __kmp_free((void *)team->t.t_argv); 5540 __kmp_free(team); 5541 5542 KMP_MB(); 5543 return next_pool; 5544 } 5545 5546 // Free the thread. Don't reap it, just place it on the pool of available 5547 // threads. 5548 // 5549 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5550 // binding for the affinity mechanism to be useful. 5551 // 5552 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5553 // However, we want to avoid a potential performance problem by always 5554 // scanning through the list to find the correct point at which to insert 5555 // the thread (potential N**2 behavior). To do this we keep track of the 5556 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5557 // With single-level parallelism, threads will always be added to the tail 5558 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5559 // parallelism, all bets are off and we may need to scan through the entire 5560 // free list. 5561 // 5562 // This change also has a potentially large performance benefit, for some 5563 // applications. 
Previously, as threads were freed from the hot team, they 5564 // would be placed back on the free list in inverse order. If the hot team 5565 // grew back to its original size, then the freed thread would be placed 5566 // back on the hot team in reverse order. This could cause bad cache 5567 // locality problems on programs where the size of the hot team regularly 5568 // grew and shrunk. 5569 // 5570 // Now, for single-level parallelism, the OMP tid is always == gtid. 5571 void __kmp_free_thread(kmp_info_t *this_th) { 5572 int gtid; 5573 kmp_info_t **scan; 5574 5575 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5576 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5577 5578 KMP_DEBUG_ASSERT(this_th); 5579 5580 // When moving thread to pool, switch thread to wait on own b_go flag, and 5581 // uninitialized (NULL team). 5582 int b; 5583 kmp_balign_t *balign = this_th->th.th_bar; 5584 for (b = 0; b < bs_last_barrier; ++b) { 5585 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5586 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5587 balign[b].bb.team = NULL; 5588 balign[b].bb.leaf_kids = 0; 5589 } 5590 this_th->th.th_task_state = 0; 5591 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5592 5593 /* put thread back on the free pool */ 5594 TCW_PTR(this_th->th.th_team, NULL); 5595 TCW_PTR(this_th->th.th_root, NULL); 5596 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5597 5598 while (this_th->th.th_cg_roots) { 5599 this_th->th.th_cg_roots->cg_nthreads--; 5600 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5601 " %p of thread %p to %d\n", 5602 this_th, this_th->th.th_cg_roots, 5603 this_th->th.th_cg_roots->cg_root, 5604 this_th->th.th_cg_roots->cg_nthreads)); 5605 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5606 if (tmp->cg_root == this_th) { // Thread is a cg_root 5607 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5608 KA_TRACE( 5609 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5610 this_th->th.th_cg_roots = tmp->up; 5611 __kmp_free(tmp); 5612 } else { // Worker thread 5613 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5614 __kmp_free(tmp); 5615 } 5616 this_th->th.th_cg_roots = NULL; 5617 break; 5618 } 5619 } 5620 5621 /* If the implicit task assigned to this thread can be used by other threads 5622 * -> multiple threads can share the data and try to free the task at 5623 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5624 * with higher probability when the hot team is disabled but can occur even when 5625 * the hot team is enabled */ 5626 __kmp_free_implicit_task(this_th); 5627 this_th->th.th_current_task = NULL; 5628 5629 // If the __kmp_thread_pool_insert_pt is already past the new insert 5630 // point, then we need to re-scan the entire list. 5631 gtid = this_th->th.th_info.ds.ds_gtid; 5632 if (__kmp_thread_pool_insert_pt != NULL) { 5633 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5634 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5635 __kmp_thread_pool_insert_pt = NULL; 5636 } 5637 } 5638 5639 // Scan down the list to find the place to insert the thread. 5640 // scan is the address of a link in the list, possibly the address of 5641 // __kmp_thread_pool itself. 5642 // 5643 // In the absence of nested parallelism, the for loop will have 0 iterations.
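// Illustrative sketch of the invariant maintained below (assuming gtids 3,
// 5 and 9 are already pooled and the freed thread has gtid 7):
//   before: __kmp_thread_pool -> T3 -> T5 -> T9 -> NULL
//   after:  __kmp_thread_pool -> T3 -> T5 -> T7 -> T9 -> NULL
// __kmp_thread_pool_insert_pt is left pointing at T7, so a later insert of
// a larger gtid can resume scanning from there instead of from the head.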
5644 if (__kmp_thread_pool_insert_pt != NULL) { 5645 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5646 } else { 5647 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5648 } 5649 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5650 scan = &((*scan)->th.th_next_pool)) 5651 ; 5652 5653 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5654 // to its address. 5655 TCW_PTR(this_th->th.th_next_pool, *scan); 5656 __kmp_thread_pool_insert_pt = *scan = this_th; 5657 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5658 (this_th->th.th_info.ds.ds_gtid < 5659 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5660 TCW_4(this_th->th.th_in_pool, TRUE); 5661 __kmp_suspend_initialize_thread(this_th); 5662 __kmp_lock_suspend_mx(this_th); 5663 if (this_th->th.th_active == TRUE) { 5664 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5665 this_th->th.th_active_in_pool = TRUE; 5666 } 5667 #if KMP_DEBUG 5668 else { 5669 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5670 } 5671 #endif 5672 __kmp_unlock_suspend_mx(this_th); 5673 5674 TCW_4(__kmp_nth, __kmp_nth - 1); 5675 5676 #ifdef KMP_ADJUST_BLOCKTIME 5677 /* Adjust blocktime back to user setting or default if necessary */ 5678 /* Middle initialization might never have occurred */ 5679 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5680 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5681 if (__kmp_nth <= __kmp_avail_proc) { 5682 __kmp_zero_bt = FALSE; 5683 } 5684 } 5685 #endif /* KMP_ADJUST_BLOCKTIME */ 5686 5687 KMP_MB(); 5688 } 5689 5690 /* ------------------------------------------------------------------------ */ 5691 5692 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5693 #if OMP_PROFILING_SUPPORT 5694 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5695 // TODO: add a configuration option for time granularity 5696 if (ProfileTraceFile) 5697 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5698 #endif 5699 5700 int gtid = this_thr->th.th_info.ds.ds_gtid; 5701 /* void *stack_data;*/ 5702 kmp_team_t **volatile pteam; 5703 5704 KMP_MB(); 5705 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5706 5707 if (__kmp_env_consistency_check) { 5708 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5709 } 5710 5711 #if OMPT_SUPPORT 5712 ompt_data_t *thread_data; 5713 if (ompt_enabled.enabled) { 5714 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5715 *thread_data = ompt_data_none; 5716 5717 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5718 this_thr->th.ompt_thread_info.wait_id = 0; 5719 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5720 this_thr->th.ompt_thread_info.parallel_flags = 0; 5721 if (ompt_enabled.ompt_callback_thread_begin) { 5722 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5723 ompt_thread_worker, thread_data); 5724 } 5725 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5726 } 5727 #endif 5728 5729 /* This is the place where threads wait for work */ 5730 while (!TCR_4(__kmp_global.g.g_done)) { 5731 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5732 KMP_MB(); 5733 5734 /* wait for work to do */ 5735 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5736 5737 /* No tid yet since not part of a team */ 5738 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5739 5740 #if OMPT_SUPPORT 5741 if (ompt_enabled.enabled) { 5742 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5743 } 5744 #endif 5745 5746 pteam = &this_thr->th.th_team; 5747 5748 /* have we been allocated? */ 5749 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5750 /* we were just woken up, so run our new task */ 5751 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5752 int rc; 5753 KA_TRACE(20, 5754 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5755 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5756 (*pteam)->t.t_pkfn)); 5757 5758 updateHWFPControl(*pteam); 5759 5760 #if OMPT_SUPPORT 5761 if (ompt_enabled.enabled) { 5762 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5763 } 5764 #endif 5765 5766 rc = (*pteam)->t.t_invoke(gtid); 5767 KMP_ASSERT(rc); 5768 5769 KMP_MB(); 5770 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5771 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5772 (*pteam)->t.t_pkfn)); 5773 } 5774 #if OMPT_SUPPORT 5775 if (ompt_enabled.enabled) { 5776 /* no frame set while outside task */ 5777 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5778 5779 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5780 } 5781 #endif 5782 /* join barrier after parallel region */ 5783 __kmp_join_barrier(gtid); 5784 } 5785 } 5786 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5787 5788 #if OMPT_SUPPORT 5789 if (ompt_enabled.ompt_callback_thread_end) { 5790 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5791 } 5792 #endif 5793 5794 this_thr->th.th_task_team = NULL; 5795 /* run the destructors for the threadprivate data for this thread */ 5796 __kmp_common_destroy_gtid(gtid); 5797 5798 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5799 KMP_MB(); 5800 5801 #if OMP_PROFILING_SUPPORT 5802 llvm::timeTraceProfilerFinishThread(); 5803 #endif 5804 return this_thr; 5805 } 5806 5807 /* ------------------------------------------------------------------------ */ 5808 5809 void __kmp_internal_end_dest(void *specific_gtid) { 5810 // Make sure no significant bits are lost 5811 int gtid; 5812 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5813 5814 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5815 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5816 * this is because 0 is reserved for the nothing-stored case */ 5817 5818 __kmp_internal_end_thread(gtid); 5819 } 5820 
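/* Illustrative note: the value kept in thread-local storage under the gtid
   key is (gtid + 1), so that 0 can be reserved for the "nothing stored"
   case; e.g. a thread with gtid 4 keeps 5 in its TLS slot, and the
   destructor above recovers 4 by subtracting 1. */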
5821 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5822 5823 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5824 __kmp_internal_end_atexit(); 5825 } 5826 5827 #endif 5828 5829 /* [Windows] josh: when the atexit handler is called, there may still be more 5830 than one thread alive */ 5831 void __kmp_internal_end_atexit(void) { 5832 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5833 /* [Windows] 5834 josh: ideally, we want to completely shut down the library in this atexit 5835 handler, but stat code that depends on thread specific data for gtid fails 5836 because that data becomes unavailable at some point during the shutdown, so 5837 we call __kmp_internal_end_thread instead. We should eventually remove the 5838 dependency on __kmp_get_specific_gtid in the stat code and use 5839 __kmp_internal_end_library to cleanly shut down the library. 5840 5841 // TODO: Can some of this comment about GVS be removed? 5842 I suspect that the offending stat code is executed when the calling thread 5843 tries to clean up a dead root thread's data structures, resulting in GVS 5844 code trying to close the GVS structures for that thread, but since the stat 5845 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5846 the calling thread is cleaning up itself instead of another thread, it gets 5847 confused. This happens because allowing a thread to unregister and cleanup 5848 another thread is a recent modification for addressing an issue. 5849 Based on the current design (20050722), a thread may end up 5850 trying to unregister another thread only if thread death does not trigger 5851 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5852 thread specific data destructor function to detect thread death. For 5853 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5854 is nothing. Thus, the workaround is applicable only for Windows static 5855 stat library. */ 5856 __kmp_internal_end_library(-1); 5857 #if KMP_OS_WINDOWS 5858 __kmp_close_console(); 5859 #endif 5860 } 5861 5862 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5863 // It is assumed __kmp_forkjoin_lock is acquired. 5864 5865 int gtid; 5866 5867 KMP_DEBUG_ASSERT(thread != NULL); 5868 5869 gtid = thread->th.th_info.ds.ds_gtid; 5870 5871 if (!is_root) { 5872 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5873 /* Assume the threads are at the fork barrier here */ 5874 KA_TRACE( 5875 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5876 gtid)); 5877 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5878 * (GEH) */ 5879 ANNOTATE_HAPPENS_BEFORE(thread); 5880 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5881 thread); 5882 __kmp_release_64(&flag); 5883 } 5884 5885 // Terminate OS thread. 5886 __kmp_reap_worker(thread); 5887 5888 // The thread was killed asynchronously. If it was actively 5889 // spinning in the thread pool, decrement the global count. 5890 // 5891 // There is a small timing hole here - if the worker thread was just waking 5892 // up after sleeping in the pool, had reset its th_active_in_pool flag but 5893 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5894 // the global counter might not get updated. 5895 // 5896 // Currently, this can only happen as the library is unloaded, 5897 // so there are no harmful side effects.
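// Keep the global count of actively spinning pooled threads consistent with
// this thread's th_active_in_pool flag; the flag and the counter were set
// together in __kmp_free_thread() when the thread entered the pool.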
5898 if (thread->th.th_active_in_pool) { 5899 thread->th.th_active_in_pool = FALSE; 5900 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5901 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5902 } 5903 } 5904 5905 __kmp_free_implicit_task(thread); 5906 5907 // Free the fast memory for tasking 5908 #if USE_FAST_MEMORY 5909 __kmp_free_fast_memory(thread); 5910 #endif /* USE_FAST_MEMORY */ 5911 5912 __kmp_suspend_uninitialize_thread(thread); 5913 5914 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5915 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5916 5917 --__kmp_all_nth; 5918 // __kmp_nth was decremented when thread is added to the pool. 5919 5920 #ifdef KMP_ADJUST_BLOCKTIME 5921 /* Adjust blocktime back to user setting or default if necessary */ 5922 /* Middle initialization might never have occurred */ 5923 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5924 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5925 if (__kmp_nth <= __kmp_avail_proc) { 5926 __kmp_zero_bt = FALSE; 5927 } 5928 } 5929 #endif /* KMP_ADJUST_BLOCKTIME */ 5930 5931 /* free the memory being used */ 5932 if (__kmp_env_consistency_check) { 5933 if (thread->th.th_cons) { 5934 __kmp_free_cons_stack(thread->th.th_cons); 5935 thread->th.th_cons = NULL; 5936 } 5937 } 5938 5939 if (thread->th.th_pri_common != NULL) { 5940 __kmp_free(thread->th.th_pri_common); 5941 thread->th.th_pri_common = NULL; 5942 } 5943 5944 if (thread->th.th_task_state_memo_stack != NULL) { 5945 __kmp_free(thread->th.th_task_state_memo_stack); 5946 thread->th.th_task_state_memo_stack = NULL; 5947 } 5948 5949 #if KMP_USE_BGET 5950 if (thread->th.th_local.bget_data != NULL) { 5951 __kmp_finalize_bget(thread); 5952 } 5953 #endif 5954 5955 #if KMP_AFFINITY_SUPPORTED 5956 if (thread->th.th_affin_mask != NULL) { 5957 KMP_CPU_FREE(thread->th.th_affin_mask); 5958 thread->th.th_affin_mask = NULL; 5959 } 5960 #endif /* KMP_AFFINITY_SUPPORTED */ 5961 5962 #if KMP_USE_HIER_SCHED 5963 if (thread->th.th_hier_bar_data != NULL) { 5964 __kmp_free(thread->th.th_hier_bar_data); 5965 thread->th.th_hier_bar_data = NULL; 5966 } 5967 #endif 5968 5969 __kmp_reap_team(thread->th.th_serial_team); 5970 thread->th.th_serial_team = NULL; 5971 __kmp_free(thread); 5972 5973 KMP_MB(); 5974 5975 } // __kmp_reap_thread 5976 5977 static void __kmp_internal_end(void) { 5978 int i; 5979 5980 /* First, unregister the library */ 5981 __kmp_unregister_library(); 5982 5983 #if KMP_OS_WINDOWS 5984 /* In Win static library, we can't tell when a root actually dies, so we 5985 reclaim the data structures for any root threads that have died but not 5986 unregistered themselves, in order to shut down cleanly. 5987 In Win dynamic library we also can't tell when a thread dies. */ 5988 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 5989 // dead roots 5990 #endif 5991 5992 for (i = 0; i < __kmp_threads_capacity; i++) 5993 if (__kmp_root[i]) 5994 if (__kmp_root[i]->r.r_active) 5995 break; 5996 KMP_MB(); /* Flush all pending memory write invalidates. */ 5997 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 5998 5999 if (i < __kmp_threads_capacity) { 6000 #if KMP_USE_MONITOR 6001 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6002 KMP_MB(); /* Flush all pending memory write invalidates. */ 6003 6004 // Need to check that monitor was initialized before reaping it. 
If we are 6005 // called from __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6006 // __kmp_monitor will appear to contain valid data, but it is only valid in 6007 // the parent process, not the child. 6008 // New behavior (201008): instead of keying off of the flag 6009 // __kmp_init_parallel, the monitor thread creation is keyed off 6010 // of the new flag __kmp_init_monitor. 6011 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6012 if (TCR_4(__kmp_init_monitor)) { 6013 __kmp_reap_monitor(&__kmp_monitor); 6014 TCW_4(__kmp_init_monitor, 0); 6015 } 6016 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6017 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6018 #endif // KMP_USE_MONITOR 6019 } else { 6020 /* TODO move this to cleanup code */ 6021 #ifdef KMP_DEBUG 6022 /* make sure that everything has properly ended */ 6023 for (i = 0; i < __kmp_threads_capacity; i++) { 6024 if (__kmp_root[i]) { 6025 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6026 // there can be uber threads alive here 6027 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6028 } 6029 } 6030 #endif 6031 6032 KMP_MB(); 6033 6034 // Reap the worker threads. 6035 // This is valid for now, but be careful if threads are reaped sooner. 6036 while (__kmp_thread_pool != NULL) { // Loop thru all the threads in the pool. 6037 // Get the next thread from the pool. 6038 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6039 __kmp_thread_pool = thread->th.th_next_pool; 6040 // Reap it. 6041 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6042 thread->th.th_next_pool = NULL; 6043 thread->th.th_in_pool = FALSE; 6044 __kmp_reap_thread(thread, 0); 6045 } 6046 __kmp_thread_pool_insert_pt = NULL; 6047 6048 // Reap teams. 6049 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6050 // Get the next team from the pool. 6051 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6052 __kmp_team_pool = team->t.t_next_pool; 6053 // Reap it. 6054 team->t.t_next_pool = NULL; 6055 __kmp_reap_team(team); 6056 } 6057 6058 __kmp_reap_task_teams(); 6059 6060 #if KMP_OS_UNIX 6061 // Threads that are not reaped should not access any resources since they 6062 // are going to be deallocated soon, so the shutdown sequence should wait 6063 // until all threads either exit the final spin-waiting loop or begin 6064 // sleeping after the given blocktime. 6065 for (i = 0; i < __kmp_threads_capacity; i++) { 6066 kmp_info_t *thr = __kmp_threads[i]; 6067 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6068 KMP_CPU_PAUSE(); 6069 } 6070 #endif 6071 6072 for (i = 0; i < __kmp_threads_capacity; ++i) { 6073 // TBD: Add some checking... 6074 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6075 } 6076 6077 /* Make sure all threadprivate destructors get run by joining with all 6078 worker threads before resetting this flag */ 6079 TCW_SYNC_4(__kmp_init_common, FALSE); 6080 6081 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6082 KMP_MB(); 6083 6084 #if KMP_USE_MONITOR 6085 // See note above: One of the possible fixes for CQ138434 / CQ140126 6086 // 6087 // FIXME: push both code fragments down and CSE them? 6088 // push them into __kmp_cleanup() ?
6089 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6090 if (TCR_4(__kmp_init_monitor)) { 6091 __kmp_reap_monitor(&__kmp_monitor); 6092 TCW_4(__kmp_init_monitor, 0); 6093 } 6094 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6095 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6096 #endif 6097 } /* else !__kmp_global.t_active */ 6098 TCW_4(__kmp_init_gtid, FALSE); 6099 KMP_MB(); /* Flush all pending memory write invalidates. */ 6100 6101 __kmp_cleanup(); 6102 #if OMPT_SUPPORT 6103 ompt_fini(); 6104 #endif 6105 } 6106 6107 void __kmp_internal_end_library(int gtid_req) { 6108 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6109 /* this shouldn't be a race condition because __kmp_internal_end() is the 6110 only place to clear __kmp_serial_init */ 6111 /* we'll check this later too, after we get the lock */ 6112 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6113 // redundant, because the next check will work in any case. 6114 if (__kmp_global.g.g_abort) { 6115 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6116 /* TODO abort? */ 6117 return; 6118 } 6119 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6120 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6121 return; 6122 } 6123 6124 KMP_MB(); /* Flush all pending memory write invalidates. */ 6125 /* find out who we are and what we should do */ 6126 { 6127 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6128 KA_TRACE( 6129 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6130 if (gtid == KMP_GTID_SHUTDOWN) { 6131 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6132 "already shutdown\n")); 6133 return; 6134 } else if (gtid == KMP_GTID_MONITOR) { 6135 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6136 "registered, or system shutdown\n")); 6137 return; 6138 } else if (gtid == KMP_GTID_DNE) { 6139 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6140 "shutdown\n")); 6141 /* we don't know who we are, but we may still shutdown the library */ 6142 } else if (KMP_UBER_GTID(gtid)) { 6143 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6144 if (__kmp_root[gtid]->r.r_active) { 6145 __kmp_global.g.g_abort = -1; 6146 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6147 __kmp_unregister_library(); 6148 KA_TRACE(10, 6149 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6150 gtid)); 6151 return; 6152 } else { 6153 KA_TRACE( 6154 10, 6155 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6156 __kmp_unregister_root_current_thread(gtid); 6157 } 6158 } else { 6159 /* worker threads may call this function through the atexit handler, if they 6160 * call exit() */ 6161 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6162 TODO: do a thorough shutdown instead */ 6163 #ifdef DUMP_DEBUG_ON_EXIT 6164 if (__kmp_debug_buf) 6165 __kmp_dump_debug_buffer(); 6166 #endif 6167 // added unregister library call here when we switch to shm linux 6168 // if we don't, it will leave lots of files in /dev/shm 6169 // cleanup shared memory file before exiting. 
6170 __kmp_unregister_library(); 6171 return; 6172 } 6173 } 6174 /* synchronize the termination process */ 6175 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6176 6177 /* have we already finished */ 6178 if (__kmp_global.g.g_abort) { 6179 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6180 /* TODO abort? */ 6181 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6182 return; 6183 } 6184 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6185 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6186 return; 6187 } 6188 6189 /* We need this lock to enforce mutex between this reading of 6190 __kmp_threads_capacity and the writing by __kmp_register_root. 6191 Alternatively, we can use a counter of roots that is atomically updated by 6192 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6193 __kmp_internal_end_*. */ 6194 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6195 6196 /* now we can safely conduct the actual termination */ 6197 __kmp_internal_end(); 6198 6199 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6200 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6201 6202 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6203 6204 #ifdef DUMP_DEBUG_ON_EXIT 6205 if (__kmp_debug_buf) 6206 __kmp_dump_debug_buffer(); 6207 #endif 6208 6209 #if KMP_OS_WINDOWS 6210 __kmp_close_console(); 6211 #endif 6212 6213 __kmp_fini_allocator(); 6214 6215 } // __kmp_internal_end_library 6216 6217 void __kmp_internal_end_thread(int gtid_req) { 6218 int i; 6219 6220 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6221 /* this shouldn't be a race condition because __kmp_internal_end() is the 6222 * only place to clear __kmp_serial_init */ 6223 /* we'll check this later too, after we get the lock */ 6224 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6225 // redundant, because the next check will work in any case. 6226 if (__kmp_global.g.g_abort) { 6227 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6228 /* TODO abort? */ 6229 return; 6230 } 6231 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6232 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6233 return; 6234 } 6235 6236 // If hidden helper team has been initialized, we need to deinit it 6237 if (TCR_4(__kmp_init_hidden_helper)) { 6238 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6239 // First release the main thread to let it continue its work 6240 __kmp_hidden_helper_main_thread_release(); 6241 // Wait until the hidden helper team has been destroyed 6242 __kmp_hidden_helper_threads_deinitz_wait(); 6243 } 6244 6245 KMP_MB(); /* Flush all pending memory write invalidates. */ 6246 6247 /* find out who we are and what we should do */ 6248 { 6249 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6250 KA_TRACE(10, 6251 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6252 if (gtid == KMP_GTID_SHUTDOWN) { 6253 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6254 "already shutdown\n")); 6255 return; 6256 } else if (gtid == KMP_GTID_MONITOR) { 6257 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6258 "registered, or system shutdown\n")); 6259 return; 6260 } else if (gtid == KMP_GTID_DNE) { 6261 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6262 "shutdown\n")); 6263 return; 6264 /* we don't know who we are */ 6265 } else if (KMP_UBER_GTID(gtid)) { 6266 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6267 if (__kmp_root[gtid]->r.r_active) { 6268 __kmp_global.g.g_abort = -1; 6269 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6270 KA_TRACE(10, 6271 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6272 gtid)); 6273 return; 6274 } else { 6275 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6276 gtid)); 6277 __kmp_unregister_root_current_thread(gtid); 6278 } 6279 } else { 6280 /* just a worker thread, let's leave */ 6281 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6282 6283 if (gtid >= 0) { 6284 __kmp_threads[gtid]->th.th_task_team = NULL; 6285 } 6286 6287 KA_TRACE(10, 6288 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6289 gtid)); 6290 return; 6291 } 6292 } 6293 #if KMP_DYNAMIC_LIB 6294 if (__kmp_pause_status != kmp_hard_paused) 6295 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6296 // because we will better shutdown later in the library destructor. 6297 { 6298 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6299 return; 6300 } 6301 #endif 6302 /* synchronize the termination process */ 6303 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6304 6305 /* have we already finished */ 6306 if (__kmp_global.g.g_abort) { 6307 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6308 /* TODO abort? */ 6309 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6310 return; 6311 } 6312 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6313 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6314 return; 6315 } 6316 6317 /* We need this lock to enforce mutex between this reading of 6318 __kmp_threads_capacity and the writing by __kmp_register_root. 6319 Alternatively, we can use a counter of roots that is atomically updated by 6320 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6321 __kmp_internal_end_*. */ 6322 6323 /* should we finish the run-time? are all siblings done? */ 6324 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6325 6326 for (i = 0; i < __kmp_threads_capacity; ++i) { 6327 if (KMP_UBER_GTID(i)) { 6328 KA_TRACE( 6329 10, 6330 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6331 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6332 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6333 return; 6334 } 6335 } 6336 6337 /* now we can safely conduct the actual termination */ 6338 6339 __kmp_internal_end(); 6340 6341 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6342 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6343 6344 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6345 6346 #ifdef DUMP_DEBUG_ON_EXIT 6347 if (__kmp_debug_buf) 6348 __kmp_dump_debug_buffer(); 6349 #endif 6350 } // __kmp_internal_end_thread 6351 6352 // ----------------------------------------------------------------------------- 6353 // Library registration stuff. 6354 6355 static long __kmp_registration_flag = 0; 6356 // Random value used to indicate library initialization. 6357 static char *__kmp_registration_str = NULL; 6358 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6359 6360 static inline char *__kmp_reg_status_name() { 6361 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6362 each thread. If registration and unregistration go in different threads 6363 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6364 env var can not be found, because the name will contain different pid. 
*/ 6365 // macOS* complains about name being too long with additional getuid() 6366 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6367 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6368 (int)getuid()); 6369 #else 6370 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6371 #endif 6372 } // __kmp_reg_status_name 6373 6374 void __kmp_register_library_startup(void) { 6375 6376 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6377 int done = 0; 6378 union { 6379 double dtime; 6380 long ltime; 6381 } time; 6382 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6383 __kmp_initialize_system_tick(); 6384 #endif 6385 __kmp_read_system_time(&time.dtime); 6386 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6387 __kmp_registration_str = 6388 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6389 __kmp_registration_flag, KMP_LIBRARY_FILE); 6390 6391 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6392 __kmp_registration_str)); 6393 6394 while (!done) { 6395 6396 char *value = NULL; // Actual value of the environment variable. 6397 6398 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6399 char *shm_name = __kmp_str_format("/%s", name); 6400 int shm_preexist = 0; 6401 char *data1; 6402 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6403 if ((fd1 == -1) && (errno == EEXIST)) { 6404 // file didn't open because it already exists. 6405 // try opening existing file 6406 fd1 = shm_open(shm_name, O_RDWR, 0666); 6407 if (fd1 == -1) { // file didn't open 6408 // error out here 6409 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6410 __kmp_msg_null); 6411 } else { 6412 // able to open existing file 6413 shm_preexist = 1; 6414 } 6415 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6416 // already exists. 6417 // error out here. 6418 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6419 __kmp_msg_null); 6420 } 6421 if (shm_preexist == 0) { 6422 // we created SHM now set size 6423 if (ftruncate(fd1, SHM_SIZE) == -1) { 6424 // error occurred setting size; 6425 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6426 KMP_ERR(errno), __kmp_msg_null); 6427 } 6428 } 6429 data1 = 6430 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6431 if (data1 == MAP_FAILED) { 6432 // failed to map shared memory 6433 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6434 __kmp_msg_null); 6435 } 6436 if (shm_preexist == 0) { // set data to SHM, set value 6437 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6438 } 6439 // Read value from either what we just wrote or existing file. 6440 value = __kmp_str_format("%s", data1); // read value from SHM 6441 munmap(data1, SHM_SIZE); 6442 close(fd1); 6443 #else // Windows and unix with static library 6444 // Set environment variable, but do not overwrite it if it already exists. 6445 __kmp_env_set(name, __kmp_registration_str, 0); 6446 // read value to see if it got set 6447 value = __kmp_env_get(name); 6448 #endif 6449 6450 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6451 done = 1; // Ok, environment variable set successfully, exit the loop. 6452 } else { 6453 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6454 // Check whether it is alive or dead. 6455 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
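// The value read back above has the same "%p-%lx-%s" layout that
// __kmp_register_library_startup writes, e.g. (illustrative values only)
// "0x7f2a1c000000-cafe0042-libomp.so": the address of
// __kmp_registration_flag, its value, and the library file name. The splits
// below recover those three fields.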
6456 char *tail = value; 6457 char *flag_addr_str = NULL; 6458 char *flag_val_str = NULL; 6459 char const *file_name = NULL; 6460 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6461 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6462 file_name = tail; 6463 if (tail != NULL) { 6464 long *flag_addr = 0; 6465 long flag_val = 0; 6466 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6467 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6468 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6469 // First, check whether environment-encoded address is mapped into 6470 // addr space. 6471 // If so, dereference it to see if it still has the right value. 6472 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6473 neighbor = 1; 6474 } else { 6475 // If not, then we know the other copy of the library is no longer 6476 // running. 6477 neighbor = 2; 6478 } 6479 } 6480 } 6481 switch (neighbor) { 6482 case 0: // Cannot parse environment variable -- neighbor status unknown. 6483 // Assume it is the incompatible format of future version of the 6484 // library. Assume the other library is alive. 6485 // WARN( ... ); // TODO: Issue a warning. 6486 file_name = "unknown library"; 6487 KMP_FALLTHROUGH(); 6488 // Attention! Falling to the next case. That's intentional. 6489 case 1: { // Neighbor is alive. 6490 // Check it is allowed. 6491 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6492 if (!__kmp_str_match_true(duplicate_ok)) { 6493 // That's not allowed. Issue fatal error. 6494 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6495 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6496 } 6497 KMP_INTERNAL_FREE(duplicate_ok); 6498 __kmp_duplicate_library_ok = 1; 6499 done = 1; // Exit the loop. 6500 } break; 6501 case 2: { // Neighbor is dead. 6502 6503 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6504 // close shared memory. 6505 shm_unlink(shm_name); // this removes file in /dev/shm 6506 #else 6507 // Clear the variable and try to register library again. 6508 __kmp_env_unset(name); 6509 #endif 6510 } break; 6511 default: { 6512 KMP_DEBUG_ASSERT(0); 6513 } break; 6514 } 6515 } 6516 KMP_INTERNAL_FREE((void *)value); 6517 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6518 KMP_INTERNAL_FREE((void *)shm_name); 6519 #endif 6520 } // while 6521 KMP_INTERNAL_FREE((void *)name); 6522 6523 } // func __kmp_register_library_startup 6524 6525 void __kmp_unregister_library(void) { 6526 6527 char *name = __kmp_reg_status_name(); 6528 char *value = NULL; 6529 6530 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6531 char *shm_name = __kmp_str_format("/%s", name); 6532 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6533 if (fd1 == -1) { 6534 // file did not open. return. 6535 return; 6536 } 6537 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6538 if (data1 != MAP_FAILED) { 6539 value = __kmp_str_format("%s", data1); // read value from SHM 6540 munmap(data1, SHM_SIZE); 6541 } 6542 close(fd1); 6543 #else 6544 value = __kmp_env_get(name); 6545 #endif 6546 6547 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6548 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6549 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6550 // Ok, this is our variable. Delete it. 
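// Only tear the registration down if the stored value is ours; if another
// copy of the runtime has re-registered under the same name since we started,
// its entry (or shared-memory segment) must be left untouched.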
6551 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6552 shm_unlink(shm_name); // this removes file in /dev/shm 6553 #else 6554 __kmp_env_unset(name); 6555 #endif 6556 } 6557 6558 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6559 KMP_INTERNAL_FREE(shm_name); 6560 #endif 6561 6562 KMP_INTERNAL_FREE(__kmp_registration_str); 6563 KMP_INTERNAL_FREE(value); 6564 KMP_INTERNAL_FREE(name); 6565 6566 __kmp_registration_flag = 0; 6567 __kmp_registration_str = NULL; 6568 6569 } // __kmp_unregister_library 6570 6571 // End of Library registration stuff. 6572 // ----------------------------------------------------------------------------- 6573 6574 #if KMP_MIC_SUPPORTED 6575 6576 static void __kmp_check_mic_type() { 6577 kmp_cpuid_t cpuid_state = {0}; 6578 kmp_cpuid_t *cs_p = &cpuid_state; 6579 __kmp_x86_cpuid(1, 0, cs_p); 6580 // We don't support mic1 at the moment 6581 if ((cs_p->eax & 0xff0) == 0xB10) { 6582 __kmp_mic_type = mic2; 6583 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6584 __kmp_mic_type = mic3; 6585 } else { 6586 __kmp_mic_type = non_mic; 6587 } 6588 } 6589 6590 #endif /* KMP_MIC_SUPPORTED */ 6591 6592 #if KMP_HAVE_UMWAIT 6593 static void __kmp_user_level_mwait_init() { 6594 struct kmp_cpuid buf; 6595 __kmp_x86_cpuid(7, 0, &buf); 6596 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6597 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6598 __kmp_umwait_enabled)); 6599 } 6600 #elif KMP_HAVE_MWAIT 6601 #ifndef AT_INTELPHIUSERMWAIT 6602 // Spurious, non-existent value that should always fail to return anything. 6603 // Will be replaced with the correct value when we know that. 6604 #define AT_INTELPHIUSERMWAIT 10000 6605 #endif 6606 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6607 // earlier OS is used to build the RTL, we'll use the following internal 6608 // function when the entry is not found. 6609 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6610 unsigned long getauxval(unsigned long) { return 0; } 6611 6612 static void __kmp_user_level_mwait_init() { 6613 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6614 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6615 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6616 // KMP_USER_LEVEL_MWAIT was set to TRUE. 
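// Note: on systems where the real getauxval() is not available, the weak stub
// defined above always returns 0, so (res & 0x1) is never set and the decision
// falls through to the KMP_USER_LEVEL_MWAIT override checked below.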
6617 if (__kmp_mic_type == mic3) { 6618 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6619 if ((res & 0x1) || __kmp_user_level_mwait) { 6620 __kmp_mwait_enabled = TRUE; 6621 if (__kmp_user_level_mwait) { 6622 KMP_INFORM(EnvMwaitWarn); 6623 } 6624 } else { 6625 __kmp_mwait_enabled = FALSE; 6626 } 6627 } 6628 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6629 "__kmp_mwait_enabled = %d\n", 6630 __kmp_mic_type, __kmp_mwait_enabled)); 6631 } 6632 #endif /* KMP_HAVE_UMWAIT */ 6633 6634 static void __kmp_do_serial_initialize(void) { 6635 int i, gtid; 6636 size_t size; 6637 6638 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6639 6640 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6641 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6642 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6643 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6644 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6645 6646 #if OMPT_SUPPORT 6647 ompt_pre_init(); 6648 #endif 6649 6650 __kmp_validate_locks(); 6651 6652 /* Initialize internal memory allocator */ 6653 __kmp_init_allocator(); 6654 6655 /* Register the library startup via an environment variable and check to see 6656 whether another copy of the library is already registered. */ 6657 6658 __kmp_register_library_startup(); 6659 6660 /* TODO reinitialization of library */ 6661 if (TCR_4(__kmp_global.g.g_done)) { 6662 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6663 } 6664 6665 __kmp_global.g.g_abort = 0; 6666 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6667 6668 /* initialize the locks */ 6669 #if KMP_USE_ADAPTIVE_LOCKS 6670 #if KMP_DEBUG_ADAPTIVE_LOCKS 6671 __kmp_init_speculative_stats(); 6672 #endif 6673 #endif 6674 #if KMP_STATS_ENABLED 6675 __kmp_stats_init(); 6676 #endif 6677 __kmp_init_lock(&__kmp_global_lock); 6678 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6679 __kmp_init_lock(&__kmp_debug_lock); 6680 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6681 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6682 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6683 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6684 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6685 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6686 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6687 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6688 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6689 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6690 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6691 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6692 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6693 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6694 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6695 #if KMP_USE_MONITOR 6696 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6697 #endif 6698 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6699 6700 /* conduct initialization and initial setup of configuration */ 6701 6702 __kmp_runtime_initialize(); 6703 6704 #if KMP_MIC_SUPPORTED 6705 __kmp_check_mic_type(); 6706 #endif 6707 6708 // Some global variable initialization moved here from kmp_env_initialize() 6709 #ifdef KMP_DEBUG 6710 kmp_diag = 0; 6711 #endif 6712 __kmp_abort_delay = 0; 6713 6714 // From __kmp_init_dflt_team_nth() 6715 /* assume the entire machine will be used */ 6716 __kmp_dflt_team_nth_ub = __kmp_xproc; 6717 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6718 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6719 } 6720 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6721 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6722 } 6723 
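// Net effect of the clamping above: __kmp_dflt_team_nth_ub ==
// min(max(__kmp_xproc, KMP_MIN_NTH), __kmp_sys_max_nth), i.e. "the whole
// machine", bounded to the supported thread range.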
__kmp_max_nth = __kmp_sys_max_nth; 6724 __kmp_cg_max_nth = __kmp_sys_max_nth; 6725 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6726 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6727 __kmp_teams_max_nth = __kmp_sys_max_nth; 6728 } 6729 6730 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6731 // part 6732 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6733 #if KMP_USE_MONITOR 6734 __kmp_monitor_wakeups = 6735 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6736 __kmp_bt_intervals = 6737 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6738 #endif 6739 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6740 __kmp_library = library_throughput; 6741 // From KMP_SCHEDULE initialization 6742 __kmp_static = kmp_sch_static_balanced; 6743 // AC: do not use analytical here, because it is non-monotonous 6744 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6745 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6746 // need to repeat assignment 6747 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6748 // bit control and barrier method control parts 6749 #if KMP_FAST_REDUCTION_BARRIER 6750 #define kmp_reduction_barrier_gather_bb ((int)1) 6751 #define kmp_reduction_barrier_release_bb ((int)1) 6752 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6753 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6754 #endif // KMP_FAST_REDUCTION_BARRIER 6755 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6756 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6757 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6758 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6759 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6760 #if KMP_FAST_REDUCTION_BARRIER 6761 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6762 // lin_64 ): hyper,1 6763 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6764 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6765 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6766 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6767 } 6768 #endif // KMP_FAST_REDUCTION_BARRIER 6769 } 6770 #if KMP_FAST_REDUCTION_BARRIER 6771 #undef kmp_reduction_barrier_release_pat 6772 #undef kmp_reduction_barrier_gather_pat 6773 #undef kmp_reduction_barrier_release_bb 6774 #undef kmp_reduction_barrier_gather_bb 6775 #endif // KMP_FAST_REDUCTION_BARRIER 6776 #if KMP_MIC_SUPPORTED 6777 if (__kmp_mic_type == mic2) { // KNC 6778 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6779 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6780 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6781 1; // forkjoin release 6782 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6783 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6784 } 6785 #if KMP_FAST_REDUCTION_BARRIER 6786 if (__kmp_mic_type == mic2) { // KNC 6787 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6788 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6789 } 6790 #endif // KMP_FAST_REDUCTION_BARRIER 6791 #endif // KMP_MIC_SUPPORTED 6792 6793 // From KMP_CHECKS initialization 6794 #ifdef KMP_DEBUG 6795 __kmp_env_checks = TRUE; /* development versions have the 
extra checks */ 6796 #else 6797 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6798 #endif 6799 6800 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6801 __kmp_foreign_tp = TRUE; 6802 6803 __kmp_global.g.g_dynamic = FALSE; 6804 __kmp_global.g.g_dynamic_mode = dynamic_default; 6805 6806 __kmp_env_initialize(NULL); 6807 6808 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6809 __kmp_user_level_mwait_init(); 6810 #endif 6811 // Print all messages in message catalog for testing purposes. 6812 #ifdef KMP_DEBUG 6813 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6814 if (__kmp_str_match_true(val)) { 6815 kmp_str_buf_t buffer; 6816 __kmp_str_buf_init(&buffer); 6817 __kmp_i18n_dump_catalog(&buffer); 6818 __kmp_printf("%s", buffer.str); 6819 __kmp_str_buf_free(&buffer); 6820 } 6821 __kmp_env_free(&val); 6822 #endif 6823 6824 __kmp_threads_capacity = 6825 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6826 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6827 __kmp_tp_capacity = __kmp_default_tp_capacity( 6828 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6829 6830 // If the library is shut down properly, both pools must be NULL. Just in 6831 // case, set them to NULL -- some memory may leak, but subsequent code will 6832 // work even if pools are not freed. 6833 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6834 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6835 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6836 __kmp_thread_pool = NULL; 6837 __kmp_thread_pool_insert_pt = NULL; 6838 __kmp_team_pool = NULL; 6839 6840 /* Allocate all of the variable sized records */ 6841 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6842 * expandable */ 6843 /* Since allocation is cache-aligned, just add extra padding at the end */ 6844 size = 6845 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6846 CACHE_LINE; 6847 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6848 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6849 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6850 6851 /* init thread counts */ 6852 KMP_DEBUG_ASSERT(__kmp_all_nth == 6853 0); // Asserts fail if the library is reinitializing and 6854 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6855 __kmp_all_nth = 0; 6856 __kmp_nth = 0; 6857 6858 /* setup the uber master thread and hierarchy */ 6859 gtid = __kmp_register_root(TRUE); 6860 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6861 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6862 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6863 6864 KMP_MB(); /* Flush all pending memory write invalidates. */ 6865 6866 __kmp_common_initialize(); 6867 6868 #if KMP_OS_UNIX 6869 /* invoke the child fork handler */ 6870 __kmp_register_atfork(); 6871 #endif 6872 6873 #if !KMP_DYNAMIC_LIB 6874 { 6875 /* Invoke the exit handler when the program finishes, only for static 6876 library. For dynamic library, we already have _fini and DllMain. */ 6877 int rc = atexit(__kmp_internal_end_atexit); 6878 if (rc != 0) { 6879 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6880 __kmp_msg_null); 6881 } 6882 } 6883 #endif 6884 6885 #if KMP_HANDLE_SIGNALS 6886 #if KMP_OS_UNIX 6887 /* NOTE: make sure that this is called before the user installs their own 6888 signal handlers so that the user handlers are called first. this way they 6889 can return false, not call our handler, avoid terminating the library, and 6890 continue execution where they left off. 
*/ 6891 __kmp_install_signals(FALSE); 6892 #endif /* KMP_OS_UNIX */ 6893 #if KMP_OS_WINDOWS 6894 __kmp_install_signals(TRUE); 6895 #endif /* KMP_OS_WINDOWS */ 6896 #endif 6897 6898 /* we have finished the serial initialization */ 6899 __kmp_init_counter++; 6900 6901 __kmp_init_serial = TRUE; 6902 6903 if (__kmp_settings) { 6904 __kmp_env_print(); 6905 } 6906 6907 if (__kmp_display_env || __kmp_display_env_verbose) { 6908 __kmp_env_print_2(); 6909 } 6910 6911 #if OMPT_SUPPORT 6912 ompt_post_init(); 6913 #endif 6914 6915 KMP_MB(); 6916 6917 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6918 } 6919 6920 void __kmp_serial_initialize(void) { 6921 if (__kmp_init_serial) { 6922 return; 6923 } 6924 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6925 if (__kmp_init_serial) { 6926 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6927 return; 6928 } 6929 __kmp_do_serial_initialize(); 6930 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6931 } 6932 6933 static void __kmp_do_middle_initialize(void) { 6934 int i, j; 6935 int prev_dflt_team_nth; 6936 6937 if (!__kmp_init_serial) { 6938 __kmp_do_serial_initialize(); 6939 } 6940 6941 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6942 6943 // Save the previous value for the __kmp_dflt_team_nth so that 6944 // we can avoid some reinitialization if it hasn't changed. 6945 prev_dflt_team_nth = __kmp_dflt_team_nth; 6946 6947 #if KMP_AFFINITY_SUPPORTED 6948 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6949 // number of cores on the machine. 6950 __kmp_affinity_initialize(); 6951 6952 // Run through the __kmp_threads array and set the affinity mask 6953 // for each root thread that is currently registered with the RTL. 6954 for (i = 0; i < __kmp_threads_capacity; i++) { 6955 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6956 __kmp_affinity_set_init_mask(i, TRUE); 6957 } 6958 } 6959 #endif /* KMP_AFFINITY_SUPPORTED */ 6960 6961 KMP_ASSERT(__kmp_xproc > 0); 6962 if (__kmp_avail_proc == 0) { 6963 __kmp_avail_proc = __kmp_xproc; 6964 } 6965 6966 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6967 // correct them now 6968 j = 0; 6969 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6970 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6971 __kmp_avail_proc; 6972 j++; 6973 } 6974 6975 if (__kmp_dflt_team_nth == 0) { 6976 #ifdef KMP_DFLT_NTH_CORES 6977 // Default #threads = #cores 6978 __kmp_dflt_team_nth = __kmp_ncores; 6979 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6980 "__kmp_ncores (%d)\n", 6981 __kmp_dflt_team_nth)); 6982 #else 6983 // Default #threads = #available OS procs 6984 __kmp_dflt_team_nth = __kmp_avail_proc; 6985 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6986 "__kmp_avail_proc(%d)\n", 6987 __kmp_dflt_team_nth)); 6988 #endif /* KMP_DFLT_NTH_CORES */ 6989 } 6990 6991 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 6992 __kmp_dflt_team_nth = KMP_MIN_NTH; 6993 } 6994 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 6995 __kmp_dflt_team_nth = __kmp_sys_max_nth; 6996 } 6997 6998 // There's no harm in continuing if the following check fails, 6999 // but it indicates an error in the previous logic. 
7000 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7001 7002 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7003 // Run through the __kmp_threads array and set the num threads icv for each 7004 // root thread that is currently registered with the RTL (which has not 7005 // already explicitly set its nthreads-var with a call to 7006 // omp_set_num_threads()). 7007 for (i = 0; i < __kmp_threads_capacity; i++) { 7008 kmp_info_t *thread = __kmp_threads[i]; 7009 if (thread == NULL) 7010 continue; 7011 if (thread->th.th_current_task->td_icvs.nproc != 0) 7012 continue; 7013 7014 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7015 } 7016 } 7017 KA_TRACE( 7018 20, 7019 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7020 __kmp_dflt_team_nth)); 7021 7022 #ifdef KMP_ADJUST_BLOCKTIME 7023 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7024 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7025 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7026 if (__kmp_nth > __kmp_avail_proc) { 7027 __kmp_zero_bt = TRUE; 7028 } 7029 } 7030 #endif /* KMP_ADJUST_BLOCKTIME */ 7031 7032 /* we have finished middle initialization */ 7033 TCW_SYNC_4(__kmp_init_middle, TRUE); 7034 7035 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7036 } 7037 7038 void __kmp_middle_initialize(void) { 7039 if (__kmp_init_middle) { 7040 return; 7041 } 7042 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7043 if (__kmp_init_middle) { 7044 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7045 return; 7046 } 7047 __kmp_do_middle_initialize(); 7048 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7049 } 7050 7051 void __kmp_parallel_initialize(void) { 7052 int gtid = __kmp_entry_gtid(); // this might be a new root 7053 7054 /* synchronize parallel initialization (for sibling) */ 7055 if (TCR_4(__kmp_init_parallel)) 7056 return; 7057 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7058 if (TCR_4(__kmp_init_parallel)) { 7059 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7060 return; 7061 } 7062 7063 /* TODO reinitialization after we have already shut down */ 7064 if (TCR_4(__kmp_global.g.g_done)) { 7065 KA_TRACE( 7066 10, 7067 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7068 __kmp_infinite_loop(); 7069 } 7070 7071 /* jc: The lock __kmp_initz_lock is already held, so calling 7072 __kmp_serial_initialize would cause a deadlock. So we call 7073 __kmp_do_serial_initialize directly. */ 7074 if (!__kmp_init_middle) { 7075 __kmp_do_middle_initialize(); 7076 } 7077 __kmp_resume_if_hard_paused(); 7078 7079 /* begin initialization */ 7080 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7081 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7082 7083 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7084 // Save the FP control regs. 7085 // Worker threads will set theirs to these values at thread startup. 
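// The mask applied to __kmp_init_mxcsr below (KMP_X86_MXCSR_MASK) drops the
// sticky exception-status flags, so only the control (masking/rounding)
// settings of the initial thread are propagated to workers.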
7086 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7087 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7088 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7089 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7090 7091 #if KMP_OS_UNIX 7092 #if KMP_HANDLE_SIGNALS 7093 /* must be after __kmp_serial_initialize */ 7094 __kmp_install_signals(TRUE); 7095 #endif 7096 #endif 7097 7098 __kmp_suspend_initialize(); 7099 7100 #if defined(USE_LOAD_BALANCE) 7101 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7102 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7103 } 7104 #else 7105 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7106 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7107 } 7108 #endif 7109 7110 if (__kmp_version) { 7111 __kmp_print_version_2(); 7112 } 7113 7114 /* we have finished parallel initialization */ 7115 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7116 7117 KMP_MB(); 7118 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7119 7120 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7121 } 7122 7123 void __kmp_hidden_helper_initialize() { 7124 if (TCR_4(__kmp_init_hidden_helper)) 7125 return; 7126 7127 // __kmp_parallel_initialize is required before we initialize hidden helper 7128 if (!TCR_4(__kmp_init_parallel)) 7129 __kmp_parallel_initialize(); 7130 7131 // Double check. Note that this double check should not be placed before 7132 // __kmp_parallel_initialize as it will cause dead lock. 7133 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7134 if (TCR_4(__kmp_init_hidden_helper)) { 7135 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7136 return; 7137 } 7138 7139 // Set the count of hidden helper tasks to be executed to zero 7140 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7141 7142 // Set the global variable indicating that we're initializing hidden helper 7143 // team/threads 7144 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7145 7146 // Platform independent initialization 7147 __kmp_do_initialize_hidden_helper_threads(); 7148 7149 // Wait here for the finish of initialization of hidden helper teams 7150 __kmp_hidden_helper_threads_initz_wait(); 7151 7152 // We have finished hidden helper initialization 7153 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7154 7155 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7156 } 7157 7158 /* ------------------------------------------------------------------------ */ 7159 7160 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7161 kmp_team_t *team) { 7162 kmp_disp_t *dispatch; 7163 7164 KMP_MB(); 7165 7166 /* none of the threads have encountered any constructs, yet. */ 7167 this_thr->th.th_local.this_construct = 0; 7168 #if KMP_CACHE_MANAGE 7169 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7170 #endif /* KMP_CACHE_MANAGE */ 7171 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7172 KMP_DEBUG_ASSERT(dispatch); 7173 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7174 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7175 // this_thr->th.th_info.ds.ds_tid ] ); 7176 7177 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7178 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7179 if (__kmp_env_consistency_check) 7180 __kmp_push_parallel(gtid, team->t.t_ident); 7181 7182 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7183 } 7184 7185 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7186 kmp_team_t *team) { 7187 if (__kmp_env_consistency_check) 7188 __kmp_pop_parallel(gtid, team->t.t_ident); 7189 7190 __kmp_finish_implicit_task(this_thr); 7191 } 7192 7193 int __kmp_invoke_task_func(int gtid) { 7194 int rc; 7195 int tid = __kmp_tid_from_gtid(gtid); 7196 kmp_info_t *this_thr = __kmp_threads[gtid]; 7197 kmp_team_t *team = this_thr->th.th_team; 7198 7199 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7200 #if USE_ITT_BUILD 7201 if (__itt_stack_caller_create_ptr) { 7202 __kmp_itt_stack_callee_enter( 7203 (__itt_caller) 7204 team->t.t_stack_id); // inform ittnotify about entering user's code 7205 } 7206 #endif /* USE_ITT_BUILD */ 7207 #if INCLUDE_SSC_MARKS 7208 SSC_MARK_INVOKING(); 7209 #endif 7210 7211 #if OMPT_SUPPORT 7212 void *dummy; 7213 void **exit_frame_p; 7214 ompt_data_t *my_task_data; 7215 ompt_data_t *my_parallel_data; 7216 int ompt_team_size; 7217 7218 if (ompt_enabled.enabled) { 7219 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7220 .ompt_task_info.frame.exit_frame.ptr); 7221 } else { 7222 exit_frame_p = &dummy; 7223 } 7224 7225 my_task_data = 7226 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7227 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7228 if (ompt_enabled.ompt_callback_implicit_task) { 7229 ompt_team_size = team->t.t_nproc; 7230 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7231 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7232 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7233 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7234 } 7235 #endif 7236 7237 #if KMP_STATS_ENABLED 7238 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7239 if (previous_state == stats_state_e::TEAMS_REGION) { 7240 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7241 } else { 7242 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7243 } 7244 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7245 #endif 7246 7247 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7248 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7249 #if OMPT_SUPPORT 7250 , 7251 exit_frame_p 7252 #endif 7253 ); 7254 #if OMPT_SUPPORT 7255 *exit_frame_p = NULL; 7256 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7257 #endif 7258 7259 #if KMP_STATS_ENABLED 7260 if (previous_state == stats_state_e::TEAMS_REGION) { 7261 KMP_SET_THREAD_STATE(previous_state); 7262 } 7263 KMP_POP_PARTITIONED_TIMER(); 7264 #endif 7265 7266 #if USE_ITT_BUILD 7267 if (__itt_stack_caller_create_ptr) { 7268 __kmp_itt_stack_callee_leave( 7269 (__itt_caller) 7270 team->t.t_stack_id); // inform ittnotify about leaving user's code 7271 } 7272 #endif /* USE_ITT_BUILD */ 7273 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7274 7275 return rc; 7276 } 7277 7278 void __kmp_teams_master(int gtid) { 7279 // This routine is called by all master threads in teams construct 7280 kmp_info_t *thr = __kmp_threads[gtid]; 7281 kmp_team_t *team = thr->th.th_team; 7282 ident_t *loc = team->t.t_ident; 7283 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7284 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7285 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7286 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7287 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7288 7289 // This thread is a new CG root. Set up the proper variables. 
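// A contention group is the set of threads against which thread-limit-var is
// enforced; making each teams master its own CG root means the limit applies
// per team rather than across the whole league.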
7290 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7291 tmp->cg_root = thr; // Make thr the CG root 7292 // Init to thread limit that was stored when league masters were forked 7293 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7294 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7295 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7296 " cg_nthreads to 1\n", 7297 thr, tmp)); 7298 tmp->up = thr->th.th_cg_roots; 7299 thr->th.th_cg_roots = tmp; 7300 7301 // Launch league of teams now, but not let workers execute 7302 // (they hang on fork barrier until next parallel) 7303 #if INCLUDE_SSC_MARKS 7304 SSC_MARK_FORKING(); 7305 #endif 7306 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7307 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7308 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7309 #if INCLUDE_SSC_MARKS 7310 SSC_MARK_JOINING(); 7311 #endif 7312 // If the team size was reduced from the limit, set it to the new size 7313 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7314 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7315 // AC: last parameter "1" eliminates join barrier which won't work because 7316 // worker threads are in a fork barrier waiting for more parallel regions 7317 __kmp_join_call(loc, gtid 7318 #if OMPT_SUPPORT 7319 , 7320 fork_context_intel 7321 #endif 7322 , 7323 1); 7324 } 7325 7326 int __kmp_invoke_teams_master(int gtid) { 7327 kmp_info_t *this_thr = __kmp_threads[gtid]; 7328 kmp_team_t *team = this_thr->th.th_team; 7329 #if KMP_DEBUG 7330 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7331 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7332 (void *)__kmp_teams_master); 7333 #endif 7334 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7335 #if OMPT_SUPPORT 7336 int tid = __kmp_tid_from_gtid(gtid); 7337 ompt_data_t *task_data = 7338 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7339 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7340 if (ompt_enabled.ompt_callback_implicit_task) { 7341 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7342 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7343 ompt_task_initial); 7344 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7345 } 7346 #endif 7347 __kmp_teams_master(gtid); 7348 #if OMPT_SUPPORT 7349 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7350 #endif 7351 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7352 return 1; 7353 } 7354 7355 /* this sets the requested number of threads for the next parallel region 7356 encountered by this team. 
since this should be enclosed in the forkjoin 7357 critical section it should avoid race conditions with asymmetrical nested 7358 parallelism */ 7359 7360 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7361 kmp_info_t *thr = __kmp_threads[gtid]; 7362 7363 if (num_threads > 0) 7364 thr->th.th_set_nproc = num_threads; 7365 } 7366 7367 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7368 int num_threads) { 7369 KMP_DEBUG_ASSERT(thr); 7370 // Remember the number of threads for inner parallel regions 7371 if (!TCR_4(__kmp_init_middle)) 7372 __kmp_middle_initialize(); // get internal globals calculated 7373 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7374 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7375 7376 if (num_threads == 0) { 7377 if (__kmp_teams_thread_limit > 0) { 7378 num_threads = __kmp_teams_thread_limit; 7379 } else { 7380 num_threads = __kmp_avail_proc / num_teams; 7381 } 7382 // adjust num_threads w/o warning as it is not user setting 7383 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7384 // no thread_limit clause specified - do not change thread-limit-var ICV 7385 if (num_threads > __kmp_dflt_team_nth) { 7386 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7387 } 7388 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7389 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7390 } // prevent team size to exceed thread-limit-var 7391 if (num_teams * num_threads > __kmp_teams_max_nth) { 7392 num_threads = __kmp_teams_max_nth / num_teams; 7393 } 7394 if (num_threads == 0) { 7395 num_threads = 1; 7396 } 7397 } else { 7398 // This thread will be the master of the league masters 7399 // Store new thread limit; old limit is saved in th_cg_roots list 7400 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7401 // num_threads = min(num_threads, nthreads-var) 7402 if (num_threads > __kmp_dflt_team_nth) { 7403 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7404 } 7405 if (num_teams * num_threads > __kmp_teams_max_nth) { 7406 int new_threads = __kmp_teams_max_nth / num_teams; 7407 if (new_threads == 0) { 7408 new_threads = 1; 7409 } 7410 if (new_threads != num_threads) { 7411 if (!__kmp_reserve_warn) { // user asked for too many threads 7412 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7413 __kmp_msg(kmp_ms_warning, 7414 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7415 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7416 } 7417 } 7418 num_threads = new_threads; 7419 } 7420 } 7421 thr->th.th_teams_size.nth = num_threads; 7422 } 7423 7424 /* this sets the requested number of teams for the teams region and/or 7425 the number of threads for the next parallel region encountered */ 7426 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7427 int num_threads) { 7428 kmp_info_t *thr = __kmp_threads[gtid]; 7429 KMP_DEBUG_ASSERT(num_teams >= 0); 7430 KMP_DEBUG_ASSERT(num_threads >= 0); 7431 7432 if (num_teams == 0) { 7433 if (__kmp_nteams > 0) { 7434 num_teams = __kmp_nteams; 7435 } else { 7436 num_teams = 1; // default number of teams is 1. 7437 } 7438 } 7439 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
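// Warn once (guarded by __kmp_reserve_warn) and clamp the request to
// __kmp_teams_max_nth, which defaults to the number of available processors
// (set during serial initialization) and may be adjusted by environment
// settings.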
7440 if (!__kmp_reserve_warn) { 7441 __kmp_reserve_warn = 1; 7442 __kmp_msg(kmp_ms_warning, 7443 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7444 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7445 } 7446 num_teams = __kmp_teams_max_nth; 7447 } 7448 // Set number of teams (number of threads in the outer "parallel" of the 7449 // teams) 7450 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7451 7452 __kmp_push_thread_limit(thr, num_teams, num_threads); 7453 } 7454 7455 /* This sets the requested number of teams for the teams region and/or 7456 the number of threads for the next parallel region encountered */ 7457 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7458 int num_teams_ub, int num_threads) { 7459 kmp_info_t *thr = __kmp_threads[gtid]; 7460 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7461 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7462 KMP_DEBUG_ASSERT(num_threads >= 0); 7463 7464 if (num_teams_lb > num_teams_ub) { 7465 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7466 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7467 } 7468 7469 int num_teams = 1; // defalt number of teams is 1. 7470 7471 if (num_teams_lb == 0 && num_teams_ub > 0) 7472 num_teams_lb = num_teams_ub; 7473 7474 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7475 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7476 if (num_teams > __kmp_teams_max_nth) { 7477 if (!__kmp_reserve_warn) { 7478 __kmp_reserve_warn = 1; 7479 __kmp_msg(kmp_ms_warning, 7480 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7481 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7482 } 7483 num_teams = __kmp_teams_max_nth; 7484 } 7485 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7486 num_teams = num_teams_ub; 7487 } else { // num_teams_lb <= num_teams <= num_teams_ub 7488 if (num_threads == 0) { 7489 if (num_teams_ub > __kmp_teams_max_nth) { 7490 num_teams = num_teams_lb; 7491 } else { 7492 num_teams = num_teams_ub; 7493 } 7494 } else { 7495 num_teams = (num_threads > __kmp_teams_max_nth) 7496 ? num_teams 7497 : __kmp_teams_max_nth / num_threads; 7498 if (num_teams < num_teams_lb) { 7499 num_teams = num_teams_lb; 7500 } else if (num_teams > num_teams_ub) { 7501 num_teams = num_teams_ub; 7502 } 7503 } 7504 } 7505 // Set number of teams (number of threads in the outer "parallel" of the 7506 // teams) 7507 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7508 7509 __kmp_push_thread_limit(thr, num_teams, num_threads); 7510 } 7511 7512 // Set the proc_bind var to use in the following parallel region. 7513 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7514 kmp_info_t *thr = __kmp_threads[gtid]; 7515 thr->th.th_set_proc_bind = proc_bind; 7516 } 7517 7518 /* Launch the worker threads into the microtask. */ 7519 7520 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7521 kmp_info_t *this_thr = __kmp_threads[gtid]; 7522 7523 #ifdef KMP_DEBUG 7524 int f; 7525 #endif /* KMP_DEBUG */ 7526 7527 KMP_DEBUG_ASSERT(team); 7528 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7529 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7530 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7531 7532 team->t.t_construct = 0; /* no single directives seen yet */ 7533 team->t.t_ordered.dt.t_value = 7534 0; /* thread 0 enters the ordered section first */ 7535 7536 /* Reset the identifiers on the dispatch buffer */ 7537 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7538 if (team->t.t_max_nproc > 1) { 7539 int i; 7540 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7541 team->t.t_disp_buffer[i].buffer_index = i; 7542 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7543 } 7544 } else { 7545 team->t.t_disp_buffer[0].buffer_index = 0; 7546 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7547 } 7548 7549 KMP_MB(); /* Flush all pending memory write invalidates. */ 7550 KMP_ASSERT(this_thr->th.th_team == team); 7551 7552 #ifdef KMP_DEBUG 7553 for (f = 0; f < team->t.t_nproc; f++) { 7554 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7555 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7556 } 7557 #endif /* KMP_DEBUG */ 7558 7559 /* release the worker threads so they may begin working */ 7560 __kmp_fork_barrier(gtid, 0); 7561 } 7562 7563 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7564 kmp_info_t *this_thr = __kmp_threads[gtid]; 7565 7566 KMP_DEBUG_ASSERT(team); 7567 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7568 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7569 KMP_MB(); /* Flush all pending memory write invalidates. */ 7570 7571 /* Join barrier after fork */ 7572 7573 #ifdef KMP_DEBUG 7574 if (__kmp_threads[gtid] && 7575 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7576 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7577 __kmp_threads[gtid]); 7578 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7579 "team->t.t_nproc=%d\n", 7580 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7581 team->t.t_nproc); 7582 __kmp_print_structure(); 7583 } 7584 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7585 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7586 #endif /* KMP_DEBUG */ 7587 7588 __kmp_join_barrier(gtid); /* wait for everyone */ 7589 #if OMPT_SUPPORT 7590 if (ompt_enabled.enabled && 7591 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7592 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7593 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7594 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7595 #if OMPT_OPTIONAL 7596 void *codeptr = NULL; 7597 if (KMP_MASTER_TID(ds_tid) && 7598 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7599 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7600 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7601 7602 if (ompt_enabled.ompt_callback_sync_region_wait) { 7603 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7604 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7605 codeptr); 7606 } 7607 if (ompt_enabled.ompt_callback_sync_region) { 7608 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7609 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7610 codeptr); 7611 } 7612 #endif 7613 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7614 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7615 ompt_scope_end, NULL, task_data, 0, ds_tid, 7616 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7617 } 7618 } 7619 #endif 7620 7621 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7622 KMP_ASSERT(this_thr->th.th_team == team); 7623 } 7624 7625 /* ------------------------------------------------------------------------ */ 7626 7627 #ifdef USE_LOAD_BALANCE 7628 7629 // Return the worker threads actively spinning in the hot team, if we 7630 // are at the outermost level of parallelism. Otherwise, return 0. 7631 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7632 int i; 7633 int retval; 7634 kmp_team_t *hot_team; 7635 7636 if (root->r.r_active) { 7637 return 0; 7638 } 7639 hot_team = root->r.r_hot_team; 7640 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7641 return hot_team->t.t_nproc - 1; // Don't count master thread 7642 } 7643 7644 // Skip the master thread - it is accounted for elsewhere. 7645 retval = 0; 7646 for (i = 1; i < hot_team->t.t_nproc; i++) { 7647 if (hot_team->t.t_threads[i]->th.th_active) { 7648 retval++; 7649 } 7650 } 7651 return retval; 7652 } 7653 7654 // Perform an automatic adjustment to the number of 7655 // threads used by the next parallel region. 7656 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7657 int retval; 7658 int pool_active; 7659 int hot_team_active; 7660 int team_curr_active; 7661 int system_active; 7662 7663 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7664 set_nproc)); 7665 KMP_DEBUG_ASSERT(root); 7666 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7667 ->th.th_current_task->td_icvs.dynamic == TRUE); 7668 KMP_DEBUG_ASSERT(set_nproc > 1); 7669 7670 if (set_nproc == 1) { 7671 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7672 return 1; 7673 } 7674 7675 // Threads that are active in the thread pool, active in the hot team for this 7676 // particular root (if we are at the outer par level), and the currently 7677 // executing thread (to become the master) are available to add to the new 7678 // team, but are currently contributing to the system load, and must be 7679 // accounted for. 7680 pool_active = __kmp_thread_pool_active_nth; 7681 hot_team_active = __kmp_active_hot_team_nproc(root); 7682 team_curr_active = pool_active + hot_team_active + 1; 7683 7684 // Check the system load. 7685 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7686 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7687 "hot team active = %d\n", 7688 system_active, pool_active, hot_team_active)); 7689 7690 if (system_active < 0) { 7691 // There was an error reading the necessary info from /proc, so use the 7692 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7693 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7694 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7695 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7696 7697 // Make this call behave like the thread limit algorithm. 7698 retval = __kmp_avail_proc - __kmp_nth + 7699 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7700 if (retval > set_nproc) { 7701 retval = set_nproc; 7702 } 7703 if (retval < KMP_MIN_NTH) { 7704 retval = KMP_MIN_NTH; 7705 } 7706 7707 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7708 retval)); 7709 return retval; 7710 } 7711 7712 // There is a slight delay in the load balance algorithm in detecting new 7713 // running procs. The real system load at this instant should be at least as 7714 // large as the #active omp thread that are available to add to the team. 
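// In other words: retval = __kmp_avail_proc - system_active + team_curr_active,
// clamped to [KMP_MIN_NTH, set_nproc]. Illustrative numbers: with 8 available
// procs, 5 active system threads of which 3 already count toward this team,
// up to 8 - 5 + 3 = 6 threads may be handed to the new team.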
7715 if (system_active < team_curr_active) { 7716 system_active = team_curr_active; 7717 } 7718 retval = __kmp_avail_proc - system_active + team_curr_active; 7719 if (retval > set_nproc) { 7720 retval = set_nproc; 7721 } 7722 if (retval < KMP_MIN_NTH) { 7723 retval = KMP_MIN_NTH; 7724 } 7725 7726 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7727 return retval; 7728 } // __kmp_load_balance_nproc() 7729 7730 #endif /* USE_LOAD_BALANCE */ 7731 7732 /* ------------------------------------------------------------------------ */ 7733 7734 /* NOTE: this is called with the __kmp_init_lock held */ 7735 void __kmp_cleanup(void) { 7736 int f; 7737 7738 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7739 7740 if (TCR_4(__kmp_init_parallel)) { 7741 #if KMP_HANDLE_SIGNALS 7742 __kmp_remove_signals(); 7743 #endif 7744 TCW_4(__kmp_init_parallel, FALSE); 7745 } 7746 7747 if (TCR_4(__kmp_init_middle)) { 7748 #if KMP_AFFINITY_SUPPORTED 7749 __kmp_affinity_uninitialize(); 7750 #endif /* KMP_AFFINITY_SUPPORTED */ 7751 __kmp_cleanup_hierarchy(); 7752 TCW_4(__kmp_init_middle, FALSE); 7753 } 7754 7755 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7756 7757 if (__kmp_init_serial) { 7758 __kmp_runtime_destroy(); 7759 __kmp_init_serial = FALSE; 7760 } 7761 7762 __kmp_cleanup_threadprivate_caches(); 7763 7764 for (f = 0; f < __kmp_threads_capacity; f++) { 7765 if (__kmp_root[f] != NULL) { 7766 __kmp_free(__kmp_root[f]); 7767 __kmp_root[f] = NULL; 7768 } 7769 } 7770 __kmp_free(__kmp_threads); 7771 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7772 // there is no need in freeing __kmp_root. 7773 __kmp_threads = NULL; 7774 __kmp_root = NULL; 7775 __kmp_threads_capacity = 0; 7776 7777 #if KMP_USE_DYNAMIC_LOCK 7778 __kmp_cleanup_indirect_user_locks(); 7779 #else 7780 __kmp_cleanup_user_locks(); 7781 #endif 7782 7783 #if KMP_AFFINITY_SUPPORTED 7784 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7785 __kmp_cpuinfo_file = NULL; 7786 #endif /* KMP_AFFINITY_SUPPORTED */ 7787 7788 #if KMP_USE_ADAPTIVE_LOCKS 7789 #if KMP_DEBUG_ADAPTIVE_LOCKS 7790 __kmp_print_speculative_stats(); 7791 #endif 7792 #endif 7793 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7794 __kmp_nested_nth.nth = NULL; 7795 __kmp_nested_nth.size = 0; 7796 __kmp_nested_nth.used = 0; 7797 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7798 __kmp_nested_proc_bind.bind_types = NULL; 7799 __kmp_nested_proc_bind.size = 0; 7800 __kmp_nested_proc_bind.used = 0; 7801 if (__kmp_affinity_format) { 7802 KMP_INTERNAL_FREE(__kmp_affinity_format); 7803 __kmp_affinity_format = NULL; 7804 } 7805 7806 __kmp_i18n_catclose(); 7807 7808 #if KMP_USE_HIER_SCHED 7809 __kmp_hier_scheds.deallocate(); 7810 #endif 7811 7812 #if KMP_STATS_ENABLED 7813 __kmp_stats_fini(); 7814 #endif 7815 7816 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7817 } 7818 7819 /* ------------------------------------------------------------------------ */ 7820 7821 int __kmp_ignore_mppbeg(void) { 7822 char *env; 7823 7824 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7825 if (__kmp_str_match_false(env)) 7826 return FALSE; 7827 } 7828 // By default __kmpc_begin() is no-op. 7829 return TRUE; 7830 } 7831 7832 int __kmp_ignore_mppend(void) { 7833 char *env; 7834 7835 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7836 if (__kmp_str_match_false(env)) 7837 return FALSE; 7838 } 7839 // By default __kmpc_end() is no-op. 
7840 return TRUE; 7841 } 7842 7843 void __kmp_internal_begin(void) { 7844 int gtid; 7845 kmp_root_t *root; 7846 7847 /* this is a very important step as it will register new sibling threads 7848 and assign these new uber threads a new gtid */ 7849 gtid = __kmp_entry_gtid(); 7850 root = __kmp_threads[gtid]->th.th_root; 7851 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7852 7853 if (root->r.r_begin) 7854 return; 7855 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7856 if (root->r.r_begin) { 7857 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7858 return; 7859 } 7860 7861 root->r.r_begin = TRUE; 7862 7863 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7864 } 7865 7866 /* ------------------------------------------------------------------------ */ 7867 7868 void __kmp_user_set_library(enum library_type arg) { 7869 int gtid; 7870 kmp_root_t *root; 7871 kmp_info_t *thread; 7872 7873 /* first, make sure we are initialized so we can get our gtid */ 7874 7875 gtid = __kmp_entry_gtid(); 7876 thread = __kmp_threads[gtid]; 7877 7878 root = thread->th.th_root; 7879 7880 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7881 library_serial)); 7882 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7883 thread */ 7884 KMP_WARNING(SetLibraryIncorrectCall); 7885 return; 7886 } 7887 7888 switch (arg) { 7889 case library_serial: 7890 thread->th.th_set_nproc = 0; 7891 set__nproc(thread, 1); 7892 break; 7893 case library_turnaround: 7894 thread->th.th_set_nproc = 0; 7895 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7896 : __kmp_dflt_team_nth_ub); 7897 break; 7898 case library_throughput: 7899 thread->th.th_set_nproc = 0; 7900 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7901 : __kmp_dflt_team_nth_ub); 7902 break; 7903 default: 7904 KMP_FATAL(UnknownLibraryType, arg); 7905 } 7906 7907 __kmp_aux_set_library(arg); 7908 } 7909 7910 void __kmp_aux_set_stacksize(size_t arg) { 7911 if (!__kmp_init_serial) 7912 __kmp_serial_initialize(); 7913 7914 #if KMP_OS_DARWIN 7915 if (arg & (0x1000 - 1)) { 7916 arg &= ~(0x1000 - 1); 7917 if (arg + 0x1000) /* check for overflow if we round up */ 7918 arg += 0x1000; 7919 } 7920 #endif 7921 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7922 7923 /* only change the default stacksize before the first parallel region */ 7924 if (!TCR_4(__kmp_init_parallel)) { 7925 size_t value = arg; /* argument is in bytes */ 7926 7927 if (value < __kmp_sys_min_stksize) 7928 value = __kmp_sys_min_stksize; 7929 else if (value > KMP_MAX_STKSIZE) 7930 value = KMP_MAX_STKSIZE; 7931 7932 __kmp_stksize = value; 7933 7934 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7935 } 7936 7937 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7938 } 7939 7940 /* set the behaviour of the runtime library */ 7941 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7942 void __kmp_aux_set_library(enum library_type arg) { 7943 __kmp_library = arg; 7944 7945 switch (__kmp_library) { 7946 case library_serial: { 7947 KMP_INFORM(LibraryIsSerial); 7948 } break; 7949 case library_turnaround: 7950 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 7951 __kmp_use_yield = 2; // only yield when oversubscribed 7952 break; 7953 case library_throughput: 7954 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 7955 __kmp_dflt_blocktime = 200; 7956 break; 7957 default: 7958 KMP_FATAL(UnknownLibraryType, arg); 7959 } 7960 } 7961 7962 /* Getting team information common for all team API */ 7963 // Returns NULL if not in teams construct 7964 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 7965 kmp_info_t *thr = __kmp_entry_thread(); 7966 teams_serialized = 0; 7967 if (thr->th.th_teams_microtask) { 7968 kmp_team_t *team = thr->th.th_team; 7969 int tlevel = thr->th.th_teams_level; // the level of the teams construct 7970 int ii = team->t.t_level; 7971 teams_serialized = team->t.t_serialized; 7972 int level = tlevel + 1; 7973 KMP_DEBUG_ASSERT(ii >= tlevel); 7974 while (ii > level) { 7975 for (teams_serialized = team->t.t_serialized; 7976 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 7977 } 7978 if (team->t.t_serialized && (!teams_serialized)) { 7979 team = team->t.t_parent; 7980 continue; 7981 } 7982 if (ii > level) { 7983 team = team->t.t_parent; 7984 ii--; 7985 } 7986 } 7987 return team; 7988 } 7989 return NULL; 7990 } 7991 7992 int __kmp_aux_get_team_num() { 7993 int serialized; 7994 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7995 if (team) { 7996 if (serialized > 1) { 7997 return 0; // teams region is serialized ( 1 team of 1 thread ). 7998 } else { 7999 return team->t.t_master_tid; 8000 } 8001 } 8002 return 0; 8003 } 8004 8005 int __kmp_aux_get_num_teams() { 8006 int serialized; 8007 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8008 if (team) { 8009 if (serialized > 1) { 8010 return 1; 8011 } else { 8012 return team->t.t_parent->t.t_nproc; 8013 } 8014 } 8015 return 1; 8016 } 8017 8018 /* ------------------------------------------------------------------------ */ 8019 8020 /* 8021 * Affinity Format Parser 8022 * 8023 * Field is in form of: %[[[0].]size]type 8024 * % and type are required (%% means print a literal '%') 8025 * type is either single char or long name surrounded by {}, 8026 * e.g., N or {num_threads} 8027 * 0 => leading zeros 8028 * . => right justified when size is specified 8029 * by default output is left justified 8030 * size is the *minimum* field length 8031 * All other characters are printed as is 8032 * 8033 * Available field types: 8034 * L {thread_level} - omp_get_level() 8035 * n {thread_num} - omp_get_thread_num() 8036 * h {host} - name of host machine 8037 * P {process_id} - process id (integer) 8038 * T {thread_identifier} - native thread identifier (integer) 8039 * N {num_threads} - omp_get_num_threads() 8040 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8041 * a {thread_affinity} - comma separated list of integers or integer ranges 8042 * (values of affinity mask) 8043 * 8044 * Implementation-specific field types can be added 8045 * If a type is unknown, print "undefined" 8046 */ 8047 8048 // Structure holding the short name, long name, and corresponding data type 8049 // for snprintf. A table of these will represent the entire valid keyword 8050 // field types. 
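// For example, with the table below the specifier "%0.8L" is rendered through
// the snprintf format "%08d", so a nesting level of 2 prints as "00000002"
// (illustrative value).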
8051 typedef struct kmp_affinity_format_field_t { 8052 char short_name; // from spec e.g., L -> thread level 8053 const char *long_name; // from spec thread_level -> thread level 8054 char field_format; // data type for snprintf (typically 'd' or 's' 8055 // for integer or string) 8056 } kmp_affinity_format_field_t; 8057 8058 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8059 #if KMP_AFFINITY_SUPPORTED 8060 {'A', "thread_affinity", 's'}, 8061 #endif 8062 {'t', "team_num", 'd'}, 8063 {'T', "num_teams", 'd'}, 8064 {'L', "nesting_level", 'd'}, 8065 {'n', "thread_num", 'd'}, 8066 {'N', "num_threads", 'd'}, 8067 {'a', "ancestor_tnum", 'd'}, 8068 {'H', "host", 's'}, 8069 {'P', "process_id", 'd'}, 8070 {'i', "native_thread_id", 'd'}}; 8071 8072 // Return the number of characters it takes to hold field 8073 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8074 const char **ptr, 8075 kmp_str_buf_t *field_buffer) { 8076 int rc, format_index, field_value; 8077 const char *width_left, *width_right; 8078 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8079 static const int FORMAT_SIZE = 20; 8080 char format[FORMAT_SIZE] = {0}; 8081 char absolute_short_name = 0; 8082 8083 KMP_DEBUG_ASSERT(gtid >= 0); 8084 KMP_DEBUG_ASSERT(th); 8085 KMP_DEBUG_ASSERT(**ptr == '%'); 8086 KMP_DEBUG_ASSERT(field_buffer); 8087 8088 __kmp_str_buf_clear(field_buffer); 8089 8090 // Skip the initial % 8091 (*ptr)++; 8092 8093 // Check for %% first 8094 if (**ptr == '%') { 8095 __kmp_str_buf_cat(field_buffer, "%", 1); 8096 (*ptr)++; // skip over the second % 8097 return 1; 8098 } 8099 8100 // Parse field modifiers if they are present 8101 pad_zeros = false; 8102 if (**ptr == '0') { 8103 pad_zeros = true; 8104 (*ptr)++; // skip over 0 8105 } 8106 right_justify = false; 8107 if (**ptr == '.') { 8108 right_justify = true; 8109 (*ptr)++; // skip over . 8110 } 8111 // Parse width of field: [width_left, width_right) 8112 width_left = width_right = NULL; 8113 if (**ptr >= '0' && **ptr <= '9') { 8114 width_left = *ptr; 8115 SKIP_DIGITS(*ptr); 8116 width_right = *ptr; 8117 } 8118 8119 // Create the format for KMP_SNPRINTF based on flags parsed above 8120 format_index = 0; 8121 format[format_index++] = '%'; 8122 if (!right_justify) 8123 format[format_index++] = '-'; 8124 if (pad_zeros) 8125 format[format_index++] = '0'; 8126 if (width_left && width_right) { 8127 int i = 0; 8128 // Only allow 8 digit number widths. 
// Return the number of characters it takes to hold the field
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0;

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing the format variable.
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
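
// The character count returned above, accumulated by __kmp_aux_capture_affinity
// below, is what lets the public API report the full required length even when
// the user buffer is too small. A sketch of that sizing idiom (guarded out,
// not part of the build), assuming only the standard OpenMP 5.0
// omp_capture_affinity() routine from <omp.h>:
#if 0
#include <omp.h>
#include <cstdio>
#include <vector>

int main() {
#pragma omp parallel
  {
    char small[8];
    // Returns the number of characters needed (excluding the null byte),
    // even when the captured string was truncated to fit 'small'.
    size_t needed =
        omp_capture_affinity(small, sizeof(small), "thread %n/%N on %{host}");
    std::vector<char> full(needed + 1);
    omp_capture_affinity(full.data(), full.size(), "thread %n/%N on %{host}");
#pragma omp critical
    std::printf("%s\n", full.data());
  }
  return 0;
}
#endif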
/*
 * Return the number of characters needed to hold the affinity string
 * (not including the null byte).
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards.
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Record that blocktime has been set explicitly */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
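
// Sketch (guarded out, not part of the build) of user-visible entry points
// that end up in the __kmp_aux_set_* routines above. It assumes the kmp_*
// extension prototypes shipped in libomp's <omp.h>; the specific setting
// string and blocktime value are illustrative only.
#if 0
#include <omp.h>

int main() {
  // Forwarded to __kmp_aux_set_defaults(); roughly equivalent to setting the
  // environment variable before the runtime initializes.
  kmp_set_defaults("KMP_AFFINITY=compact");

  // Forwarded to __kmp_aux_set_blocktime() for the calling thread; values
  // outside [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] are clamped, so an
  // over-large request simply yields the maximum spin time.
  kmp_set_blocktime(0); // make idle threads sleep immediately

#pragma omp parallel
  { /* ... */ }
  return 0;
}
#endif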

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */
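
// For context, an illustrative user-level reduction (guarded out, not part of
// the build): the compiler lowers the reduction clause to
// __kmpc_reduce*/__kmpc_end_reduce* calls, and __kmp_determine_reduction_method()
// below picks the critical-section, atomic, or tree variant at run time based
// on team size, the KMP_IDENT_ATOMIC_REDUCE flag, and whether tree-reduction
// data/functions were generated.
#if 0
#include <cstdio>

int main() {
  double sum = 0.0;
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000; ++i)
    sum += 0.5 * i;
  std::printf("sum = %f\n", sum);
  return 0;
}
#endif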

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to the OpenMP RTL to decide which method to select
  // among those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another way of getting the team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}

// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}
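
// Sketch (guarded out, not part of the build) of the OpenMP 5.0 entry points
// that reach __kmp_soft_pause()/__kmp_hard_pause() above via
// __kmpc_pause_resource. The standard <omp.h> API is assumed; a nonzero
// return value indicates the request was rejected.
#if 0
#include <omp.h>
#include <cstdio>

int main() {
#pragma omp parallel
  { /* ... first phase of work ... */ }

  // Soft pause: worker threads go to sleep and ignore blocktime, but runtime
  // state is kept; the next parallel region simply wakes them up again.
  if (omp_pause_resource_all(omp_pause_soft) != 0)
    std::printf("soft pause was rejected\n");

  /* ... long serial phase ... */

#pragma omp parallel
  { /* ... second phase; threads are resumed ... */ }

  // Hard pause: the runtime is shut down completely and re-initializes
  // lazily if OpenMP is used afterwards.
  omp_pause_resource_all(omp_pause_hard);
  return 0;
}
#endif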
// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Globals and functions for hidden helper tasks
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads, in case
  // a regular thread pushes a hidden helper task to a hidden helper thread
  // that has not yet been awakened since the main thread released it after
  // creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
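
// Sketch (guarded out, not part of the build) of the kind of user code that
// exercises the hidden helper team set up above: deferred "target nowait"
// regions are wrapped in hidden helper tasks so their bookkeeping can make
// progress even while all regular OpenMP threads are busy. The offloaded body
// is illustrative; without a device it falls back to host execution.
#if 0
#include <omp.h>

int main() {
  int x = 0;
#pragma omp parallel
#pragma omp single
  {
#pragma omp target map(tofrom : x) nowait
    { x += 1; }
#pragma omp taskwait
  }
  return x == 1 ? 0 : 1;
}
#endif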