1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 35 #if OMP_PROFILING_SUPPORT 36 #include "llvm/Support/TimeProfiler.h" 37 static char *ProfileTraceFile = nullptr; 38 #endif 39 40 /* these are temporary issues to be dealt with */ 41 #define KMP_USE_PRCTL 0 42 43 #if KMP_OS_WINDOWS 44 #include <process.h> 45 #endif 46 47 #include "tsan_annotations.h" 48 49 #if KMP_OS_WINDOWS 50 // windows does not need include files as it doesn't use shared memory 51 #else 52 #include <sys/mman.h> 53 #include <sys/stat.h> 54 #include <fcntl.h> 55 #define SHM_SIZE 1024 56 #endif 57 58 #if defined(KMP_GOMP_COMPAT) 59 char const __kmp_version_alt_comp[] = 60 KMP_VERSION_PREFIX "alternative compiler support: yes"; 61 #endif /* defined(KMP_GOMP_COMPAT) */ 62 63 char const __kmp_version_omp_api[] = 64 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 65 66 #ifdef KMP_DEBUG 67 char const __kmp_version_lock[] = 68 KMP_VERSION_PREFIX "lock type: run time selectable"; 69 #endif /* KMP_DEBUG */ 70 71 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 72 73 /* ------------------------------------------------------------------------ */ 74 75 #if KMP_USE_MONITOR 76 kmp_info_t __kmp_monitor; 77 #endif 78 79 /* Forward declarations */ 80 81 void __kmp_cleanup(void); 82 83 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 84 int gtid); 85 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 86 kmp_internal_control_t *new_icvs, 87 ident_t *loc); 88 #if KMP_AFFINITY_SUPPORTED 89 static void __kmp_partition_places(kmp_team_t *team, 90 int update_master_only = 0); 91 #endif 92 static void __kmp_do_serial_initialize(void); 93 void __kmp_fork_barrier(int gtid, int tid); 94 void __kmp_join_barrier(int gtid); 95 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 96 kmp_internal_control_t *new_icvs, ident_t *loc); 97 98 #ifdef USE_LOAD_BALANCE 99 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 100 #endif 101 102 static int __kmp_expand_threads(int nNeed); 103 #if KMP_OS_WINDOWS 104 static int __kmp_unregister_root_other_thread(int gtid); 105 #endif 106 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 107 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 108 109 /* Calculate the identifier of the current thread */ 110 /* fast (and somewhat portable) way to get unique identifier of executing 111 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
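
   Lookup strategy, in order of preference (selected by __kmp_gtid_mode):
     mode >= 3: read the __kmp_gtid thread-local variable directly
                (KMP_TDATA_GTID builds),
     mode >= 2: query the keyed TLS slot via __kmp_gtid_get_specific(),
     otherwise: scan __kmp_threads[] for the thread whose registered stack
                range [stackbase - stacksize, stackbase] contains the address
                of a local variable of this function; only the executing
                thread's stack can contain that address.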
*/ 112 int __kmp_get_global_thread_id() { 113 int i; 114 kmp_info_t **other_threads; 115 size_t stack_data; 116 char *stack_addr; 117 size_t stack_size; 118 char *stack_base; 119 120 KA_TRACE( 121 1000, 122 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 123 __kmp_nth, __kmp_all_nth)); 124 125 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 126 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 127 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 128 __kmp_init_gtid for this to work. */ 129 130 if (!TCR_4(__kmp_init_gtid)) 131 return KMP_GTID_DNE; 132 133 #ifdef KMP_TDATA_GTID 134 if (TCR_4(__kmp_gtid_mode) >= 3) { 135 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 136 return __kmp_gtid; 137 } 138 #endif 139 if (TCR_4(__kmp_gtid_mode) >= 2) { 140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 141 return __kmp_gtid_get_specific(); 142 } 143 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 144 145 stack_addr = (char *)&stack_data; 146 other_threads = __kmp_threads; 147 148 /* ATT: The code below is a source of potential bugs due to unsynchronized 149 access to __kmp_threads array. For example: 150 1. Current thread loads other_threads[i] to thr and checks it, it is 151 non-NULL. 152 2. Current thread is suspended by OS. 153 3. Another thread unregisters and finishes (debug versions of free() 154 may fill memory with something like 0xEF). 155 4. Current thread is resumed. 156 5. Current thread reads junk from *thr. 157 TODO: Fix it. --ln */ 158 159 for (i = 0; i < __kmp_threads_capacity; i++) { 160 161 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 162 if (!thr) 163 continue; 164 165 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 166 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 167 168 /* stack grows down -- search through all of the active threads */ 169 170 if (stack_addr <= stack_base) { 171 size_t stack_diff = stack_base - stack_addr; 172 173 if (stack_diff <= stack_size) { 174 /* The only way we can be closer than the allocated */ 175 /* stack size is if we are running on this thread. */ 176 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 177 return i; 178 } 179 } 180 } 181 182 /* get specific to try and determine our gtid */ 183 KA_TRACE(1000, 184 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 185 "thread, using TLS\n")); 186 i = __kmp_gtid_get_specific(); 187 188 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 189 190 /* if we havn't been assigned a gtid, then return code */ 191 if (i < 0) 192 return i; 193 194 /* dynamically updated stack window for uber threads to avoid get_specific 195 call */ 196 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 197 KMP_FATAL(StackOverflow, i); 198 } 199 200 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 201 if (stack_addr > stack_base) { 202 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 203 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 204 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 205 stack_base); 206 } else { 207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 208 stack_base - stack_addr); 209 } 210 211 /* Reprint stack bounds for ubermaster since they have been refined */ 212 if (__kmp_storage_map) { 213 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 214 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 215 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 216 other_threads[i]->th.th_info.ds.ds_stacksize, 217 "th_%d stack (refinement)", i); 218 } 219 return i; 220 } 221 222 int __kmp_get_global_thread_id_reg() { 223 int gtid; 224 225 if (!__kmp_init_serial) { 226 gtid = KMP_GTID_DNE; 227 } else 228 #ifdef KMP_TDATA_GTID 229 if (TCR_4(__kmp_gtid_mode) >= 3) { 230 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 231 gtid = __kmp_gtid; 232 } else 233 #endif 234 if (TCR_4(__kmp_gtid_mode) >= 2) { 235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 236 gtid = __kmp_gtid_get_specific(); 237 } else { 238 KA_TRACE(1000, 239 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 240 gtid = __kmp_get_global_thread_id(); 241 } 242 243 /* we must be a new uber master sibling thread */ 244 if (gtid == KMP_GTID_DNE) { 245 KA_TRACE(10, 246 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 247 "Registering a new gtid.\n")); 248 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 249 if (!__kmp_init_serial) { 250 __kmp_do_serial_initialize(); 251 gtid = __kmp_gtid_get_specific(); 252 } else { 253 gtid = __kmp_register_root(FALSE); 254 } 255 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 256 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 257 } 258 259 KMP_DEBUG_ASSERT(gtid >= 0); 260 261 return gtid; 262 } 263 264 /* caller must hold forkjoin_lock */ 265 void __kmp_check_stack_overlap(kmp_info_t *th) { 266 int f; 267 char *stack_beg = NULL; 268 char *stack_end = NULL; 269 int gtid; 270 271 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 272 if (__kmp_storage_map) { 273 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 274 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 275 276 gtid = __kmp_gtid_from_thread(th); 277 278 if (gtid == KMP_GTID_MONITOR) { 279 __kmp_print_storage_map_gtid( 280 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 281 "th_%s stack (%s)", "mon", 282 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 283 } else { 284 __kmp_print_storage_map_gtid( 285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 286 "th_%d stack (%s)", gtid, 287 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 288 } 289 } 290 291 /* No point in checking ubermaster threads since they use refinement and 292 * cannot overlap */ 293 gtid = __kmp_gtid_from_thread(th); 294 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 295 KA_TRACE(10, 296 ("__kmp_check_stack_overlap: performing extensive checking\n")); 297 if (stack_beg == NULL) { 298 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 299 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 300 } 301 302 for (f = 0; f < __kmp_threads_capacity; f++) { 303 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 304 305 if (f_th && f_th != th) { 306 char *other_stack_end = 307 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 308 char *other_stack_beg = 309 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 310 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 311 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 312 313 /* Print the other stack values before the abort */ 314 if (__kmp_storage_map) 315 __kmp_print_storage_map_gtid( 316 -1, other_stack_beg, other_stack_end, 317 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 318 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 319 320 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 321 __kmp_msg_null); 322 } 323 } 324 } 325 } 326 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 327 } 328 329 /* ------------------------------------------------------------------------ */ 330 331 void __kmp_infinite_loop(void) { 332 static int done = FALSE; 333 334 while (!done) { 335 KMP_YIELD(TRUE); 336 } 337 } 338 339 #define MAX_MESSAGE 512 340 341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 342 char const *format, ...) { 343 char buffer[MAX_MESSAGE]; 344 va_list ap; 345 346 va_start(ap, format); 347 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 348 p2, (unsigned long)size, format); 349 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 350 __kmp_vprintf(kmp_err, buffer, ap); 351 #if KMP_PRINT_DATA_PLACEMENT 352 int node; 353 if (gtid >= 0) { 354 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 355 if (__kmp_storage_map_verbose) { 356 node = __kmp_get_host_node(p1); 357 if (node < 0) /* doesn't work, so don't try this next time */ 358 __kmp_storage_map_verbose = FALSE; 359 else { 360 char *last; 361 int lastNode; 362 int localProc = __kmp_get_cpu_from_gtid(gtid); 363 364 const int page_size = KMP_GET_PAGE_SIZE(); 365 366 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 367 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 368 if (localProc >= 0) 369 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 370 localProc >> 1); 371 else 372 __kmp_printf_no_lock(" GTID %d\n", gtid); 373 #if KMP_USE_PRCTL 374 /* The more elaborate format is disabled for now because of the prctl 375 * hanging bug. */ 376 do { 377 last = p1; 378 lastNode = node; 379 /* This loop collates adjacent pages with the same host node. 
*/ 380 do { 381 (char *)p1 += page_size; 382 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 383 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 384 lastNode); 385 } while (p1 <= p2); 386 #else 387 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 388 (char *)p1 + (page_size - 1), 389 __kmp_get_host_node(p1)); 390 if (p1 < p2) { 391 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 392 (char *)p2 + (page_size - 1), 393 __kmp_get_host_node(p2)); 394 } 395 #endif 396 } 397 } 398 } else 399 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 400 } 401 #endif /* KMP_PRINT_DATA_PLACEMENT */ 402 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 403 } 404 405 void __kmp_warn(char const *format, ...) { 406 char buffer[MAX_MESSAGE]; 407 va_list ap; 408 409 if (__kmp_generate_warnings == kmp_warnings_off) { 410 return; 411 } 412 413 va_start(ap, format); 414 415 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 416 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 417 __kmp_vprintf(kmp_err, buffer, ap); 418 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 419 420 va_end(ap); 421 } 422 423 void __kmp_abort_process() { 424 // Later threads may stall here, but that's ok because abort() will kill them. 425 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 426 427 if (__kmp_debug_buf) { 428 __kmp_dump_debug_buffer(); 429 } 430 431 if (KMP_OS_WINDOWS) { 432 // Let other threads know of abnormal termination and prevent deadlock 433 // if abort happened during library initialization or shutdown 434 __kmp_global.g.g_abort = SIGABRT; 435 436 /* On Windows* OS by default abort() causes pop-up error box, which stalls 437 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 438 boxes. _set_abort_behavior() works well, but this function is not 439 available in VS7 (this is not problem for DLL, but it is a problem for 440 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 441 help, at least in some versions of MS C RTL. 442 443 It seems following sequence is the only way to simulate abort() and 444 avoid pop-up error box. */ 445 raise(SIGABRT); 446 _exit(3); // Just in case, if signal ignored, exit anyway. 447 } else { 448 __kmp_unregister_library(); 449 abort(); 450 } 451 452 __kmp_infinite_loop(); 453 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 454 455 } // __kmp_abort_process 456 457 void __kmp_abort_thread(void) { 458 // TODO: Eliminate g_abort global variable and this function. 459 // In case of abort just call abort(), it will kill all the threads. 460 __kmp_infinite_loop(); 461 } // __kmp_abort_thread 462 463 /* Print out the storage map for the major kmp_info_t thread data structures 464 that are allocated together. 
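   Each call below goes through __kmp_print_storage_map_gtid() above, which
   emits one line per structure of the form
       OMP storage map: <start-addr> <end-addr>  <size> <label>
   (format string "OMP storage map: %p %p%8lu %s\n").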
*/ 465 466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 467 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 468 gtid); 469 470 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 471 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 472 473 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 474 sizeof(kmp_local_t), "th_%d.th_local", gtid); 475 476 __kmp_print_storage_map_gtid( 477 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 478 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 479 480 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 481 &thr->th.th_bar[bs_plain_barrier + 1], 482 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 483 gtid); 484 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 486 &thr->th.th_bar[bs_forkjoin_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 488 gtid); 489 490 #if KMP_FAST_REDUCTION_BARRIER 491 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 492 &thr->th.th_bar[bs_reduction_barrier + 1], 493 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 494 gtid); 495 #endif // KMP_FAST_REDUCTION_BARRIER 496 } 497 498 /* Print out the storage map for the major kmp_team_t team data structures 499 that are allocated together. */ 500 501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 502 int team_id, int num_thr) { 503 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 504 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 505 header, team_id); 506 507 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 508 &team->t.t_bar[bs_last_barrier], 509 sizeof(kmp_balign_team_t) * bs_last_barrier, 510 "%s_%d.t_bar", header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 513 &team->t.t_bar[bs_plain_barrier + 1], 514 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 515 header, team_id); 516 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 518 &team->t.t_bar[bs_forkjoin_barrier + 1], 519 sizeof(kmp_balign_team_t), 520 "%s_%d.t_bar[forkjoin]", header, team_id); 521 522 #if KMP_FAST_REDUCTION_BARRIER 523 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 524 &team->t.t_bar[bs_reduction_barrier + 1], 525 sizeof(kmp_balign_team_t), 526 "%s_%d.t_bar[reduction]", header, team_id); 527 #endif // KMP_FAST_REDUCTION_BARRIER 528 529 __kmp_print_storage_map_gtid( 530 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 531 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 532 533 __kmp_print_storage_map_gtid( 534 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 535 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 536 537 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 538 &team->t.t_disp_buffer[num_disp_buff], 539 sizeof(dispatch_shared_info_t) * num_disp_buff, 540 "%s_%d.t_disp_buffer", header, team_id); 541 } 542 543 static void __kmp_init_allocator() { 544 __kmp_init_memkind(); 545 __kmp_init_target_mem(); 546 } 547 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 548 549 /* ------------------------------------------------------------------------ */ 550 551 #if KMP_DYNAMIC_LIB 552 #if KMP_OS_WINDOWS 553 554 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 555 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 556 557 
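  // fdwReason tells us why this entry point was invoked; the runtime only
  // needs to react to process attach/detach and thread detach below.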
switch (fdwReason) { 558 559 case DLL_PROCESS_ATTACH: 560 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 561 562 return TRUE; 563 564 case DLL_PROCESS_DETACH: 565 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 566 567 // According to Windows* documentation for DllMain entry point: 568 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 569 // lpReserved == NULL when FreeLibrary() is called, 570 // lpReserved != NULL when the process is terminated. 571 // When FreeLibrary() is called, worker threads remain alive. So the 572 // runtime's state is consistent and executing proper shutdown is OK. 573 // When the process is terminated, worker threads have exited or been 574 // forcefully terminated by the OS and only the shutdown thread remains. 575 // This can leave the runtime in an inconsistent state. 576 // Hence, only attempt proper cleanup when FreeLibrary() is called. 577 // Otherwise, rely on OS to reclaim resources. 578 if (lpReserved == NULL) 579 __kmp_internal_end_library(__kmp_gtid_get_specific()); 580 581 return TRUE; 582 583 case DLL_THREAD_ATTACH: 584 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 585 586 /* if we want to register new siblings all the time here call 587 * __kmp_get_gtid(); */ 588 return TRUE; 589 590 case DLL_THREAD_DETACH: 591 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 592 593 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 594 return TRUE; 595 } 596 597 return TRUE; 598 } 599 600 #endif /* KMP_OS_WINDOWS */ 601 #endif /* KMP_DYNAMIC_LIB */ 602 603 /* __kmp_parallel_deo -- Wait until it's our turn. */ 604 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 605 int gtid = *gtid_ref; 606 #ifdef BUILD_PARALLEL_ORDERED 607 kmp_team_t *team = __kmp_team_from_gtid(gtid); 608 #endif /* BUILD_PARALLEL_ORDERED */ 609 610 if (__kmp_env_consistency_check) { 611 if (__kmp_threads[gtid]->th.th_root->r.r_active) 612 #if KMP_USE_DYNAMIC_LOCK 613 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 614 #else 615 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 616 #endif 617 } 618 #ifdef BUILD_PARALLEL_ORDERED 619 if (!team->t.t_serialized) { 620 KMP_MB(); 621 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 622 NULL); 623 KMP_MB(); 624 } 625 #endif /* BUILD_PARALLEL_ORDERED */ 626 } 627 628 /* __kmp_parallel_dxo -- Signal the next task. */ 629 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 630 int gtid = *gtid_ref; 631 #ifdef BUILD_PARALLEL_ORDERED 632 int tid = __kmp_tid_from_gtid(gtid); 633 kmp_team_t *team = __kmp_team_from_gtid(gtid); 634 #endif /* BUILD_PARALLEL_ORDERED */ 635 636 if (__kmp_env_consistency_check) { 637 if (__kmp_threads[gtid]->th.th_root->r.r_active) 638 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 639 } 640 #ifdef BUILD_PARALLEL_ORDERED 641 if (!team->t.t_serialized) { 642 KMP_MB(); /* Flush all pending memory write invalidates. */ 643 644 /* use the tid of the next thread in this team */ 645 /* TODO replace with general release procedure */ 646 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 647 648 KMP_MB(); /* Flush all pending memory write invalidates. 
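     The store above hands the ordered section to the next thread: e.g. with
     t_nproc == 4, thread 2 writes 3 into t_value, which satisfies the KMP_EQ
     wait in __kmp_parallel_deo on thread 3; the last thread wraps back to 0.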
*/ 649 } 650 #endif /* BUILD_PARALLEL_ORDERED */ 651 } 652 653 /* ------------------------------------------------------------------------ */ 654 /* The BARRIER for a SINGLE process section is always explicit */ 655 656 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 657 int status; 658 kmp_info_t *th; 659 kmp_team_t *team; 660 661 if (!TCR_4(__kmp_init_parallel)) 662 __kmp_parallel_initialize(); 663 __kmp_resume_if_soft_paused(); 664 665 th = __kmp_threads[gtid]; 666 team = th->th.th_team; 667 status = 0; 668 669 th->th.th_ident = id_ref; 670 671 if (team->t.t_serialized) { 672 status = 1; 673 } else { 674 kmp_int32 old_this = th->th.th_local.this_construct; 675 676 ++th->th.th_local.this_construct; 677 /* try to set team count to thread count--success means thread got the 678 single block */ 679 /* TODO: Should this be acquire or release? */ 680 if (team->t.t_construct == old_this) { 681 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 682 th->th.th_local.this_construct); 683 } 684 #if USE_ITT_BUILD 685 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 686 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 687 team->t.t_active_level == 688 1) { // Only report metadata by master of active team at level 1 689 __kmp_itt_metadata_single(id_ref); 690 } 691 #endif /* USE_ITT_BUILD */ 692 } 693 694 if (__kmp_env_consistency_check) { 695 if (status && push_ws) { 696 __kmp_push_workshare(gtid, ct_psingle, id_ref); 697 } else { 698 __kmp_check_workshare(gtid, ct_psingle, id_ref); 699 } 700 } 701 #if USE_ITT_BUILD 702 if (status) { 703 __kmp_itt_single_start(gtid); 704 } 705 #endif /* USE_ITT_BUILD */ 706 return status; 707 } 708 709 void __kmp_exit_single(int gtid) { 710 #if USE_ITT_BUILD 711 __kmp_itt_single_end(gtid); 712 #endif /* USE_ITT_BUILD */ 713 if (__kmp_env_consistency_check) 714 __kmp_pop_workshare(gtid, ct_psingle, NULL); 715 } 716 717 /* determine if we can go parallel or must use a serialized parallel region and 718 * how many threads we can use 719 * set_nproc is the number of threads requested for the team 720 * returns 0 if we should serialize or only use one thread, 721 * otherwise the number of threads to use 722 * The forkjoin lock is held by the caller. */ 723 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 724 int master_tid, int set_nthreads, 725 int enter_teams) { 726 int capacity; 727 int new_nthreads; 728 KMP_DEBUG_ASSERT(__kmp_init_serial); 729 KMP_DEBUG_ASSERT(root && parent_team); 730 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 731 732 // If dyn-var is set, dynamically adjust the number of desired threads, 733 // according to the method specified by dynamic_mode. 734 new_nthreads = set_nthreads; 735 if (!get__dynamic_2(parent_team, master_tid)) { 736 ; 737 } 738 #ifdef USE_LOAD_BALANCE 739 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 740 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 741 if (new_nthreads == 1) { 742 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 743 "reservation to 1 thread\n", 744 master_tid)); 745 return 1; 746 } 747 if (new_nthreads < set_nthreads) { 748 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 749 "reservation to %d threads\n", 750 master_tid, new_nthreads)); 751 } 752 } 753 #endif /* USE_LOAD_BALANCE */ 754 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 755 new_nthreads = __kmp_avail_proc - __kmp_nth + 756 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 757 if (new_nthreads <= 1) { 758 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 759 "reservation to 1 thread\n", 760 master_tid)); 761 return 1; 762 } 763 if (new_nthreads < set_nthreads) { 764 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 765 "reservation to %d threads\n", 766 master_tid, new_nthreads)); 767 } else { 768 new_nthreads = set_nthreads; 769 } 770 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 771 if (set_nthreads > 2) { 772 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 773 new_nthreads = (new_nthreads % set_nthreads) + 1; 774 if (new_nthreads == 1) { 775 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 776 "reservation to 1 thread\n", 777 master_tid)); 778 return 1; 779 } 780 if (new_nthreads < set_nthreads) { 781 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 782 "reservation to %d threads\n", 783 master_tid, new_nthreads)); 784 } 785 } 786 } else { 787 KMP_ASSERT(0); 788 } 789 790 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 791 if (__kmp_nth + new_nthreads - 792 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 793 __kmp_max_nth) { 794 int tl_nthreads = __kmp_max_nth - __kmp_nth + 795 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 796 if (tl_nthreads <= 0) { 797 tl_nthreads = 1; 798 } 799 800 // If dyn-var is false, emit a 1-time warning. 801 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 802 __kmp_reserve_warn = 1; 803 __kmp_msg(kmp_ms_warning, 804 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 805 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 806 } 807 if (tl_nthreads == 1) { 808 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 809 "reduced reservation to 1 thread\n", 810 master_tid)); 811 return 1; 812 } 813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 814 "reservation to %d threads\n", 815 master_tid, tl_nthreads)); 816 new_nthreads = tl_nthreads; 817 } 818 819 // Respect OMP_THREAD_LIMIT 820 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 821 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 822 if (cg_nthreads + new_nthreads - 823 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 824 max_cg_threads) { 825 int tl_nthreads = max_cg_threads - cg_nthreads + 826 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 827 if (tl_nthreads <= 0) { 828 tl_nthreads = 1; 829 } 830 831 // If dyn-var is false, emit a 1-time warning. 832 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 833 __kmp_reserve_warn = 1; 834 __kmp_msg(kmp_ms_warning, 835 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 836 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 837 } 838 if (tl_nthreads == 1) { 839 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 840 "reduced reservation to 1 thread\n", 841 master_tid)); 842 return 1; 843 } 844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 845 "reservation to %d threads\n", 846 master_tid, tl_nthreads)); 847 new_nthreads = tl_nthreads; 848 } 849 850 // Check if the threads array is large enough, or needs expanding. 851 // See comment in __kmp_register_root() about the adjustment if 852 // __kmp_threads[0] == NULL. 853 capacity = __kmp_threads_capacity; 854 if (TCR_PTR(__kmp_threads[0]) == NULL) { 855 --capacity; 856 } 857 if (__kmp_nth + new_nthreads - 858 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc) > 859 capacity) { 860 // Expand the threads array. 861 int slotsRequired = __kmp_nth + new_nthreads - 862 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 863 capacity; 864 int slotsAdded = __kmp_expand_threads(slotsRequired); 865 if (slotsAdded < slotsRequired) { 866 // The threads array was not expanded enough. 867 new_nthreads -= (slotsRequired - slotsAdded); 868 KMP_ASSERT(new_nthreads >= 1); 869 870 // If dyn-var is false, emit a 1-time warning. 871 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 872 __kmp_reserve_warn = 1; 873 if (__kmp_tp_cached) { 874 __kmp_msg(kmp_ms_warning, 875 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 876 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 877 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 878 } else { 879 __kmp_msg(kmp_ms_warning, 880 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 881 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 882 } 883 } 884 } 885 } 886 887 #ifdef KMP_DEBUG 888 if (new_nthreads == 1) { 889 KC_TRACE(10, 890 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 891 "dead roots and rechecking; requested %d threads\n", 892 __kmp_get_gtid(), set_nthreads)); 893 } else { 894 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 895 " %d threads\n", 896 __kmp_get_gtid(), new_nthreads, set_nthreads)); 897 } 898 #endif // KMP_DEBUG 899 return new_nthreads; 900 } 901 902 /* Allocate threads from the thread pool and assign them to the new team. We are 903 assured that there are enough threads available, because we checked on that 904 earlier within critical section forkjoin */ 905 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 906 kmp_info_t *master_th, int master_gtid) { 907 int i; 908 int use_hot_team; 909 910 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 911 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 912 KMP_MB(); 913 914 /* first, let's setup the master thread */ 915 master_th->th.th_info.ds.ds_tid = 0; 916 master_th->th.th_team = team; 917 master_th->th.th_team_nproc = team->t.t_nproc; 918 master_th->th.th_team_master = master_th; 919 master_th->th.th_team_serialized = FALSE; 920 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 921 922 /* make sure we are not the optimized hot team */ 923 #if KMP_NESTED_HOT_TEAMS 924 use_hot_team = 0; 925 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 926 if (hot_teams) { // hot teams array is not allocated if 927 // KMP_HOT_TEAMS_MAX_LEVEL=0 928 int level = team->t.t_active_level - 1; // index in array of hot teams 929 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
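      // Inside a teams construct, t_active_level understates the nesting
      // depth seen by hot teams (the teams-masters / teams-workers layers do
      // not bump it); the two checks below compensate so that each nesting
      // depth indexes its own slot in the hot_teams array.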
930 if (master_th->th.th_teams_size.nteams > 1) { 931 ++level; // level was not increased in teams construct for 932 // team_of_masters 933 } 934 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 935 master_th->th.th_teams_level == team->t.t_level) { 936 ++level; // level was not increased in teams construct for 937 // team_of_workers before the parallel 938 } // team->t.t_level will be increased inside parallel 939 } 940 if (level < __kmp_hot_teams_max_level) { 941 if (hot_teams[level].hot_team) { 942 // hot team has already been allocated for given level 943 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 944 use_hot_team = 1; // the team is ready to use 945 } else { 946 use_hot_team = 0; // AC: threads are not allocated yet 947 hot_teams[level].hot_team = team; // remember new hot team 948 hot_teams[level].hot_team_nth = team->t.t_nproc; 949 } 950 } else { 951 use_hot_team = 0; 952 } 953 } 954 #else 955 use_hot_team = team == root->r.r_hot_team; 956 #endif 957 if (!use_hot_team) { 958 959 /* install the master thread */ 960 team->t.t_threads[0] = master_th; 961 __kmp_initialize_info(master_th, team, 0, master_gtid); 962 963 /* now, install the worker threads */ 964 for (i = 1; i < team->t.t_nproc; i++) { 965 966 /* fork or reallocate a new thread and install it in team */ 967 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 968 team->t.t_threads[i] = thr; 969 KMP_DEBUG_ASSERT(thr); 970 KMP_DEBUG_ASSERT(thr->th.th_team == team); 971 /* align team and thread arrived states */ 972 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 973 "T#%d(%d:%d) join =%llu, plain=%llu\n", 974 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 975 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 976 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 977 team->t.t_bar[bs_plain_barrier].b_arrived)); 978 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 979 thr->th.th_teams_level = master_th->th.th_teams_level; 980 thr->th.th_teams_size = master_th->th.th_teams_size; 981 { // Initialize threads' barrier data. 982 int b; 983 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 984 for (b = 0; b < bs_last_barrier; ++b) { 985 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 986 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 987 #if USE_DEBUGGER 988 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 989 #endif 990 } 991 } 992 } 993 994 #if KMP_AFFINITY_SUPPORTED 995 __kmp_partition_places(team); 996 #endif 997 } 998 999 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1000 for (i = 0; i < team->t.t_nproc; i++) { 1001 kmp_info_t *thr = team->t.t_threads[i]; 1002 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1003 thr->th.th_prev_level != team->t.t_level) { 1004 team->t.t_display_affinity = 1; 1005 break; 1006 } 1007 } 1008 } 1009 1010 KMP_MB(); 1011 } 1012 1013 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1014 // Propagate any changes to the floating point control registers out to the team 1015 // We try to avoid unnecessary writes to the relevant cache line in the team 1016 // structure, so we don't make changes unless they are needed. 
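// KMP_CHECK_UPDATE is expected to expand to a conditional store, roughly
//   if ((dst) != (new_val)) (dst) = (new_val);
// so a team whose saved FP state already matches the master's is not written.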
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // by the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of
   the single master thread.
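   The thread's cached th_serial_team is reused for this: the first serialized
   level initializes it (t_serialized = 1), and every further nested serialized
   parallel just bumps t_serialized and pushes one more dispatch buffer, so no
   additional threads or real teams are created.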
*/ 1080 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1081 kmp_info_t *this_thr; 1082 kmp_team_t *serial_team; 1083 1084 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1085 1086 /* Skip all this code for autopar serialized loops since it results in 1087 unacceptable overhead */ 1088 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1089 return; 1090 1091 if (!TCR_4(__kmp_init_parallel)) 1092 __kmp_parallel_initialize(); 1093 __kmp_resume_if_soft_paused(); 1094 1095 this_thr = __kmp_threads[global_tid]; 1096 serial_team = this_thr->th.th_serial_team; 1097 1098 /* utilize the serialized team held by this thread */ 1099 KMP_DEBUG_ASSERT(serial_team); 1100 KMP_MB(); 1101 1102 if (__kmp_tasking_mode != tskm_immediate_exec) { 1103 KMP_DEBUG_ASSERT( 1104 this_thr->th.th_task_team == 1105 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1106 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1107 NULL); 1108 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1109 "team %p, new task_team = NULL\n", 1110 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1111 this_thr->th.th_task_team = NULL; 1112 } 1113 1114 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1115 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1116 proc_bind = proc_bind_false; 1117 } else if (proc_bind == proc_bind_default) { 1118 // No proc_bind clause was specified, so use the current value 1119 // of proc-bind-var for this parallel region. 1120 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1121 } 1122 // Reset for next parallel region 1123 this_thr->th.th_set_proc_bind = proc_bind_default; 1124 1125 #if OMPT_SUPPORT 1126 ompt_data_t ompt_parallel_data = ompt_data_none; 1127 ompt_data_t *implicit_task_data; 1128 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1129 if (ompt_enabled.enabled && 1130 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1131 1132 ompt_task_info_t *parent_task_info; 1133 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1134 1135 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1136 if (ompt_enabled.ompt_callback_parallel_begin) { 1137 int team_size = 1; 1138 1139 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1140 &(parent_task_info->task_data), &(parent_task_info->frame), 1141 &ompt_parallel_data, team_size, 1142 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1143 } 1144 } 1145 #endif // OMPT_SUPPORT 1146 1147 if (this_thr->th.th_team != serial_team) { 1148 // Nested level will be an index in the nested nthreads array 1149 int level = this_thr->th.th_team->t.t_level; 1150 1151 if (serial_team->t.t_serialized) { 1152 /* this serial team was already used 1153 TODO increase performance by making this locks more specific */ 1154 kmp_team_t *new_team; 1155 1156 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1157 1158 new_team = 1159 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1160 #if OMPT_SUPPORT 1161 ompt_parallel_data, 1162 #endif 1163 proc_bind, &this_thr->th.th_current_task->td_icvs, 1164 0 USE_NESTED_HOT_ARG(NULL)); 1165 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1166 KMP_ASSERT(new_team); 1167 1168 /* setup new serialized team and install it */ 1169 new_team->t.t_threads[0] = this_thr; 1170 new_team->t.t_parent = this_thr->th.th_team; 1171 serial_team = new_team; 1172 this_thr->th.th_serial_team = serial_team; 1173 1174 KF_TRACE( 1175 10, 1176 
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1177 global_tid, serial_team)); 1178 1179 /* TODO the above breaks the requirement that if we run out of resources, 1180 then we can still guarantee that serialized teams are ok, since we may 1181 need to allocate a new one */ 1182 } else { 1183 KF_TRACE( 1184 10, 1185 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1186 global_tid, serial_team)); 1187 } 1188 1189 /* we have to initialize this serial team */ 1190 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1191 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1192 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1193 serial_team->t.t_ident = loc; 1194 serial_team->t.t_serialized = 1; 1195 serial_team->t.t_nproc = 1; 1196 serial_team->t.t_parent = this_thr->th.th_team; 1197 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1198 this_thr->th.th_team = serial_team; 1199 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1200 1201 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1202 this_thr->th.th_current_task)); 1203 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1204 this_thr->th.th_current_task->td_flags.executing = 0; 1205 1206 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1207 1208 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1209 implicit task for each serialized task represented by 1210 team->t.t_serialized? */ 1211 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1212 &this_thr->th.th_current_task->td_parent->td_icvs); 1213 1214 // Thread value exists in the nested nthreads array for the next nested 1215 // level 1216 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1217 this_thr->th.th_current_task->td_icvs.nproc = 1218 __kmp_nested_nth.nth[level + 1]; 1219 } 1220 1221 if (__kmp_nested_proc_bind.used && 1222 (level + 1 < __kmp_nested_proc_bind.used)) { 1223 this_thr->th.th_current_task->td_icvs.proc_bind = 1224 __kmp_nested_proc_bind.bind_types[level + 1]; 1225 } 1226 1227 #if USE_DEBUGGER 1228 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1229 #endif 1230 this_thr->th.th_info.ds.ds_tid = 0; 1231 1232 /* set thread cache values */ 1233 this_thr->th.th_team_nproc = 1; 1234 this_thr->th.th_team_master = this_thr; 1235 this_thr->th.th_team_serialized = 1; 1236 1237 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1238 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1239 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1240 1241 propagateFPControl(serial_team); 1242 1243 /* check if we need to allocate dispatch buffers stack */ 1244 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1245 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1246 serial_team->t.t_dispatch->th_disp_buffer = 1247 (dispatch_private_info_t *)__kmp_allocate( 1248 sizeof(dispatch_private_info_t)); 1249 } 1250 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1251 1252 KMP_MB(); 1253 1254 } else { 1255 /* this serialized team is already being used, 1256 * that's fine, just add another nested level */ 1257 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1258 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1259 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1260 ++serial_team->t.t_serialized; 1261 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1262 1263 // Nested level will be an index in the nested nthreads array 1264 int level = this_thr->th.th_team->t.t_level; 1265 // Thread value exists in the nested nthreads array for the next nested 1266 // level 1267 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1268 this_thr->th.th_current_task->td_icvs.nproc = 1269 __kmp_nested_nth.nth[level + 1]; 1270 } 1271 serial_team->t.t_level++; 1272 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1273 "of serial team %p to %d\n", 1274 global_tid, serial_team, serial_team->t.t_level)); 1275 1276 /* allocate/push dispatch buffers stack */ 1277 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1278 { 1279 dispatch_private_info_t *disp_buffer = 1280 (dispatch_private_info_t *)__kmp_allocate( 1281 sizeof(dispatch_private_info_t)); 1282 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1283 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1284 } 1285 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1286 1287 KMP_MB(); 1288 } 1289 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1290 1291 // Perform the display affinity functionality for 1292 // serialized parallel regions 1293 if (__kmp_display_affinity) { 1294 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1295 this_thr->th.th_prev_num_threads != 1) { 1296 // NULL means use the affinity-format-var ICV 1297 __kmp_aux_display_affinity(global_tid, NULL); 1298 this_thr->th.th_prev_level = serial_team->t.t_level; 1299 this_thr->th.th_prev_num_threads = 1; 1300 } 1301 } 1302 1303 if (__kmp_env_consistency_check) 1304 __kmp_push_parallel(global_tid, NULL); 1305 #if OMPT_SUPPORT 1306 serial_team->t.ompt_team_info.master_return_address = codeptr; 1307 if (ompt_enabled.enabled && 1308 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1309 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1310 OMPT_GET_FRAME_ADDRESS(0); 1311 1312 ompt_lw_taskteam_t lw_taskteam; 1313 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1314 &ompt_parallel_data, codeptr); 1315 1316 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1317 // don't use lw_taskteam after linking. 
    // content was swapped

    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing.
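         KMP_ALLOCA below simply burns __kmp_stkpadding bytes of this root's
         stack so that sibling root threads start their frames at different
         offsets; the odd arithmetic on dummy that follows exists only to keep
         the allocation from being optimized away.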
*/ 1369 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1370 /* These 2 lines below are so this does not get optimized out */ 1371 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1372 __kmp_stkpadding += (short)((kmp_int64)dummy); 1373 } 1374 1375 /* initialize if needed */ 1376 KMP_DEBUG_ASSERT( 1377 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1378 if (!TCR_4(__kmp_init_parallel)) 1379 __kmp_parallel_initialize(); 1380 __kmp_resume_if_soft_paused(); 1381 1382 /* setup current data */ 1383 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1384 // shutdown 1385 parent_team = master_th->th.th_team; 1386 master_tid = master_th->th.th_info.ds.ds_tid; 1387 master_this_cons = master_th->th.th_local.this_construct; 1388 root = master_th->th.th_root; 1389 master_active = root->r.r_active; 1390 master_set_numthreads = master_th->th.th_set_nproc; 1391 1392 #if OMPT_SUPPORT 1393 ompt_data_t ompt_parallel_data = ompt_data_none; 1394 ompt_data_t *parent_task_data; 1395 ompt_frame_t *ompt_frame; 1396 ompt_data_t *implicit_task_data; 1397 void *return_address = NULL; 1398 1399 if (ompt_enabled.enabled) { 1400 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1401 NULL, NULL); 1402 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1403 } 1404 #endif 1405 1406 // Nested level will be an index in the nested nthreads array 1407 level = parent_team->t.t_level; 1408 // used to launch non-serial teams even if nested is not allowed 1409 active_level = parent_team->t.t_active_level; 1410 // needed to check nesting inside the teams 1411 teams_level = master_th->th.th_teams_level; 1412 #if KMP_NESTED_HOT_TEAMS 1413 p_hot_teams = &master_th->th.th_hot_teams; 1414 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1415 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1416 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1417 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1418 // it is either actual or not needed (when active_level > 0) 1419 (*p_hot_teams)[0].hot_team_nth = 1; 1420 } 1421 #endif 1422 1423 #if OMPT_SUPPORT 1424 if (ompt_enabled.enabled) { 1425 if (ompt_enabled.ompt_callback_parallel_begin) { 1426 int team_size = master_set_numthreads 1427 ? master_set_numthreads 1428 : get__nproc_2(parent_team, master_tid); 1429 int flags = OMPT_INVOKER(call_context) | 1430 ((microtask == (microtask_t)__kmp_teams_master) 1431 ? ompt_parallel_league 1432 : ompt_parallel_team); 1433 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1434 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1435 return_address); 1436 } 1437 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1438 } 1439 #endif 1440 1441 master_th->th.th_ident = loc; 1442 1443 if (master_th->th.th_teams_microtask && ap && 1444 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1445 // AC: This is start of parallel that is nested inside teams construct. 1446 // The team is actual (hot), all workers are ready at the fork barrier. 1447 // No lock needed to initialize the team a bit, then free workers. 
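      // The enclosing (hot) team is reused directly: the outlined-body
      // arguments are copied into parent_team->t.t_argv below, and the
      // microtask is then either run inline on this thread (enclosing
      // serialized case) or handed to the already-forked workers via
      // __kmp_internal_fork() and parent_team->t.t_invoke().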
1448 parent_team->t.t_ident = loc; 1449 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1450 parent_team->t.t_argc = argc; 1451 argv = (void **)parent_team->t.t_argv; 1452 for (i = argc - 1; i >= 0; --i) 1453 *argv++ = va_arg(kmp_va_deref(ap), void *); 1454 // Increment our nested depth levels, but not increase the serialization 1455 if (parent_team == master_th->th.th_serial_team) { 1456 // AC: we are in serialized parallel 1457 __kmpc_serialized_parallel(loc, gtid); 1458 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1459 1460 if (call_context == fork_context_gnu) { 1461 // AC: need to decrement t_serialized for enquiry functions to work 1462 // correctly, will restore at join time 1463 parent_team->t.t_serialized--; 1464 return TRUE; 1465 } 1466 1467 #if OMPT_SUPPORT 1468 void *dummy; 1469 void **exit_frame_p; 1470 1471 ompt_lw_taskteam_t lw_taskteam; 1472 1473 if (ompt_enabled.enabled) { 1474 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1475 &ompt_parallel_data, return_address); 1476 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1477 1478 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1479 // don't use lw_taskteam after linking. content was swaped 1480 1481 /* OMPT implicit task begin */ 1482 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1483 if (ompt_enabled.ompt_callback_implicit_task) { 1484 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1485 __kmp_tid_from_gtid(gtid); 1486 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1487 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1488 implicit_task_data, 1, 1489 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1490 } 1491 1492 /* OMPT state */ 1493 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1494 } else { 1495 exit_frame_p = &dummy; 1496 } 1497 #endif 1498 // AC: need to decrement t_serialized for enquiry functions to work 1499 // correctly, will restore at join time 1500 parent_team->t.t_serialized--; 1501 1502 { 1503 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1504 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1505 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1506 #if OMPT_SUPPORT 1507 , 1508 exit_frame_p 1509 #endif 1510 ); 1511 } 1512 1513 #if OMPT_SUPPORT 1514 if (ompt_enabled.enabled) { 1515 *exit_frame_p = NULL; 1516 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1517 if (ompt_enabled.ompt_callback_implicit_task) { 1518 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1519 ompt_scope_end, NULL, implicit_task_data, 1, 1520 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1521 } 1522 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1523 __ompt_lw_taskteam_unlink(master_th); 1524 if (ompt_enabled.ompt_callback_parallel_end) { 1525 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1526 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1527 OMPT_INVOKER(call_context) | ompt_parallel_team, 1528 return_address); 1529 } 1530 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1531 } 1532 #endif 1533 return TRUE; 1534 } 1535 1536 parent_team->t.t_pkfn = microtask; 1537 parent_team->t.t_invoke = invoker; 1538 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1539 parent_team->t.t_active_level++; 1540 parent_team->t.t_level++; 1541 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1542 1543 #if OMPT_SUPPORT 1544 if (ompt_enabled.enabled) { 1545 ompt_lw_taskteam_t lw_taskteam; 1546 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1547 
&ompt_parallel_data, return_address); 1548 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1549 } 1550 #endif 1551 1552 /* Change number of threads in the team if requested */ 1553 if (master_set_numthreads) { // The parallel has num_threads clause 1554 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1555 // AC: only can reduce number of threads dynamically, can't increase 1556 kmp_info_t **other_threads = parent_team->t.t_threads; 1557 parent_team->t.t_nproc = master_set_numthreads; 1558 for (i = 0; i < master_set_numthreads; ++i) { 1559 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1560 } 1561 // Keep extra threads hot in the team for possible next parallels 1562 } 1563 master_th->th.th_set_nproc = 0; 1564 } 1565 1566 #if USE_DEBUGGER 1567 if (__kmp_debugging) { // Let debugger override number of threads. 1568 int nth = __kmp_omp_num_threads(loc); 1569 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1570 master_set_numthreads = nth; 1571 } 1572 } 1573 #endif 1574 1575 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1576 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1577 KMP_ITT_DEBUG) && 1578 __kmp_forkjoin_frames_mode == 3 && 1579 parent_team->t.t_active_level == 1 // only report frames at level 1 1580 && master_th->th.th_teams_size.nteams == 1) { 1581 kmp_uint64 tmp_time = __itt_get_timestamp(); 1582 master_th->th.th_frame_time = tmp_time; 1583 parent_team->t.t_region_time = tmp_time; 1584 } 1585 if (__itt_stack_caller_create_ptr) { 1586 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1587 // create new stack stitching id before entering fork barrier 1588 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1589 } 1590 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1591 1592 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1593 "master_th=%p, gtid=%d\n", 1594 root, parent_team, master_th, gtid)); 1595 __kmp_internal_fork(loc, gtid, parent_team); 1596 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1597 "master_th=%p, gtid=%d\n", 1598 root, parent_team, master_th, gtid)); 1599 1600 if (call_context == fork_context_gnu) 1601 return TRUE; 1602 1603 /* Invoke microtask for MASTER thread */ 1604 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1605 parent_team->t.t_id, parent_team->t.t_pkfn)); 1606 1607 if (!parent_team->t.t_invoke(gtid)) { 1608 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 1609 } 1610 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1611 parent_team->t.t_id, parent_team->t.t_pkfn)); 1612 KMP_MB(); /* Flush all pending memory write invalidates. */ 1613 1614 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1615 1616 return TRUE; 1617 } // Parallel closely nested in teams construct 1618 1619 #if KMP_DEBUG 1620 if (__kmp_tasking_mode != tskm_immediate_exec) { 1621 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1622 parent_team->t.t_task_team[master_th->th.th_task_state]); 1623 } 1624 #endif 1625 1626 int enter_teams = 0; 1627 if (parent_team->t.t_active_level >= 1628 master_th->th.th_current_task->td_icvs.max_active_levels) { 1629 nthreads = 1; 1630 } else { 1631 enter_teams = ((ap == NULL && active_level == 0) || 1632 (ap && teams_level > 0 && teams_level == level)); 1633 nthreads = 1634 master_set_numthreads 1635 ? 
master_set_numthreads 1636 : get__nproc_2( 1637 parent_team, 1638 master_tid); // TODO: get nproc directly from current task 1639 1640 // Check if we need to take forkjoin lock? (no need for serialized 1641 // parallel out of teams construct). This code moved here from 1642 // __kmp_reserve_threads() to speedup nested serialized parallels. 1643 if (nthreads > 1) { 1644 if ((get__max_active_levels(master_th) == 1 && 1645 (root->r.r_in_parallel && !enter_teams)) || 1646 (__kmp_library == library_serial)) { 1647 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1648 " threads\n", 1649 gtid, nthreads)); 1650 nthreads = 1; 1651 } 1652 } 1653 if (nthreads > 1) { 1654 /* determine how many new threads we can use */ 1655 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1656 /* AC: If we execute teams from parallel region (on host), then teams 1657 should be created but each can only have 1 thread if nesting is 1658 disabled. If teams called from serial region, then teams and their 1659 threads should be created regardless of the nesting setting. */ 1660 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1661 nthreads, enter_teams); 1662 if (nthreads == 1) { 1663 // Free lock for single thread execution here; for multi-thread 1664 // execution it will be freed later after team of threads created 1665 // and initialized 1666 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1667 } 1668 } 1669 } 1670 KMP_DEBUG_ASSERT(nthreads > 0); 1671 1672 // If we temporarily changed the set number of threads then restore it now 1673 master_th->th.th_set_nproc = 0; 1674 1675 /* create a serialized parallel region? */ 1676 if (nthreads == 1) { 1677 /* josh todo: hypothetical question: what do we do for OS X*? */ 1678 #if KMP_OS_LINUX && \ 1679 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1680 void *args[argc]; 1681 #else 1682 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1683 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1684 KMP_ARCH_AARCH64) */ 1685 1686 KA_TRACE(20, 1687 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1688 1689 __kmpc_serialized_parallel(loc, gtid); 1690 1691 if (call_context == fork_context_intel) { 1692 /* TODO this sucks, use the compiler itself to pass args! :) */ 1693 master_th->th.th_serial_team->t.t_ident = loc; 1694 if (!ap) { 1695 // revert change made in __kmpc_serialized_parallel() 1696 master_th->th.th_serial_team->t.t_level--; 1697 // Get args from parent team for teams construct 1698 1699 #if OMPT_SUPPORT 1700 void *dummy; 1701 void **exit_frame_p; 1702 ompt_task_info_t *task_info; 1703 1704 ompt_lw_taskteam_t lw_taskteam; 1705 1706 if (ompt_enabled.enabled) { 1707 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1708 &ompt_parallel_data, return_address); 1709 1710 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1711 // don't use lw_taskteam after linking. 
content was swaped 1712 1713 task_info = OMPT_CUR_TASK_INFO(master_th); 1714 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1715 if (ompt_enabled.ompt_callback_implicit_task) { 1716 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1717 __kmp_tid_from_gtid(gtid); 1718 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1719 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1720 &(task_info->task_data), 1, 1721 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1722 ompt_task_implicit); 1723 } 1724 1725 /* OMPT state */ 1726 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1727 } else { 1728 exit_frame_p = &dummy; 1729 } 1730 #endif 1731 1732 { 1733 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1734 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1735 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1736 parent_team->t.t_argv 1737 #if OMPT_SUPPORT 1738 , 1739 exit_frame_p 1740 #endif 1741 ); 1742 } 1743 1744 #if OMPT_SUPPORT 1745 if (ompt_enabled.enabled) { 1746 *exit_frame_p = NULL; 1747 if (ompt_enabled.ompt_callback_implicit_task) { 1748 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1749 ompt_scope_end, NULL, &(task_info->task_data), 1, 1750 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1751 ompt_task_implicit); 1752 } 1753 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1754 __ompt_lw_taskteam_unlink(master_th); 1755 if (ompt_enabled.ompt_callback_parallel_end) { 1756 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1757 &ompt_parallel_data, parent_task_data, 1758 OMPT_INVOKER(call_context) | ompt_parallel_team, 1759 return_address); 1760 } 1761 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1762 } 1763 #endif 1764 } else if (microtask == (microtask_t)__kmp_teams_master) { 1765 KMP_DEBUG_ASSERT(master_th->th.th_team == 1766 master_th->th.th_serial_team); 1767 team = master_th->th.th_team; 1768 // team->t.t_pkfn = microtask; 1769 team->t.t_invoke = invoker; 1770 __kmp_alloc_argv_entries(argc, team, TRUE); 1771 team->t.t_argc = argc; 1772 argv = (void **)team->t.t_argv; 1773 if (ap) { 1774 for (i = argc - 1; i >= 0; --i) 1775 *argv++ = va_arg(kmp_va_deref(ap), void *); 1776 } else { 1777 for (i = 0; i < argc; ++i) 1778 // Get args from parent team for teams construct 1779 argv[i] = parent_team->t.t_argv[i]; 1780 } 1781 // AC: revert change made in __kmpc_serialized_parallel() 1782 // because initial code in teams should have level=0 1783 team->t.t_level--; 1784 // AC: call special invoker for outer "parallel" of teams construct 1785 invoker(gtid); 1786 #if OMPT_SUPPORT 1787 if (ompt_enabled.enabled) { 1788 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1789 if (ompt_enabled.ompt_callback_implicit_task) { 1790 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1791 ompt_scope_end, NULL, &(task_info->task_data), 0, 1792 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1793 } 1794 if (ompt_enabled.ompt_callback_parallel_end) { 1795 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1796 &ompt_parallel_data, parent_task_data, 1797 OMPT_INVOKER(call_context) | ompt_parallel_league, 1798 return_address); 1799 } 1800 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1801 } 1802 #endif 1803 } else { 1804 argv = args; 1805 for (i = argc - 1; i >= 0; --i) 1806 *argv++ = va_arg(kmp_va_deref(ap), void *); 1807 KMP_MB(); 1808 1809 #if OMPT_SUPPORT 1810 void *dummy; 1811 void **exit_frame_p; 1812 ompt_task_info_t *task_info; 1813 1814 ompt_lw_taskteam_t lw_taskteam; 1815 1816 if (ompt_enabled.enabled) { 1817 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1818 &ompt_parallel_data, return_address); 1819 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1820 // don't use lw_taskteam after linking. content was swaped 1821 task_info = OMPT_CUR_TASK_INFO(master_th); 1822 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1823 1824 /* OMPT implicit task begin */ 1825 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1826 if (ompt_enabled.ompt_callback_implicit_task) { 1827 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1828 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1829 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1830 ompt_task_implicit); 1831 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1832 __kmp_tid_from_gtid(gtid); 1833 } 1834 1835 /* OMPT state */ 1836 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1837 } else { 1838 exit_frame_p = &dummy; 1839 } 1840 #endif 1841 1842 { 1843 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1844 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1845 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1846 #if OMPT_SUPPORT 1847 , 1848 exit_frame_p 1849 #endif 1850 ); 1851 } 1852 1853 #if OMPT_SUPPORT 1854 if (ompt_enabled.enabled) { 1855 *exit_frame_p = NULL; 1856 if (ompt_enabled.ompt_callback_implicit_task) { 1857 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1858 ompt_scope_end, NULL, &(task_info->task_data), 1, 1859 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1860 ompt_task_implicit); 1861 } 1862 1863 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1864 __ompt_lw_taskteam_unlink(master_th); 1865 if (ompt_enabled.ompt_callback_parallel_end) { 1866 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1867 &ompt_parallel_data, parent_task_data, 1868 OMPT_INVOKER(call_context) | ompt_parallel_team, 1869 return_address); 1870 } 1871 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1872 } 1873 #endif 1874 } 1875 } else if (call_context == fork_context_gnu) { 1876 #if OMPT_SUPPORT 1877 ompt_lw_taskteam_t lwt; 1878 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1879 return_address); 1880 1881 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1882 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1883 // don't use lw_taskteam after linking. 
content was swaped 1884 #endif 1885 1886 // we were called from GNU native code 1887 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1888 return FALSE; 1889 } else { 1890 KMP_ASSERT2(call_context < fork_context_last, 1891 "__kmp_fork_call: unknown fork_context parameter"); 1892 } 1893 1894 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1895 KMP_MB(); 1896 return FALSE; 1897 } // if (nthreads == 1) 1898 1899 // GEH: only modify the executing flag in the case when not serialized 1900 // serialized case is handled in kmpc_serialized_parallel 1901 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1902 "curtask=%p, curtask_max_aclevel=%d\n", 1903 parent_team->t.t_active_level, master_th, 1904 master_th->th.th_current_task, 1905 master_th->th.th_current_task->td_icvs.max_active_levels)); 1906 // TODO: GEH - cannot do this assertion because root thread not set up as 1907 // executing 1908 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1909 master_th->th.th_current_task->td_flags.executing = 0; 1910 1911 if (!master_th->th.th_teams_microtask || level > teams_level) { 1912 /* Increment our nested depth level */ 1913 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1914 } 1915 1916 // See if we need to make a copy of the ICVs. 1917 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1918 if ((level + 1 < __kmp_nested_nth.used) && 1919 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1920 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1921 } else { 1922 nthreads_icv = 0; // don't update 1923 } 1924 1925 // Figure out the proc_bind_policy for the new team. 1926 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1927 kmp_proc_bind_t proc_bind_icv = 1928 proc_bind_default; // proc_bind_default means don't update 1929 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1930 proc_bind = proc_bind_false; 1931 } else { 1932 if (proc_bind == proc_bind_default) { 1933 // No proc_bind clause specified; use current proc-bind-var for this 1934 // parallel region 1935 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1936 } 1937 /* else: The proc_bind policy was specified explicitly on parallel clause. 1938 This overrides proc-bind-var for this parallel region, but does not 1939 change proc-bind-var. */ 1940 // Figure the value of proc-bind-var for the child threads. 
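// Illustrative example (assuming the usual parsing of the OMP_PROC_BIND
// list into __kmp_nested_proc_bind): with OMP_PROC_BIND=spread,close the
// level-0 parallel is bound with "spread", while bind_types[level + 1]
// supplies "close" as the proc-bind-var the child threads will use for
// their own nested parallel regions. proc_bind_icv stays proc_bind_default
// ("don't update") unless the list supplies a different policy for the
// next level.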
1941 if ((level + 1 < __kmp_nested_proc_bind.used) && 1942 (__kmp_nested_proc_bind.bind_types[level + 1] != 1943 master_th->th.th_current_task->td_icvs.proc_bind)) { 1944 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1945 } 1946 } 1947 1948 // Reset for next parallel region 1949 master_th->th.th_set_proc_bind = proc_bind_default; 1950 1951 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1952 kmp_internal_control_t new_icvs; 1953 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1954 new_icvs.next = NULL; 1955 if (nthreads_icv > 0) { 1956 new_icvs.nproc = nthreads_icv; 1957 } 1958 if (proc_bind_icv != proc_bind_default) { 1959 new_icvs.proc_bind = proc_bind_icv; 1960 } 1961 1962 /* allocate a new parallel team */ 1963 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1964 team = __kmp_allocate_team(root, nthreads, nthreads, 1965 #if OMPT_SUPPORT 1966 ompt_parallel_data, 1967 #endif 1968 proc_bind, &new_icvs, 1969 argc USE_NESTED_HOT_ARG(master_th)); 1970 } else { 1971 /* allocate a new parallel team */ 1972 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1973 team = __kmp_allocate_team(root, nthreads, nthreads, 1974 #if OMPT_SUPPORT 1975 ompt_parallel_data, 1976 #endif 1977 proc_bind, 1978 &master_th->th.th_current_task->td_icvs, 1979 argc USE_NESTED_HOT_ARG(master_th)); 1980 } 1981 KF_TRACE( 1982 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 1983 1984 /* setup the new team */ 1985 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 1986 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 1987 KMP_CHECK_UPDATE(team->t.t_ident, loc); 1988 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 1989 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 1990 #if OMPT_SUPPORT 1991 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 1992 return_address); 1993 #endif 1994 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 1995 // TODO: parent_team->t.t_level == INT_MAX ??? 1996 if (!master_th->th.th_teams_microtask || level > teams_level) { 1997 int new_level = parent_team->t.t_level + 1; 1998 KMP_CHECK_UPDATE(team->t.t_level, new_level); 1999 new_level = parent_team->t.t_active_level + 1; 2000 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2001 } else { 2002 // AC: Do not increase parallel level at start of the teams construct 2003 int new_level = parent_team->t.t_level; 2004 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2005 new_level = parent_team->t.t_active_level; 2006 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2007 } 2008 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2009 // set master's schedule as new run-time schedule 2010 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2011 2012 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2013 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2014 2015 // Update the floating point rounding in the team if required. 2016 propagateFPControl(team); 2017 2018 if (__kmp_tasking_mode != tskm_immediate_exec) { 2019 // Set master's task team to team's task team. Unless this is hot team, it 2020 // should be NULL. 
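// The master's task state is pushed onto th_task_state_memo_stack below so
// that __kmp_join_call can restore it when this region ends; the stack is
// grown by doubling once th_task_state_top reaches th_task_state_stack_sz.
// Rough sketch of the push (shorthand for the th_task_state_* fields used
// by the code that follows):
//   memo_stack[top++] = th_task_state; // remember current state
//   th_task_state = 0;                 // start fresh for the new team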
2021 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2022 parent_team->t.t_task_team[master_th->th.th_task_state]); 2023 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2024 "%p, new task_team %p / team %p\n", 2025 __kmp_gtid_from_thread(master_th), 2026 master_th->th.th_task_team, parent_team, 2027 team->t.t_task_team[master_th->th.th_task_state], team)); 2028 2029 if (active_level || master_th->th.th_task_team) { 2030 // Take a memo of master's task_state 2031 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2032 if (master_th->th.th_task_state_top >= 2033 master_th->th.th_task_state_stack_sz) { // increase size 2034 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2035 kmp_uint8 *old_stack, *new_stack; 2036 kmp_uint32 i; 2037 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2038 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2039 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2040 } 2041 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2042 ++i) { // zero-init rest of stack 2043 new_stack[i] = 0; 2044 } 2045 old_stack = master_th->th.th_task_state_memo_stack; 2046 master_th->th.th_task_state_memo_stack = new_stack; 2047 master_th->th.th_task_state_stack_sz = new_size; 2048 __kmp_free(old_stack); 2049 } 2050 // Store master's task_state on stack 2051 master_th->th 2052 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2053 master_th->th.th_task_state; 2054 master_th->th.th_task_state_top++; 2055 #if KMP_NESTED_HOT_TEAMS 2056 if (master_th->th.th_hot_teams && 2057 active_level < __kmp_hot_teams_max_level && 2058 team == master_th->th.th_hot_teams[active_level].hot_team) { 2059 // Restore master's nested state if nested hot team 2060 master_th->th.th_task_state = 2061 master_th->th 2062 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2063 } else { 2064 #endif 2065 master_th->th.th_task_state = 0; 2066 #if KMP_NESTED_HOT_TEAMS 2067 } 2068 #endif 2069 } 2070 #if !KMP_NESTED_HOT_TEAMS 2071 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2072 (team == root->r.r_hot_team)); 2073 #endif 2074 } 2075 2076 KA_TRACE( 2077 20, 2078 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2079 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2080 team->t.t_nproc)); 2081 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2082 (team->t.t_master_tid == 0 && 2083 (team->t.t_parent == root->r.r_root_team || 2084 team->t.t_parent->t.t_serialized))); 2085 KMP_MB(); 2086 2087 /* now, setup the arguments */ 2088 argv = (void **)team->t.t_argv; 2089 if (ap) { 2090 for (i = argc - 1; i >= 0; --i) { 2091 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2092 KMP_CHECK_UPDATE(*argv, new_argv); 2093 argv++; 2094 } 2095 } else { 2096 for (i = 0; i < argc; ++i) { 2097 // Get args from parent team for teams construct 2098 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2099 } 2100 } 2101 2102 /* now actually fork the threads */ 2103 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2104 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2105 root->r.r_active = TRUE; 2106 2107 __kmp_fork_team_threads(root, team, master_th, gtid); 2108 __kmp_setup_icv_copy(team, nthreads, 2109 &master_th->th.th_current_task->td_icvs, loc); 2110 2111 #if OMPT_SUPPORT 2112 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2113 #endif 2114 2115 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2116 2117 #if USE_ITT_BUILD 2118 if 
(team->t.t_active_level == 1 // only report frames at level 1 2119 && !master_th->th.th_teams_microtask) { // not in teams construct 2120 #if USE_ITT_NOTIFY 2121 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2122 (__kmp_forkjoin_frames_mode == 3 || 2123 __kmp_forkjoin_frames_mode == 1)) { 2124 kmp_uint64 tmp_time = 0; 2125 if (__itt_get_timestamp_ptr) 2126 tmp_time = __itt_get_timestamp(); 2127 // Internal fork - report frame begin 2128 master_th->th.th_frame_time = tmp_time; 2129 if (__kmp_forkjoin_frames_mode == 3) 2130 team->t.t_region_time = tmp_time; 2131 } else 2132 // only one notification scheme (either "submit" or "forking/joined", not both) 2133 #endif /* USE_ITT_NOTIFY */ 2134 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2135 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2136 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2137 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2138 } 2139 } 2140 #endif /* USE_ITT_BUILD */ 2141 2142 /* now go on and do the work */ 2143 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2144 KMP_MB(); 2145 KF_TRACE(10, 2146 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2147 root, team, master_th, gtid)); 2148 2149 #if USE_ITT_BUILD 2150 if (__itt_stack_caller_create_ptr) { 2151 // create new stack stitching id before entering fork barrier 2152 if (!enter_teams) { 2153 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2154 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2155 } else if (parent_team->t.t_serialized) { 2156 // keep stack stitching id in the serialized parent_team; 2157 // current team will be used for parallel inside the teams; 2158 // if parent_team is active, then it already keeps stack stitching id 2159 // for the league of teams 2160 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2161 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2162 } 2163 } 2164 #endif /* USE_ITT_BUILD */ 2165 2166 // AC: skip __kmp_internal_fork at teams construct, let only master 2167 // threads execute 2168 if (ap) { 2169 __kmp_internal_fork(loc, gtid, team); 2170 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2171 "master_th=%p, gtid=%d\n", 2172 root, team, master_th, gtid)); 2173 } 2174 2175 if (call_context == fork_context_gnu) { 2176 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2177 return TRUE; 2178 } 2179 2180 /* Invoke microtask for MASTER thread */ 2181 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2182 team->t.t_id, team->t.t_pkfn)); 2183 } // END of timer KMP_fork_call block 2184 2185 #if KMP_STATS_ENABLED 2186 // If beginning a teams construct, then change thread state 2187 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2188 if (!ap) { 2189 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2190 } 2191 #endif 2192 2193 if (!team->t.t_invoke(gtid)) { 2194 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2195 } 2196 2197 #if KMP_STATS_ENABLED 2198 // If was beginning of a teams construct, then reset thread state 2199 if (!ap) { 2200 KMP_SET_THREAD_STATE(previous_state); 2201 } 2202 #endif 2203 2204 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2205 team->t.t_id, team->t.t_pkfn)); 2206 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2207 2208 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2209 2210 #if OMPT_SUPPORT 2211 if (ompt_enabled.enabled) { 2212 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2213 } 2214 #endif 2215 2216 return TRUE; 2217 } 2218 2219 #if OMPT_SUPPORT 2220 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2221 kmp_team_t *team) { 2222 // restore state outside the region 2223 thread->th.ompt_thread_info.state = 2224 ((team->t.t_serialized) ? ompt_state_work_serial 2225 : ompt_state_work_parallel); 2226 } 2227 2228 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2229 kmp_team_t *team, ompt_data_t *parallel_data, 2230 int flags, void *codeptr) { 2231 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2232 if (ompt_enabled.ompt_callback_parallel_end) { 2233 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2234 parallel_data, &(task_info->task_data), flags, codeptr); 2235 } 2236 2237 task_info->frame.enter_frame = ompt_data_none; 2238 __kmp_join_restore_state(thread, team); 2239 } 2240 #endif 2241 2242 void __kmp_join_call(ident_t *loc, int gtid 2243 #if OMPT_SUPPORT 2244 , 2245 enum fork_context_e fork_context 2246 #endif 2247 , 2248 int exit_teams) { 2249 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2250 kmp_team_t *team; 2251 kmp_team_t *parent_team; 2252 kmp_info_t *master_th; 2253 kmp_root_t *root; 2254 int master_active; 2255 2256 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2257 2258 /* setup current data */ 2259 master_th = __kmp_threads[gtid]; 2260 root = master_th->th.th_root; 2261 team = master_th->th.th_team; 2262 parent_team = team->t.t_parent; 2263 2264 master_th->th.th_ident = loc; 2265 2266 #if OMPT_SUPPORT 2267 void *team_microtask = (void *)team->t.t_pkfn; 2268 // For GOMP interface with serialized parallel, need the 2269 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2270 // and end-parallel events. 
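// Consequently the thread state is only switched to ompt_state_overhead
// here when this join is not the GOMP serialized case; for that case
// __kmpc_end_serialized_parallel performs the OMPT bookkeeping instead.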
2271 if (ompt_enabled.enabled && 2272 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2273 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2274 } 2275 #endif 2276 2277 #if KMP_DEBUG 2278 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2279 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2280 "th_task_team = %p\n", 2281 __kmp_gtid_from_thread(master_th), team, 2282 team->t.t_task_team[master_th->th.th_task_state], 2283 master_th->th.th_task_team)); 2284 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2285 team->t.t_task_team[master_th->th.th_task_state]); 2286 } 2287 #endif 2288 2289 if (team->t.t_serialized) { 2290 if (master_th->th.th_teams_microtask) { 2291 // We are in teams construct 2292 int level = team->t.t_level; 2293 int tlevel = master_th->th.th_teams_level; 2294 if (level == tlevel) { 2295 // AC: we haven't incremented it earlier at start of teams construct, 2296 // so do it here - at the end of teams construct 2297 team->t.t_level++; 2298 } else if (level == tlevel + 1) { 2299 // AC: we are exiting parallel inside teams, need to increment 2300 // serialization in order to restore it in the next call to 2301 // __kmpc_end_serialized_parallel 2302 team->t.t_serialized++; 2303 } 2304 } 2305 __kmpc_end_serialized_parallel(loc, gtid); 2306 2307 #if OMPT_SUPPORT 2308 if (ompt_enabled.enabled) { 2309 __kmp_join_restore_state(master_th, parent_team); 2310 } 2311 #endif 2312 2313 return; 2314 } 2315 2316 master_active = team->t.t_master_active; 2317 2318 if (!exit_teams) { 2319 // AC: No barrier for internal teams at exit from teams construct. 2320 // But there is barrier for external team (league). 2321 __kmp_internal_join(loc, gtid, team); 2322 #if USE_ITT_BUILD 2323 if (__itt_stack_caller_create_ptr) { 2324 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2325 // destroy the stack stitching id after join barrier 2326 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2327 team->t.t_stack_id = NULL; 2328 } 2329 #endif 2330 } else { 2331 master_th->th.th_task_state = 2332 0; // AC: no tasking in teams (out of any parallel) 2333 #if USE_ITT_BUILD 2334 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2335 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2336 // destroy the stack stitching id on exit from the teams construct 2337 // if parent_team is active, then the id will be destroyed later on 2338 // by master of the league of teams 2339 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2340 parent_team->t.t_stack_id = NULL; 2341 } 2342 #endif 2343 } 2344 2345 KMP_MB(); 2346 2347 #if OMPT_SUPPORT 2348 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2349 void *codeptr = team->t.ompt_team_info.master_return_address; 2350 #endif 2351 2352 #if USE_ITT_BUILD 2353 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
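// Two reporting schemes are possible here and they are mutually exclusive:
//  - __kmp_forkjoin_frames_mode == 3: submit the whole fork/join frame via
//    __kmp_itt_frame_submit, using the t_region_time/th_frame_time values
//    captured at fork time;
//  - frames mode 0 with __kmp_forkjoin_frames set: emit the "joined"
//    notification via __kmp_itt_region_joined, pairing the "forking"
//    notification issued in __kmp_fork_call.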
2354 if (team->t.t_active_level == 1 && 2355 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2356 master_th->th.th_teams_size.nteams == 1)) { 2357 master_th->th.th_ident = loc; 2358 // only one notification scheme (either "submit" or "forking/joined", not 2359 // both) 2360 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2361 __kmp_forkjoin_frames_mode == 3) 2362 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2363 master_th->th.th_frame_time, 0, loc, 2364 master_th->th.th_team_nproc, 1); 2365 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2366 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2367 __kmp_itt_region_joined(gtid); 2368 } // active_level == 1 2369 #endif /* USE_ITT_BUILD */ 2370 2371 if (master_th->th.th_teams_microtask && !exit_teams && 2372 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2373 team->t.t_level == master_th->th.th_teams_level + 1) { 2374 // AC: We need to leave the team structure intact at the end of parallel 2375 // inside the teams construct, so that at the next parallel same (hot) team 2376 // works, only adjust nesting levels 2377 #if OMPT_SUPPORT 2378 ompt_data_t ompt_parallel_data = ompt_data_none; 2379 if (ompt_enabled.enabled) { 2380 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2381 if (ompt_enabled.ompt_callback_implicit_task) { 2382 int ompt_team_size = team->t.t_nproc; 2383 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2384 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2385 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2386 } 2387 task_info->frame.exit_frame = ompt_data_none; 2388 task_info->task_data = ompt_data_none; 2389 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2390 __ompt_lw_taskteam_unlink(master_th); 2391 } 2392 #endif 2393 /* Decrement our nested depth level */ 2394 team->t.t_level--; 2395 team->t.t_active_level--; 2396 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2397 2398 // Restore number of threads in the team if needed. This code relies on 2399 // the proper adjustment of th_teams_size.nth after the fork in 2400 // __kmp_teams_master on each teams master in the case that 2401 // __kmp_reserve_threads reduced it. 2402 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2403 int old_num = master_th->th.th_team_nproc; 2404 int new_num = master_th->th.th_teams_size.nth; 2405 kmp_info_t **other_threads = team->t.t_threads; 2406 team->t.t_nproc = new_num; 2407 for (int i = 0; i < old_num; ++i) { 2408 other_threads[i]->th.th_team_nproc = new_num; 2409 } 2410 // Adjust states of non-used threads of the team 2411 for (int i = old_num; i < new_num; ++i) { 2412 // Re-initialize thread's barrier data. 
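// The threads being brought back may have sat out one or more barrier
// epochs while the team ran with fewer threads, so their b_arrived
// counters are resynchronized with the team's counters before they rejoin;
// otherwise they could wait on (or release) a stale barrier generation.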
2413 KMP_DEBUG_ASSERT(other_threads[i]); 2414 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2415 for (int b = 0; b < bs_last_barrier; ++b) { 2416 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2417 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2418 #if USE_DEBUGGER 2419 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2420 #endif 2421 } 2422 if (__kmp_tasking_mode != tskm_immediate_exec) { 2423 // Synchronize thread's task state 2424 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2425 } 2426 } 2427 } 2428 2429 #if OMPT_SUPPORT 2430 if (ompt_enabled.enabled) { 2431 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2432 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2433 } 2434 #endif 2435 2436 return; 2437 } 2438 2439 /* do cleanup and restore the parent team */ 2440 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2441 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2442 2443 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2444 2445 /* jc: The following lock has instructions with REL and ACQ semantics, 2446 separating the parallel user code called in this parallel region 2447 from the serial user code called after this function returns. */ 2448 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2449 2450 if (!master_th->th.th_teams_microtask || 2451 team->t.t_level > master_th->th.th_teams_level) { 2452 /* Decrement our nested depth level */ 2453 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2454 } 2455 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2456 2457 #if OMPT_SUPPORT 2458 if (ompt_enabled.enabled) { 2459 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2460 if (ompt_enabled.ompt_callback_implicit_task) { 2461 int flags = (team_microtask == (void *)__kmp_teams_master) 2462 ? ompt_task_initial 2463 : ompt_task_implicit; 2464 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2465 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2466 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2467 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2468 } 2469 task_info->frame.exit_frame = ompt_data_none; 2470 task_info->task_data = ompt_data_none; 2471 } 2472 #endif 2473 2474 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2475 master_th, team)); 2476 __kmp_pop_current_task_from_thread(master_th); 2477 2478 #if KMP_AFFINITY_SUPPORTED 2479 // Restore master thread's partition. 2480 master_th->th.th_first_place = team->t.t_first_place; 2481 master_th->th.th_last_place = team->t.t_last_place; 2482 #endif // KMP_AFFINITY_SUPPORTED 2483 master_th->th.th_def_allocator = team->t.t_def_allocator; 2484 2485 updateHWFPControl(team); 2486 2487 if (root->r.r_active != master_active) 2488 root->r.r_active = master_active; 2489 2490 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2491 master_th)); // this will free worker threads 2492 2493 /* this race was fun to find. make sure the following is in the critical 2494 region otherwise assertions may fail occasionally since the old team may be 2495 reallocated and the hierarchy appears inconsistent. it is actually safe to 2496 run and won't cause any bugs, but will cause those assertion failures. 
it's 2497 only one deref&assign so might as well put this in the critical region */ 2498 master_th->th.th_team = parent_team; 2499 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2500 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2501 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2502 2503 /* restore serialized team, if need be */ 2504 if (parent_team->t.t_serialized && 2505 parent_team != master_th->th.th_serial_team && 2506 parent_team != root->r.r_root_team) { 2507 __kmp_free_team(root, 2508 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2509 master_th->th.th_serial_team = parent_team; 2510 } 2511 2512 if (__kmp_tasking_mode != tskm_immediate_exec) { 2513 if (master_th->th.th_task_state_top > 2514 0) { // Restore task state from memo stack 2515 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2516 // Remember master's state if we re-use this nested hot team 2517 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2518 master_th->th.th_task_state; 2519 --master_th->th.th_task_state_top; // pop 2520 // Now restore state at this level 2521 master_th->th.th_task_state = 2522 master_th->th 2523 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2524 } 2525 // Copy the task team from the parent team to the master thread 2526 master_th->th.th_task_team = 2527 parent_team->t.t_task_team[master_th->th.th_task_state]; 2528 KA_TRACE(20, 2529 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2530 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2531 parent_team)); 2532 } 2533 2534 // TODO: GEH - cannot do this assertion because root thread not set up as 2535 // executing 2536 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2537 master_th->th.th_current_task->td_flags.executing = 1; 2538 2539 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2540 2541 #if OMPT_SUPPORT 2542 int flags = 2543 OMPT_INVOKER(fork_context) | 2544 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2545 : ompt_parallel_team); 2546 if (ompt_enabled.enabled) { 2547 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2548 codeptr); 2549 } 2550 #endif 2551 2552 KMP_MB(); 2553 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2554 } 2555 2556 /* Check whether we should push an internal control record onto the 2557 serial team stack. If so, do it. 
*/ 2558 void __kmp_save_internal_controls(kmp_info_t *thread) { 2559 2560 if (thread->th.th_team != thread->th.th_serial_team) { 2561 return; 2562 } 2563 if (thread->th.th_team->t.t_serialized > 1) { 2564 int push = 0; 2565 2566 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2567 push = 1; 2568 } else { 2569 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2570 thread->th.th_team->t.t_serialized) { 2571 push = 1; 2572 } 2573 } 2574 if (push) { /* push a record on the serial team's stack */ 2575 kmp_internal_control_t *control = 2576 (kmp_internal_control_t *)__kmp_allocate( 2577 sizeof(kmp_internal_control_t)); 2578 2579 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2580 2581 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2582 2583 control->next = thread->th.th_team->t.t_control_stack_top; 2584 thread->th.th_team->t.t_control_stack_top = control; 2585 } 2586 } 2587 } 2588 2589 /* Changes set_nproc */ 2590 void __kmp_set_num_threads(int new_nth, int gtid) { 2591 kmp_info_t *thread; 2592 kmp_root_t *root; 2593 2594 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2595 KMP_DEBUG_ASSERT(__kmp_init_serial); 2596 2597 if (new_nth < 1) 2598 new_nth = 1; 2599 else if (new_nth > __kmp_max_nth) 2600 new_nth = __kmp_max_nth; 2601 2602 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2603 thread = __kmp_threads[gtid]; 2604 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2605 return; // nothing to do 2606 2607 __kmp_save_internal_controls(thread); 2608 2609 set__nproc(thread, new_nth); 2610 2611 // If this omp_set_num_threads() call will cause the hot team size to be 2612 // reduced (in the absence of a num_threads clause), then reduce it now, 2613 // rather than waiting for the next parallel region. 2614 root = thread->th.th_root; 2615 if (__kmp_init_parallel && (!root->r.r_active) && 2616 (root->r.r_hot_team->t.t_nproc > new_nth) 2617 #if KMP_NESTED_HOT_TEAMS 2618 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2619 #endif 2620 ) { 2621 kmp_team_t *hot_team = root->r.r_hot_team; 2622 int f; 2623 2624 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2625 2626 // Release the extra threads we don't need any more. 2627 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2628 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2629 if (__kmp_tasking_mode != tskm_immediate_exec) { 2630 // When decreasing team size, threads no longer in the team should unref 2631 // task team. 2632 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2633 } 2634 __kmp_free_thread(hot_team->t.t_threads[f]); 2635 hot_team->t.t_threads[f] = NULL; 2636 } 2637 hot_team->t.t_nproc = new_nth; 2638 #if KMP_NESTED_HOT_TEAMS 2639 if (thread->th.th_hot_teams) { 2640 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2641 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2642 } 2643 #endif 2644 2645 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2646 2647 // Update the t_nproc field in the threads that are still active. 
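// (Illustrative: if omp_set_num_threads(2) shrinks a hot team that last ran
// with 8 threads, the 6 released threads were returned to the pool above;
// the loop below only refreshes th_team_nproc in the 2 remaining threads so
// their cached team size matches hot_team->t.t_nproc.)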
2648 for (f = 0; f < new_nth; f++) { 2649 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2650 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2651 } 2652 // Special flag marking that the team size was changed by an omp_set_num_threads() call 2653 hot_team->t.t_size_changed = -1; 2654 } 2655 } 2656 2657 /* Changes max_active_levels */ 2658 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2659 kmp_info_t *thread; 2660 2661 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2662 "%d = (%d)\n", 2663 gtid, max_active_levels)); 2664 KMP_DEBUG_ASSERT(__kmp_init_serial); 2665 2666 // validate max_active_levels 2667 if (max_active_levels < 0) { 2668 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2669 // We ignore this call if the user has specified a negative value. 2670 // The current setting won't be changed. The last valid setting will be 2671 // used. A warning will be issued (if warnings are allowed as controlled by 2672 // the KMP_WARNINGS env var). 2673 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2674 "max_active_levels for thread %d = (%d)\n", 2675 gtid, max_active_levels)); 2676 return; 2677 } 2678 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2679 // it's OK, the max_active_levels is within the valid range: [ 0; 2680 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2681 // We allow a zero value. (implementation defined behavior) 2682 } else { 2683 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2684 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2685 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2686 // Current upper limit is MAX_INT. (implementation defined behavior) 2687 // If the input exceeds the upper limit, we correct the input to be the 2688 // upper limit. (implementation defined behavior) 2689 // In practice, this branch is unreachable as long as the limit is MAX_INT.
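// Summary of the validation above: negative values are ignored with a
// warning (e.g. __kmp_set_max_active_levels(gtid, -1) leaves the ICV
// unchanged), values in [0, KMP_MAX_ACTIVE_LEVELS_LIMIT] are accepted
// as-is, and larger values are clamped to the limit with a warning.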
2690 } 2691 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2692 "max_active_levels for thread %d = (%d)\n", 2693 gtid, max_active_levels)); 2694 2695 thread = __kmp_threads[gtid]; 2696 2697 __kmp_save_internal_controls(thread); 2698 2699 set__max_active_levels(thread, max_active_levels); 2700 } 2701 2702 /* Gets max_active_levels */ 2703 int __kmp_get_max_active_levels(int gtid) { 2704 kmp_info_t *thread; 2705 2706 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2707 KMP_DEBUG_ASSERT(__kmp_init_serial); 2708 2709 thread = __kmp_threads[gtid]; 2710 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2711 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2712 "curtask_maxaclevel=%d\n", 2713 gtid, thread->th.th_current_task, 2714 thread->th.th_current_task->td_icvs.max_active_levels)); 2715 return thread->th.th_current_task->td_icvs.max_active_levels; 2716 } 2717 2718 // nteams-var per-device ICV 2719 void __kmp_set_num_teams(int num_teams) { 2720 if (num_teams > 0) 2721 __kmp_nteams = num_teams; 2722 } 2723 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2724 // teams-thread-limit-var per-device ICV 2725 void __kmp_set_teams_thread_limit(int limit) { 2726 if (limit > 0) 2727 __kmp_teams_thread_limit = limit; 2728 } 2729 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2730 2731 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2732 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2733 2734 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2735 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2736 kmp_info_t *thread; 2737 kmp_sched_t orig_kind; 2738 // kmp_team_t *team; 2739 2740 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2741 gtid, (int)kind, chunk)); 2742 KMP_DEBUG_ASSERT(__kmp_init_serial); 2743 2744 // Check if the kind parameter is valid, correct if needed. 2745 // Valid parameters should fit in one of two intervals - standard or extended: 2746 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2747 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2748 orig_kind = kind; 2749 kind = __kmp_sched_without_mods(kind); 2750 2751 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2752 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2753 // TODO: Hint needs attention in case we change the default schedule. 2754 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2755 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2756 __kmp_msg_null); 2757 kind = kmp_sched_default; 2758 chunk = 0; // ignore chunk value in case of bad kind 2759 } 2760 2761 thread = __kmp_threads[gtid]; 2762 2763 __kmp_save_internal_controls(thread); 2764 2765 if (kind < kmp_sched_upper_std) { 2766 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2767 // differ static chunked vs. 
unchunked: chunk should be invalid to 2768 // indicate unchunked schedule (which is the default) 2769 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2770 } else { 2771 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2772 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2773 } 2774 } else { 2775 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2776 // kmp_sched_lower - 2 ]; 2777 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2778 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2779 kmp_sched_lower - 2]; 2780 } 2781 __kmp_sched_apply_mods_intkind( 2782 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2783 if (kind == kmp_sched_auto || chunk < 1) { 2784 // ignore parameter chunk for schedule auto 2785 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2786 } else { 2787 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2788 } 2789 } 2790 2791 /* Gets def_sched_var ICV values */ 2792 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2793 kmp_info_t *thread; 2794 enum sched_type th_type; 2795 2796 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2797 KMP_DEBUG_ASSERT(__kmp_init_serial); 2798 2799 thread = __kmp_threads[gtid]; 2800 2801 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2802 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2803 case kmp_sch_static: 2804 case kmp_sch_static_greedy: 2805 case kmp_sch_static_balanced: 2806 *kind = kmp_sched_static; 2807 __kmp_sched_apply_mods_stdkind(kind, th_type); 2808 *chunk = 0; // chunk was not set, try to show this fact via zero value 2809 return; 2810 case kmp_sch_static_chunked: 2811 *kind = kmp_sched_static; 2812 break; 2813 case kmp_sch_dynamic_chunked: 2814 *kind = kmp_sched_dynamic; 2815 break; 2816 case kmp_sch_guided_chunked: 2817 case kmp_sch_guided_iterative_chunked: 2818 case kmp_sch_guided_analytical_chunked: 2819 *kind = kmp_sched_guided; 2820 break; 2821 case kmp_sch_auto: 2822 *kind = kmp_sched_auto; 2823 break; 2824 case kmp_sch_trapezoidal: 2825 *kind = kmp_sched_trapezoidal; 2826 break; 2827 #if KMP_STATIC_STEAL_ENABLED 2828 case kmp_sch_static_steal: 2829 *kind = kmp_sched_static_steal; 2830 break; 2831 #endif 2832 default: 2833 KMP_FATAL(UnknownSchedulingType, th_type); 2834 } 2835 2836 __kmp_sched_apply_mods_stdkind(kind, th_type); 2837 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2838 } 2839 2840 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2841 2842 int ii, dd; 2843 kmp_team_t *team; 2844 kmp_info_t *thr; 2845 2846 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2847 KMP_DEBUG_ASSERT(__kmp_init_serial); 2848 2849 // validate level 2850 if (level == 0) 2851 return 0; 2852 if (level < 0) 2853 return -1; 2854 thr = __kmp_threads[gtid]; 2855 team = thr->th.th_team; 2856 ii = team->t.t_level; 2857 if (level > ii) 2858 return -1; 2859 2860 if (thr->th.th_teams_microtask) { 2861 // AC: we are in teams region where multiple nested teams have same level 2862 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2863 if (level <= 2864 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2865 KMP_DEBUG_ASSERT(ii >= tlevel); 2866 // AC: As we need to pass by the teams league, we need to artificially 2867 // increase ii 2868 if (ii == tlevel) { 2869 ii += 2; // three teams have same level 2870 } else { 2871 ii++; // two teams have same level 2872 } 2873 } 2874 } 2875 2876 if (ii == 
level) 2877 return __kmp_tid_from_gtid(gtid); 2878 2879 dd = team->t.t_serialized; 2880 level++; 2881 while (ii > level) { 2882 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2883 } 2884 if ((team->t.t_serialized) && (!dd)) { 2885 team = team->t.t_parent; 2886 continue; 2887 } 2888 if (ii > level) { 2889 team = team->t.t_parent; 2890 dd = team->t.t_serialized; 2891 ii--; 2892 } 2893 } 2894 2895 return (dd > 1) ? (0) : (team->t.t_master_tid); 2896 } 2897 2898 int __kmp_get_team_size(int gtid, int level) { 2899 2900 int ii, dd; 2901 kmp_team_t *team; 2902 kmp_info_t *thr; 2903 2904 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2905 KMP_DEBUG_ASSERT(__kmp_init_serial); 2906 2907 // validate level 2908 if (level == 0) 2909 return 1; 2910 if (level < 0) 2911 return -1; 2912 thr = __kmp_threads[gtid]; 2913 team = thr->th.th_team; 2914 ii = team->t.t_level; 2915 if (level > ii) 2916 return -1; 2917 2918 if (thr->th.th_teams_microtask) { 2919 // AC: we are in teams region where multiple nested teams have same level 2920 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2921 if (level <= 2922 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2923 KMP_DEBUG_ASSERT(ii >= tlevel); 2924 // AC: As we need to pass by the teams league, we need to artificially 2925 // increase ii 2926 if (ii == tlevel) { 2927 ii += 2; // three teams have same level 2928 } else { 2929 ii++; // two teams have same level 2930 } 2931 } 2932 } 2933 2934 while (ii > level) { 2935 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2936 } 2937 if (team->t.t_serialized && (!dd)) { 2938 team = team->t.t_parent; 2939 continue; 2940 } 2941 if (ii > level) { 2942 team = team->t.t_parent; 2943 ii--; 2944 } 2945 } 2946 2947 return team->t.t_nproc; 2948 } 2949 2950 kmp_r_sched_t __kmp_get_schedule_global() { 2951 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2952 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2953 // independently. So one can get the updated schedule here. 2954 2955 kmp_r_sched_t r_sched; 2956 2957 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2958 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2959 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2960 // different roots (even in OMP 2.5) 2961 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2962 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2963 if (s == kmp_sch_static) { 2964 // replace STATIC with more detailed schedule (balanced or greedy) 2965 r_sched.r_sched_type = __kmp_static; 2966 } else if (s == kmp_sch_guided_chunked) { 2967 // replace GUIDED with more detailed schedule (iterative or analytical) 2968 r_sched.r_sched_type = __kmp_guided; 2969 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2970 r_sched.r_sched_type = __kmp_sched; 2971 } 2972 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2973 2974 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2975 // __kmp_chunk may be wrong here (if it was not ever set) 2976 r_sched.chunk = KMP_DEFAULT_CHUNK; 2977 } else { 2978 r_sched.chunk = __kmp_chunk; 2979 } 2980 2981 return r_sched; 2982 } 2983 2984 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2985 at least argc number of *t_argv entries for the requested team. 
*/ 2986 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2987 2988 KMP_DEBUG_ASSERT(team); 2989 if (!realloc || argc > team->t.t_max_argc) { 2990 2991 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 2992 "current entries=%d\n", 2993 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 2994 /* if previously allocated heap space for args, free them */ 2995 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 2996 __kmp_free((void *)team->t.t_argv); 2997 2998 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 2999 /* use unused space in the cache line for arguments */ 3000 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3001 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3002 "argv entries\n", 3003 team->t.t_id, team->t.t_max_argc)); 3004 team->t.t_argv = &team->t.t_inline_argv[0]; 3005 if (__kmp_storage_map) { 3006 __kmp_print_storage_map_gtid( 3007 -1, &team->t.t_inline_argv[0], 3008 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3009 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3010 team->t.t_id); 3011 } 3012 } else { 3013 /* allocate space for arguments in the heap */ 3014 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3015 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3016 : 2 * argc; 3017 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3018 "argv entries\n", 3019 team->t.t_id, team->t.t_max_argc)); 3020 team->t.t_argv = 3021 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3022 if (__kmp_storage_map) { 3023 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3024 &team->t.t_argv[team->t.t_max_argc], 3025 sizeof(void *) * team->t.t_max_argc, 3026 "team_%d.t_argv", team->t.t_id); 3027 } 3028 } 3029 } 3030 } 3031 3032 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3033 int i; 3034 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3035 team->t.t_threads = 3036 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3037 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3038 sizeof(dispatch_shared_info_t) * num_disp_buff); 3039 team->t.t_dispatch = 3040 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3041 team->t.t_implicit_task_taskdata = 3042 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3043 team->t.t_max_nproc = max_nth; 3044 3045 /* setup dispatch buffers */ 3046 for (i = 0; i < num_disp_buff; ++i) { 3047 team->t.t_disp_buffer[i].buffer_index = i; 3048 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3049 } 3050 } 3051 3052 static void __kmp_free_team_arrays(kmp_team_t *team) { 3053 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3054 int i; 3055 for (i = 0; i < team->t.t_max_nproc; ++i) { 3056 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3057 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3058 team->t.t_dispatch[i].th_disp_buffer = NULL; 3059 } 3060 } 3061 #if KMP_USE_HIER_SCHED 3062 __kmp_dispatch_free_hierarchies(team); 3063 #endif 3064 __kmp_free(team->t.t_threads); 3065 __kmp_free(team->t.t_disp_buffer); 3066 __kmp_free(team->t.t_dispatch); 3067 __kmp_free(team->t.t_implicit_task_taskdata); 3068 team->t.t_threads = NULL; 3069 team->t.t_disp_buffer = NULL; 3070 team->t.t_dispatch = NULL; 3071 team->t.t_implicit_task_taskdata = 0; 3072 } 3073 3074 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3075 kmp_info_t **oldThreads = team->t.t_threads; 3076 3077 __kmp_free(team->t.t_disp_buffer); 3078 __kmp_free(team->t.t_dispatch); 3079 __kmp_free(team->t.t_implicit_task_taskdata); 3080 __kmp_allocate_team_arrays(team, max_nth); 3081 3082 KMP_MEMCPY(team->t.t_threads, oldThreads, 3083 team->t.t_nproc * sizeof(kmp_info_t *)); 3084 3085 __kmp_free(oldThreads); 3086 } 3087 3088 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3089 3090 kmp_r_sched_t r_sched = 3091 __kmp_get_schedule_global(); // get current state of scheduling globals 3092 3093 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3094 3095 kmp_internal_control_t g_icvs = { 3096 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3097 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3098 // adjustment of threads (per thread) 3099 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3100 // whether blocktime is explicitly set 3101 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3102 #if KMP_USE_MONITOR 3103 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3104 // intervals 3105 #endif 3106 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3107 // next parallel region (per thread) 3108 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3109 __kmp_cg_max_nth, // int thread_limit; 3110 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3111 // for max_active_levels 3112 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3113 // {sched,chunk} pair 3114 __kmp_nested_proc_bind.bind_types[0], 3115 __kmp_default_device, 3116 NULL // struct kmp_internal_control *next; 3117 }; 3118 3119 return g_icvs; 3120 } 3121 3122 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3123 3124 kmp_internal_control_t gx_icvs; 3125 gx_icvs.serial_nesting_level = 3126 0; // probably =team->t.t_serial 
like in save_inter_controls 3127 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3128 gx_icvs.next = NULL; 3129 3130 return gx_icvs; 3131 } 3132 3133 static void __kmp_initialize_root(kmp_root_t *root) { 3134 int f; 3135 kmp_team_t *root_team; 3136 kmp_team_t *hot_team; 3137 int hot_team_max_nth; 3138 kmp_r_sched_t r_sched = 3139 __kmp_get_schedule_global(); // get current state of scheduling globals 3140 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3141 KMP_DEBUG_ASSERT(root); 3142 KMP_ASSERT(!root->r.r_begin); 3143 3144 /* setup the root state structure */ 3145 __kmp_init_lock(&root->r.r_begin_lock); 3146 root->r.r_begin = FALSE; 3147 root->r.r_active = FALSE; 3148 root->r.r_in_parallel = 0; 3149 root->r.r_blocktime = __kmp_dflt_blocktime; 3150 3151 /* setup the root team for this task */ 3152 /* allocate the root team structure */ 3153 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3154 3155 root_team = 3156 __kmp_allocate_team(root, 3157 1, // new_nproc 3158 1, // max_nproc 3159 #if OMPT_SUPPORT 3160 ompt_data_none, // root parallel id 3161 #endif 3162 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3163 0 // argc 3164 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3165 ); 3166 #if USE_DEBUGGER 3167 // Non-NULL value should be assigned to make the debugger display the root 3168 // team. 3169 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3170 #endif 3171 3172 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3173 3174 root->r.r_root_team = root_team; 3175 root_team->t.t_control_stack_top = NULL; 3176 3177 /* initialize root team */ 3178 root_team->t.t_threads[0] = NULL; 3179 root_team->t.t_nproc = 1; 3180 root_team->t.t_serialized = 1; 3181 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3182 root_team->t.t_sched.sched = r_sched.sched; 3183 KA_TRACE( 3184 20, 3185 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3186 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3187 3188 /* setup the hot team for this task */ 3189 /* allocate the hot team structure */ 3190 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3191 3192 hot_team = 3193 __kmp_allocate_team(root, 3194 1, // new_nproc 3195 __kmp_dflt_team_nth_ub * 2, // max_nproc 3196 #if OMPT_SUPPORT 3197 ompt_data_none, // root parallel id 3198 #endif 3199 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3200 0 // argc 3201 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3202 ); 3203 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3204 3205 root->r.r_hot_team = hot_team; 3206 root_team->t.t_control_stack_top = NULL; 3207 3208 /* first-time initialization */ 3209 hot_team->t.t_parent = root_team; 3210 3211 /* initialize hot team */ 3212 hot_team_max_nth = hot_team->t.t_max_nproc; 3213 for (f = 0; f < hot_team_max_nth; ++f) { 3214 hot_team->t.t_threads[f] = NULL; 3215 } 3216 hot_team->t.t_nproc = 1; 3217 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3218 hot_team->t.t_sched.sched = r_sched.sched; 3219 hot_team->t.t_size_changed = 0; 3220 } 3221 3222 #ifdef KMP_DEBUG 3223 3224 typedef struct kmp_team_list_item { 3225 kmp_team_p const *entry; 3226 struct kmp_team_list_item *next; 3227 } kmp_team_list_item_t; 3228 typedef kmp_team_list_item_t *kmp_team_list_t; 3229 3230 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3231 kmp_team_list_t list, // List of teams. 
3232 kmp_team_p const *team // Team to add. 3233 ) { 3234 3235 // List must terminate with item where both entry and next are NULL. 3236 // Team is added to the list only once. 3237 // List is sorted in ascending order by team id. 3238 // Team id is *not* a key. 3239 3240 kmp_team_list_t l; 3241 3242 KMP_DEBUG_ASSERT(list != NULL); 3243 if (team == NULL) { 3244 return; 3245 } 3246 3247 __kmp_print_structure_team_accum(list, team->t.t_parent); 3248 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3249 3250 // Search list for the team. 3251 l = list; 3252 while (l->next != NULL && l->entry != team) { 3253 l = l->next; 3254 } 3255 if (l->next != NULL) { 3256 return; // Team has been added before, exit. 3257 } 3258 3259 // Team is not found. Search list again for insertion point. 3260 l = list; 3261 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3262 l = l->next; 3263 } 3264 3265 // Insert team. 3266 { 3267 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3268 sizeof(kmp_team_list_item_t)); 3269 *item = *l; 3270 l->entry = team; 3271 l->next = item; 3272 } 3273 } 3274 3275 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3276 3277 ) { 3278 __kmp_printf("%s", title); 3279 if (team != NULL) { 3280 __kmp_printf("%2x %p\n", team->t.t_id, team); 3281 } else { 3282 __kmp_printf(" - (nil)\n"); 3283 } 3284 } 3285 3286 static void __kmp_print_structure_thread(char const *title, 3287 kmp_info_p const *thread) { 3288 __kmp_printf("%s", title); 3289 if (thread != NULL) { 3290 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3291 } else { 3292 __kmp_printf(" - (nil)\n"); 3293 } 3294 } 3295 3296 void __kmp_print_structure(void) { 3297 3298 kmp_team_list_t list; 3299 3300 // Initialize list of teams. 3301 list = 3302 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3303 list->entry = NULL; 3304 list->next = NULL; 3305 3306 __kmp_printf("\n------------------------------\nGlobal Thread " 3307 "Table\n------------------------------\n"); 3308 { 3309 int gtid; 3310 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3311 __kmp_printf("%2d", gtid); 3312 if (__kmp_threads != NULL) { 3313 __kmp_printf(" %p", __kmp_threads[gtid]); 3314 } 3315 if (__kmp_root != NULL) { 3316 __kmp_printf(" %p", __kmp_root[gtid]); 3317 } 3318 __kmp_printf("\n"); 3319 } 3320 } 3321 3322 // Print out __kmp_threads array. 
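  // While walking the threads array below, each live thread's team and serial
  // team are also accumulated into the sorted team list, so the "Teams"
  // section printed later includes them.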
3323 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3324 "----------\n"); 3325 if (__kmp_threads != NULL) { 3326 int gtid; 3327 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3328 kmp_info_t const *thread = __kmp_threads[gtid]; 3329 if (thread != NULL) { 3330 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3331 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3332 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3333 __kmp_print_structure_team(" Serial Team: ", 3334 thread->th.th_serial_team); 3335 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3336 __kmp_print_structure_thread(" Master: ", 3337 thread->th.th_team_master); 3338 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3339 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3340 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3341 __kmp_print_structure_thread(" Next in pool: ", 3342 thread->th.th_next_pool); 3343 __kmp_printf("\n"); 3344 __kmp_print_structure_team_accum(list, thread->th.th_team); 3345 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3346 } 3347 } 3348 } else { 3349 __kmp_printf("Threads array is not allocated.\n"); 3350 } 3351 3352 // Print out __kmp_root array. 3353 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3354 "--------\n"); 3355 if (__kmp_root != NULL) { 3356 int gtid; 3357 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3358 kmp_root_t const *root = __kmp_root[gtid]; 3359 if (root != NULL) { 3360 __kmp_printf("GTID %2d %p:\n", gtid, root); 3361 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3362 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3363 __kmp_print_structure_thread(" Uber Thread: ", 3364 root->r.r_uber_thread); 3365 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3366 __kmp_printf(" In Parallel: %2d\n", 3367 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3368 __kmp_printf("\n"); 3369 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3370 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3371 } 3372 } 3373 } else { 3374 __kmp_printf("Ubers array is not allocated.\n"); 3375 } 3376 3377 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3378 "--------\n"); 3379 while (list->next != NULL) { 3380 kmp_team_p const *team = list->entry; 3381 int i; 3382 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3383 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3384 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3385 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3386 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3387 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3388 for (i = 0; i < team->t.t_nproc; ++i) { 3389 __kmp_printf(" Thread %2d: ", i); 3390 __kmp_print_structure_thread("", team->t.t_threads[i]); 3391 } 3392 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3393 __kmp_printf("\n"); 3394 list = list->next; 3395 } 3396 3397 // Print out __kmp_thread_pool and __kmp_team_pool. 3398 __kmp_printf("\n------------------------------\nPools\n----------------------" 3399 "--------\n"); 3400 __kmp_print_structure_thread("Thread pool: ", 3401 CCAST(kmp_info_t *, __kmp_thread_pool)); 3402 __kmp_print_structure_team("Team pool: ", 3403 CCAST(kmp_team_t *, __kmp_team_pool)); 3404 __kmp_printf("\n"); 3405 3406 // Free team list. 
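  // The loop below releases every item, including the terminating sentinel
  // whose entry and next are both NULL.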
3407 while (list != NULL) { 3408 kmp_team_list_item_t *item = list; 3409 list = list->next; 3410 KMP_INTERNAL_FREE(item); 3411 } 3412 } 3413 3414 #endif 3415 3416 //--------------------------------------------------------------------------- 3417 // Stuff for per-thread fast random number generator 3418 // Table of primes 3419 static const unsigned __kmp_primes[] = { 3420 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3421 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3422 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3423 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3424 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3425 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3426 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3427 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3428 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3429 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3430 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3431 3432 //--------------------------------------------------------------------------- 3433 // __kmp_get_random: Get a random number using a linear congruential method. 3434 unsigned short __kmp_get_random(kmp_info_t *thread) { 3435 unsigned x = thread->th.th_x; 3436 unsigned short r = (unsigned short)(x >> 16); 3437 3438 thread->th.th_x = x * thread->th.th_a + 1; 3439 3440 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3441 thread->th.th_info.ds.ds_tid, r)); 3442 3443 return r; 3444 } 3445 //-------------------------------------------------------- 3446 // __kmp_init_random: Initialize a random number generator 3447 void __kmp_init_random(kmp_info_t *thread) { 3448 unsigned seed = thread->th.th_info.ds.ds_tid; 3449 3450 thread->th.th_a = 3451 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3452 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3453 KA_TRACE(30, 3454 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3455 } 3456 3457 #if KMP_OS_WINDOWS 3458 /* reclaim array entries for root threads that are already dead, returns number 3459 * reclaimed */ 3460 static int __kmp_reclaim_dead_roots(void) { 3461 int i, r = 0; 3462 3463 for (i = 0; i < __kmp_threads_capacity; ++i) { 3464 if (KMP_UBER_GTID(i) && 3465 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3466 !__kmp_root[i] 3467 ->r.r_active) { // AC: reclaim only roots died in non-active state 3468 r += __kmp_unregister_root_other_thread(i); 3469 } 3470 } 3471 return r; 3472 } 3473 #endif 3474 3475 /* This function attempts to create free entries in __kmp_threads and 3476 __kmp_root, and returns the number of free entries generated. 3477 3478 For Windows* OS static library, the first mechanism used is to reclaim array 3479 entries for root threads that are already dead. 3480 3481 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3482 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3483 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3484 threadprivate cache array has been created. Synchronization with 3485 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
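   As an illustration only (not part of the runtime), the growth loop below is
   equivalent to the following sketch, where sysMax stands for
   __kmp_sys_max_nth and capacity for __kmp_threads_capacity:

     int newCap = capacity;
     do {
       newCap = (newCap <= sysMax / 2) ? newCap * 2 : sysMax;
     } while (newCap < capacity + nNeed);

   i.e. the capacity doubles until it covers the request and is clamped to the
   system-wide maximum once doubling would overshoot it; the preceding headroom
   check (sysMax - capacity >= nNeed) guarantees the loop terminates.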
3486 3487 After any dead root reclamation, if the clipping value allows array expansion 3488 to result in the generation of a total of nNeed free slots, the function does 3489 that expansion. If not, nothing is done beyond the possible initial root 3490 thread reclamation. 3491 3492 If any argument is negative, the behavior is undefined. */ 3493 static int __kmp_expand_threads(int nNeed) { 3494 int added = 0; 3495 int minimumRequiredCapacity; 3496 int newCapacity; 3497 kmp_info_t **newThreads; 3498 kmp_root_t **newRoot; 3499 3500 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3501 // resizing __kmp_threads does not need additional protection if foreign 3502 // threads are present 3503 3504 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3505 /* only for Windows static library */ 3506 /* reclaim array entries for root threads that are already dead */ 3507 added = __kmp_reclaim_dead_roots(); 3508 3509 if (nNeed) { 3510 nNeed -= added; 3511 if (nNeed < 0) 3512 nNeed = 0; 3513 } 3514 #endif 3515 if (nNeed <= 0) 3516 return added; 3517 3518 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3519 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3520 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3521 // > __kmp_max_nth in one of two ways: 3522 // 3523 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3524 // may not be reused by another thread, so we may need to increase 3525 // __kmp_threads_capacity to __kmp_max_nth + 1. 3526 // 3527 // 2) New foreign root(s) are encountered. We always register new foreign 3528 // roots. This may cause a smaller # of threads to be allocated at 3529 // subsequent parallel regions, but the worker threads hang around (and 3530 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3531 // 3532 // Anyway, that is the reason for moving the check to see if 3533 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3534 // instead of having it performed here. -BB 3535 3536 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3537 3538 /* compute expansion headroom to check if we can expand */ 3539 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3540 /* possible expansion too small -- give up */ 3541 return added; 3542 } 3543 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3544 3545 newCapacity = __kmp_threads_capacity; 3546 do { 3547 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3548 : __kmp_sys_max_nth; 3549 } while (newCapacity < minimumRequiredCapacity); 3550 newThreads = (kmp_info_t **)__kmp_allocate( 3551 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3552 newRoot = 3553 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3554 KMP_MEMCPY(newThreads, __kmp_threads, 3555 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3556 KMP_MEMCPY(newRoot, __kmp_root, 3557 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3558 3559 kmp_info_t **temp_threads = __kmp_threads; 3560 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3561 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3562 __kmp_free(temp_threads); 3563 added += newCapacity - __kmp_threads_capacity; 3564 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3565 3566 if (newCapacity > __kmp_tp_capacity) { 3567 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3568 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3569 __kmp_threadprivate_resize_cache(newCapacity); 3570 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3571 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3572 } 3573 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3574 } 3575 3576 return added; 3577 } 3578 3579 /* Register the current thread as a root thread and obtain our gtid. We must 3580 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3581 thread that calls from __kmp_do_serial_initialize() */ 3582 int __kmp_register_root(int initial_thread) { 3583 kmp_info_t *root_thread; 3584 kmp_root_t *root; 3585 int gtid; 3586 int capacity; 3587 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3588 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3589 KMP_MB(); 3590 3591 /* 2007-03-02: 3592 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3593 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3594 work as expected -- it may return false (that means there is at least one 3595 empty slot in __kmp_threads array), but it is possible the only free slot 3596 is #0, which is reserved for initial thread and so cannot be used for this 3597 one. Following code workarounds this bug. 3598 3599 However, right solution seems to be not reserving slot #0 for initial 3600 thread because: 3601 (1) there is no magic in slot #0, 3602 (2) we cannot detect initial thread reliably (the first thread which does 3603 serial initialization may be not a real initial thread). 3604 */ 3605 capacity = __kmp_threads_capacity; 3606 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3607 --capacity; 3608 } 3609 3610 /* see if there are too many threads */ 3611 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3612 if (__kmp_tp_cached) { 3613 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3614 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3615 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3616 } else { 3617 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3618 __kmp_msg_null); 3619 } 3620 } 3621 3622 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3623 // 0: initial thread, also a regular OpenMP thread. 3624 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3625 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3626 // regular OpenMP threads. 3627 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3628 // Find an available thread slot for hidden helper thread. 
Slots for hidden 3629 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3630 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3631 gtid <= __kmp_hidden_helper_threads_num; 3632 gtid++) 3633 ; 3634 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3635 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3636 "hidden helper thread: T#%d\n", 3637 gtid)); 3638 } else { 3639 /* find an available thread slot */ 3640 // Don't reassign the zero slot since we need that to only be used by 3641 // initial thread. Slots for hidden helper threads should also be skipped. 3642 if (initial_thread && __kmp_threads[0] == NULL) { 3643 gtid = 0; 3644 } else { 3645 for (gtid = __kmp_hidden_helper_threads_num + 1; 3646 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3647 ; 3648 } 3649 KA_TRACE( 3650 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3651 KMP_ASSERT(gtid < __kmp_threads_capacity); 3652 } 3653 3654 /* update global accounting */ 3655 __kmp_all_nth++; 3656 TCW_4(__kmp_nth, __kmp_nth + 1); 3657 3658 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3659 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3660 if (__kmp_adjust_gtid_mode) { 3661 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3662 if (TCR_4(__kmp_gtid_mode) != 2) { 3663 TCW_4(__kmp_gtid_mode, 2); 3664 } 3665 } else { 3666 if (TCR_4(__kmp_gtid_mode) != 1) { 3667 TCW_4(__kmp_gtid_mode, 1); 3668 } 3669 } 3670 } 3671 3672 #ifdef KMP_ADJUST_BLOCKTIME 3673 /* Adjust blocktime to zero if necessary */ 3674 /* Middle initialization might not have occurred yet */ 3675 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3676 if (__kmp_nth > __kmp_avail_proc) { 3677 __kmp_zero_bt = TRUE; 3678 } 3679 } 3680 #endif /* KMP_ADJUST_BLOCKTIME */ 3681 3682 /* setup this new hierarchy */ 3683 if (!(root = __kmp_root[gtid])) { 3684 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3685 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3686 } 3687 3688 #if KMP_STATS_ENABLED 3689 // Initialize stats as soon as possible (right after gtid assignment). 
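  // (The stats object's lifetime starts here so that the serial region the new
  //  root is about to execute is attributed to it.)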
3690 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3691 __kmp_stats_thread_ptr->startLife(); 3692 KMP_SET_THREAD_STATE(SERIAL_REGION); 3693 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3694 #endif 3695 __kmp_initialize_root(root); 3696 3697 /* setup new root thread structure */ 3698 if (root->r.r_uber_thread) { 3699 root_thread = root->r.r_uber_thread; 3700 } else { 3701 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3702 if (__kmp_storage_map) { 3703 __kmp_print_thread_storage_map(root_thread, gtid); 3704 } 3705 root_thread->th.th_info.ds.ds_gtid = gtid; 3706 #if OMPT_SUPPORT 3707 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3708 #endif 3709 root_thread->th.th_root = root; 3710 if (__kmp_env_consistency_check) { 3711 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3712 } 3713 #if USE_FAST_MEMORY 3714 __kmp_initialize_fast_memory(root_thread); 3715 #endif /* USE_FAST_MEMORY */ 3716 3717 #if KMP_USE_BGET 3718 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3719 __kmp_initialize_bget(root_thread); 3720 #endif 3721 __kmp_init_random(root_thread); // Initialize random number generator 3722 } 3723 3724 /* setup the serial team held in reserve by the root thread */ 3725 if (!root_thread->th.th_serial_team) { 3726 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3727 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3728 root_thread->th.th_serial_team = __kmp_allocate_team( 3729 root, 1, 1, 3730 #if OMPT_SUPPORT 3731 ompt_data_none, // root parallel id 3732 #endif 3733 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3734 } 3735 KMP_ASSERT(root_thread->th.th_serial_team); 3736 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3737 root_thread->th.th_serial_team)); 3738 3739 /* drop root_thread into place */ 3740 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3741 3742 root->r.r_root_team->t.t_threads[0] = root_thread; 3743 root->r.r_hot_team->t.t_threads[0] = root_thread; 3744 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3745 // AC: the team created in reserve, not for execution (it is unused for now). 3746 root_thread->th.th_serial_team->t.t_serialized = 0; 3747 root->r.r_uber_thread = root_thread; 3748 3749 /* initialize the thread, get it ready to go */ 3750 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3751 TCW_4(__kmp_init_gtid, TRUE); 3752 3753 /* prepare the master thread for get_gtid() */ 3754 __kmp_gtid_set_specific(gtid); 3755 3756 #if USE_ITT_BUILD 3757 __kmp_itt_thread_name(gtid); 3758 #endif /* USE_ITT_BUILD */ 3759 3760 #ifdef KMP_TDATA_GTID 3761 __kmp_gtid = gtid; 3762 #endif 3763 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3764 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3765 3766 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3767 "plain=%u\n", 3768 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3769 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3770 KMP_INIT_BARRIER_STATE)); 3771 { // Initialize barrier data. 
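    // Reset the arrived count of every barrier type for the new root so it
    // matches KMP_INIT_BARRIER_STATE, the same state asserted for the hot
    // team's fork/join barrier just below.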
3772 int b; 3773 for (b = 0; b < bs_last_barrier; ++b) { 3774 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3775 #if USE_DEBUGGER 3776 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3777 #endif 3778 } 3779 } 3780 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3781 KMP_INIT_BARRIER_STATE); 3782 3783 #if KMP_AFFINITY_SUPPORTED 3784 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3785 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3786 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3787 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3788 if (TCR_4(__kmp_init_middle)) { 3789 __kmp_affinity_set_init_mask(gtid, TRUE); 3790 } 3791 #endif /* KMP_AFFINITY_SUPPORTED */ 3792 root_thread->th.th_def_allocator = __kmp_def_allocator; 3793 root_thread->th.th_prev_level = 0; 3794 root_thread->th.th_prev_num_threads = 1; 3795 3796 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3797 tmp->cg_root = root_thread; 3798 tmp->cg_thread_limit = __kmp_cg_max_nth; 3799 tmp->cg_nthreads = 1; 3800 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3801 " cg_nthreads init to 1\n", 3802 root_thread, tmp)); 3803 tmp->up = NULL; 3804 root_thread->th.th_cg_roots = tmp; 3805 3806 __kmp_root_counter++; 3807 3808 #if OMPT_SUPPORT 3809 if (!initial_thread && ompt_enabled.enabled) { 3810 3811 kmp_info_t *root_thread = ompt_get_thread(); 3812 3813 ompt_set_thread_state(root_thread, ompt_state_overhead); 3814 3815 if (ompt_enabled.ompt_callback_thread_begin) { 3816 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3817 ompt_thread_initial, __ompt_get_thread_data_internal()); 3818 } 3819 ompt_data_t *task_data; 3820 ompt_data_t *parallel_data; 3821 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3822 NULL); 3823 if (ompt_enabled.ompt_callback_implicit_task) { 3824 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3825 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3826 } 3827 3828 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3829 } 3830 #endif 3831 3832 KMP_MB(); 3833 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3834 3835 return gtid; 3836 } 3837 3838 #if KMP_NESTED_HOT_TEAMS 3839 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3840 const int max_level) { 3841 int i, n, nth; 3842 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3843 if (!hot_teams || !hot_teams[level].hot_team) { 3844 return 0; 3845 } 3846 KMP_DEBUG_ASSERT(level < max_level); 3847 kmp_team_t *team = hot_teams[level].hot_team; 3848 nth = hot_teams[level].hot_team_nth; 3849 n = nth - 1; // master is not freed 3850 if (level < max_level - 1) { 3851 for (i = 0; i < nth; ++i) { 3852 kmp_info_t *th = team->t.t_threads[i]; 3853 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3854 if (i > 0 && th->th.th_hot_teams) { 3855 __kmp_free(th->th.th_hot_teams); 3856 th->th.th_hot_teams = NULL; 3857 } 3858 } 3859 } 3860 __kmp_free_team(root, team, NULL); 3861 return n; 3862 } 3863 #endif 3864 3865 // Resets a root thread and clear its root and hot teams. 3866 // Returns the number of __kmp_threads entries directly and indirectly freed. 
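// When KMP_NESTED_HOT_TEAMS is enabled, the count also covers workers of
// nested hot teams released through __kmp_free_hot_teams(); each nested team
// contributes its thread count minus the master, which is not freed there.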
3867 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3868 kmp_team_t *root_team = root->r.r_root_team; 3869 kmp_team_t *hot_team = root->r.r_hot_team; 3870 int n = hot_team->t.t_nproc; 3871 int i; 3872 3873 KMP_DEBUG_ASSERT(!root->r.r_active); 3874 3875 root->r.r_root_team = NULL; 3876 root->r.r_hot_team = NULL; 3877 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3878 // before call to __kmp_free_team(). 3879 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3880 #if KMP_NESTED_HOT_TEAMS 3881 if (__kmp_hot_teams_max_level > 3882 0) { // need to free nested hot teams and their threads if any 3883 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3884 kmp_info_t *th = hot_team->t.t_threads[i]; 3885 if (__kmp_hot_teams_max_level > 1) { 3886 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3887 } 3888 if (th->th.th_hot_teams) { 3889 __kmp_free(th->th.th_hot_teams); 3890 th->th.th_hot_teams = NULL; 3891 } 3892 } 3893 } 3894 #endif 3895 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3896 3897 // Before we can reap the thread, we need to make certain that all other 3898 // threads in the teams that had this root as ancestor have stopped trying to 3899 // steal tasks. 3900 if (__kmp_tasking_mode != tskm_immediate_exec) { 3901 __kmp_wait_to_unref_task_teams(); 3902 } 3903 3904 #if KMP_OS_WINDOWS 3905 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3906 KA_TRACE( 3907 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3908 "\n", 3909 (LPVOID) & (root->r.r_uber_thread->th), 3910 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3911 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3912 #endif /* KMP_OS_WINDOWS */ 3913 3914 #if OMPT_SUPPORT 3915 ompt_data_t *task_data; 3916 ompt_data_t *parallel_data; 3917 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3918 NULL); 3919 if (ompt_enabled.ompt_callback_implicit_task) { 3920 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3921 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3922 } 3923 if (ompt_enabled.ompt_callback_thread_end) { 3924 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3925 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3926 } 3927 #endif 3928 3929 TCW_4(__kmp_nth, 3930 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3931 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3932 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3933 " to %d\n", 3934 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3935 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3936 if (i == 1) { 3937 // need to free contention group structure 3938 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3939 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3940 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3941 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3942 root->r.r_uber_thread->th.th_cg_roots = NULL; 3943 } 3944 __kmp_reap_thread(root->r.r_uber_thread, 1); 3945 3946 // We canot put root thread to __kmp_thread_pool, so we have to reap it 3947 // instead of freeing. 
3948 root->r.r_uber_thread = NULL; 3949 /* mark root as no longer in use */ 3950 root->r.r_begin = FALSE; 3951 3952 return n; 3953 } 3954 3955 void __kmp_unregister_root_current_thread(int gtid) { 3956 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3957 /* this lock should be ok, since unregister_root_current_thread is never 3958 called during an abort, only during a normal close. furthermore, if you 3959 have the forkjoin lock, you should never try to get the initz lock */ 3960 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3961 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3962 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3963 "exiting T#%d\n", 3964 gtid)); 3965 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3966 return; 3967 } 3968 kmp_root_t *root = __kmp_root[gtid]; 3969 3970 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3971 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3972 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3973 KMP_ASSERT(root->r.r_active == FALSE); 3974 3975 KMP_MB(); 3976 3977 kmp_info_t *thread = __kmp_threads[gtid]; 3978 kmp_team_t *team = thread->th.th_team; 3979 kmp_task_team_t *task_team = thread->th.th_task_team; 3980 3981 // we need to wait for the proxy tasks before finishing the thread 3982 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3983 #if OMPT_SUPPORT 3984 // the runtime is shutting down so we won't report any events 3985 thread->th.ompt_thread_info.state = ompt_state_undefined; 3986 #endif 3987 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3988 } 3989 3990 __kmp_reset_root(gtid, root); 3991 3992 KMP_MB(); 3993 KC_TRACE(10, 3994 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3995 3996 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3997 } 3998 3999 #if KMP_OS_WINDOWS 4000 /* __kmp_forkjoin_lock must be already held 4001 Unregisters a root thread that is not the current thread. Returns the number 4002 of __kmp_threads entries freed as a result. 
*/ 4003 static int __kmp_unregister_root_other_thread(int gtid) { 4004 kmp_root_t *root = __kmp_root[gtid]; 4005 int r; 4006 4007 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4008 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4009 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4010 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4011 KMP_ASSERT(root->r.r_active == FALSE); 4012 4013 r = __kmp_reset_root(gtid, root); 4014 KC_TRACE(10, 4015 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4016 return r; 4017 } 4018 #endif 4019 4020 #if KMP_DEBUG 4021 void __kmp_task_info() { 4022 4023 kmp_int32 gtid = __kmp_entry_gtid(); 4024 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4025 kmp_info_t *this_thr = __kmp_threads[gtid]; 4026 kmp_team_t *steam = this_thr->th.th_serial_team; 4027 kmp_team_t *team = this_thr->th.th_team; 4028 4029 __kmp_printf( 4030 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4031 "ptask=%p\n", 4032 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4033 team->t.t_implicit_task_taskdata[tid].td_parent); 4034 } 4035 #endif // KMP_DEBUG 4036 4037 /* TODO optimize with one big memclr, take out what isn't needed, split 4038 responsibility to workers as much as possible, and delay initialization of 4039 features as much as possible */ 4040 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4041 int tid, int gtid) { 4042 /* this_thr->th.th_info.ds.ds_gtid is setup in 4043 kmp_allocate_thread/create_worker. 4044 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4045 kmp_info_t *master = team->t.t_threads[0]; 4046 KMP_DEBUG_ASSERT(this_thr != NULL); 4047 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4048 KMP_DEBUG_ASSERT(team); 4049 KMP_DEBUG_ASSERT(team->t.t_threads); 4050 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4051 KMP_DEBUG_ASSERT(master); 4052 KMP_DEBUG_ASSERT(master->th.th_root); 4053 4054 KMP_MB(); 4055 4056 TCW_SYNC_PTR(this_thr->th.th_team, team); 4057 4058 this_thr->th.th_info.ds.ds_tid = tid; 4059 this_thr->th.th_set_nproc = 0; 4060 if (__kmp_tasking_mode != tskm_immediate_exec) 4061 // When tasking is possible, threads are not safe to reap until they are 4062 // done tasking; this will be set when tasking code is exited in wait 4063 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4064 else // no tasking --> always safe to reap 4065 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4066 this_thr->th.th_set_proc_bind = proc_bind_default; 4067 #if KMP_AFFINITY_SUPPORTED 4068 this_thr->th.th_new_place = this_thr->th.th_current_place; 4069 #endif 4070 this_thr->th.th_root = master->th.th_root; 4071 4072 /* setup the thread's cache of the team structure */ 4073 this_thr->th.th_team_nproc = team->t.t_nproc; 4074 this_thr->th.th_team_master = master; 4075 this_thr->th.th_team_serialized = team->t.t_serialized; 4076 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4077 4078 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4079 4080 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4081 tid, gtid, this_thr, this_thr->th.th_current_task)); 4082 4083 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4084 team, tid, TRUE); 4085 4086 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4087 tid, gtid, this_thr, this_thr->th.th_current_task)); 4088 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4089 // __kmp_initialize_team()? 
4090 4091 /* TODO no worksharing in speculative threads */ 4092 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4093 4094 this_thr->th.th_local.this_construct = 0; 4095 4096 if (!this_thr->th.th_pri_common) { 4097 this_thr->th.th_pri_common = 4098 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4099 if (__kmp_storage_map) { 4100 __kmp_print_storage_map_gtid( 4101 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4102 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4103 } 4104 this_thr->th.th_pri_head = NULL; 4105 } 4106 4107 if (this_thr != master && // Master's CG root is initialized elsewhere 4108 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4109 // Make new thread's CG root same as master's 4110 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4111 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4112 if (tmp) { 4113 // worker changes CG, need to check if old CG should be freed 4114 int i = tmp->cg_nthreads--; 4115 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4116 " on node %p of thread %p to %d\n", 4117 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4118 if (i == 1) { 4119 __kmp_free(tmp); // last thread left CG --> free it 4120 } 4121 } 4122 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4123 // Increment new thread's CG root's counter to add the new thread 4124 this_thr->th.th_cg_roots->cg_nthreads++; 4125 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4126 " node %p of thread %p to %d\n", 4127 this_thr, this_thr->th.th_cg_roots, 4128 this_thr->th.th_cg_roots->cg_root, 4129 this_thr->th.th_cg_roots->cg_nthreads)); 4130 this_thr->th.th_current_task->td_icvs.thread_limit = 4131 this_thr->th.th_cg_roots->cg_thread_limit; 4132 } 4133 4134 /* Initialize dynamic dispatch */ 4135 { 4136 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4137 // Use team max_nproc since this will never change for the team. 4138 size_t disp_size = 4139 sizeof(dispatch_private_info_t) * 4140 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4141 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4142 team->t.t_max_nproc)); 4143 KMP_ASSERT(dispatch); 4144 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4145 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4146 4147 dispatch->th_disp_index = 0; 4148 dispatch->th_doacross_buf_idx = 0; 4149 if (!dispatch->th_disp_buffer) { 4150 dispatch->th_disp_buffer = 4151 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4152 4153 if (__kmp_storage_map) { 4154 __kmp_print_storage_map_gtid( 4155 gtid, &dispatch->th_disp_buffer[0], 4156 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4157 ? 
1 4158 : __kmp_dispatch_num_buffers], 4159 disp_size, 4160 "th_%d.th_dispatch.th_disp_buffer " 4161 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4162 gtid, team->t.t_id, gtid); 4163 } 4164 } else { 4165 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4166 } 4167 4168 dispatch->th_dispatch_pr_current = 0; 4169 dispatch->th_dispatch_sh_current = 0; 4170 4171 dispatch->th_deo_fcn = 0; /* ORDERED */ 4172 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4173 } 4174 4175 this_thr->th.th_next_pool = NULL; 4176 4177 if (!this_thr->th.th_task_state_memo_stack) { 4178 size_t i; 4179 this_thr->th.th_task_state_memo_stack = 4180 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4181 this_thr->th.th_task_state_top = 0; 4182 this_thr->th.th_task_state_stack_sz = 4; 4183 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4184 ++i) // zero init the stack 4185 this_thr->th.th_task_state_memo_stack[i] = 0; 4186 } 4187 4188 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4189 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4190 4191 KMP_MB(); 4192 } 4193 4194 /* allocate a new thread for the requesting team. this is only called from 4195 within a forkjoin critical section. we will first try to get an available 4196 thread from the thread pool. if none is available, we will fork a new one 4197 assuming we are able to create a new one. this should be assured, as the 4198 caller should check on this first. */ 4199 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4200 int new_tid) { 4201 kmp_team_t *serial_team; 4202 kmp_info_t *new_thr; 4203 int new_gtid; 4204 4205 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4206 KMP_DEBUG_ASSERT(root && team); 4207 #if !KMP_NESTED_HOT_TEAMS 4208 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4209 #endif 4210 KMP_MB(); 4211 4212 /* first, try to get one from the thread pool */ 4213 if (__kmp_thread_pool) { 4214 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4215 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4216 if (new_thr == __kmp_thread_pool_insert_pt) { 4217 __kmp_thread_pool_insert_pt = NULL; 4218 } 4219 TCW_4(new_thr->th.th_in_pool, FALSE); 4220 __kmp_suspend_initialize_thread(new_thr); 4221 __kmp_lock_suspend_mx(new_thr); 4222 if (new_thr->th.th_active_in_pool == TRUE) { 4223 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4224 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4225 new_thr->th.th_active_in_pool = FALSE; 4226 } 4227 __kmp_unlock_suspend_mx(new_thr); 4228 4229 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4230 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4231 KMP_ASSERT(!new_thr->th.th_team); 4232 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4233 4234 /* setup the thread structure */ 4235 __kmp_initialize_info(new_thr, team, new_tid, 4236 new_thr->th.th_info.ds.ds_gtid); 4237 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4238 4239 TCW_4(__kmp_nth, __kmp_nth + 1); 4240 4241 new_thr->th.th_task_state = 0; 4242 new_thr->th.th_task_state_top = 0; 4243 new_thr->th.th_task_state_stack_sz = 4; 4244 4245 #ifdef KMP_ADJUST_BLOCKTIME 4246 /* Adjust blocktime back to zero if necessary */ 4247 /* Middle initialization might not have occurred yet */ 4248 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4249 if (__kmp_nth > __kmp_avail_proc) { 4250 __kmp_zero_bt = TRUE; 4251 } 4252 } 4253 #endif /* KMP_ADJUST_BLOCKTIME */ 4254 4255 #if KMP_DEBUG 4256 // If thread entered pool via __kmp_free_thread, wait_flag should != 4257 // KMP_BARRIER_PARENT_FLAG. 
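    // That is, a recycled thread must not still be parked on a parent barrier
    // flag from the team it left; the loop below checks every barrier type.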
4258 int b; 4259 kmp_balign_t *balign = new_thr->th.th_bar; 4260 for (b = 0; b < bs_last_barrier; ++b) 4261 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4262 #endif 4263 4264 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4265 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4266 4267 KMP_MB(); 4268 return new_thr; 4269 } 4270 4271 /* no, well fork a new one */ 4272 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4273 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4274 4275 #if KMP_USE_MONITOR 4276 // If this is the first worker thread the RTL is creating, then also 4277 // launch the monitor thread. We try to do this as early as possible. 4278 if (!TCR_4(__kmp_init_monitor)) { 4279 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4280 if (!TCR_4(__kmp_init_monitor)) { 4281 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4282 TCW_4(__kmp_init_monitor, 1); 4283 __kmp_create_monitor(&__kmp_monitor); 4284 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4285 #if KMP_OS_WINDOWS 4286 // AC: wait until monitor has started. This is a fix for CQ232808. 4287 // The reason is that if the library is loaded/unloaded in a loop with 4288 // small (parallel) work in between, then there is high probability that 4289 // monitor thread started after the library shutdown. At shutdown it is 4290 // too late to cope with the problem, because when the master is in 4291 // DllMain (process detach) the monitor has no chances to start (it is 4292 // blocked), and master has no means to inform the monitor that the 4293 // library has gone, because all the memory which the monitor can access 4294 // is going to be released/reset. 4295 while (TCR_4(__kmp_init_monitor) < 2) { 4296 KMP_YIELD(TRUE); 4297 } 4298 KF_TRACE(10, ("after monitor thread has started\n")); 4299 #endif 4300 } 4301 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4302 } 4303 #endif 4304 4305 KMP_MB(); 4306 4307 { 4308 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4309 ? 1 4310 : __kmp_hidden_helper_threads_num + 1; 4311 4312 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4313 ++new_gtid) { 4314 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4315 } 4316 4317 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4318 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4319 } 4320 } 4321 4322 /* allocate space for it. 
*/ 4323 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4324 4325 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4326 4327 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4328 // suppress race conditions detection on synchronization flags in debug mode 4329 // this helps to analyze library internals eliminating false positives 4330 __itt_suppress_mark_range( 4331 __itt_suppress_range, __itt_suppress_threading_errors, 4332 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4333 __itt_suppress_mark_range( 4334 __itt_suppress_range, __itt_suppress_threading_errors, 4335 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4336 #if KMP_OS_WINDOWS 4337 __itt_suppress_mark_range( 4338 __itt_suppress_range, __itt_suppress_threading_errors, 4339 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4340 #else 4341 __itt_suppress_mark_range(__itt_suppress_range, 4342 __itt_suppress_threading_errors, 4343 &new_thr->th.th_suspend_init_count, 4344 sizeof(new_thr->th.th_suspend_init_count)); 4345 #endif 4346 // TODO: check if we need to also suppress b_arrived flags 4347 __itt_suppress_mark_range(__itt_suppress_range, 4348 __itt_suppress_threading_errors, 4349 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4350 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4351 __itt_suppress_mark_range(__itt_suppress_range, 4352 __itt_suppress_threading_errors, 4353 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4354 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4355 __itt_suppress_mark_range(__itt_suppress_range, 4356 __itt_suppress_threading_errors, 4357 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4358 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4359 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4360 if (__kmp_storage_map) { 4361 __kmp_print_thread_storage_map(new_thr, new_gtid); 4362 } 4363 4364 // add the reserve serialized team, initialized from the team's master thread 4365 { 4366 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4367 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4368 new_thr->th.th_serial_team = serial_team = 4369 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4370 #if OMPT_SUPPORT 4371 ompt_data_none, // root parallel id 4372 #endif 4373 proc_bind_default, &r_icvs, 4374 0 USE_NESTED_HOT_ARG(NULL)); 4375 } 4376 KMP_ASSERT(serial_team); 4377 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4378 // execution (it is unused for now). 
4379 serial_team->t.t_threads[0] = new_thr; 4380 KF_TRACE(10, 4381 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4382 new_thr)); 4383 4384 /* setup the thread structures */ 4385 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4386 4387 #if USE_FAST_MEMORY 4388 __kmp_initialize_fast_memory(new_thr); 4389 #endif /* USE_FAST_MEMORY */ 4390 4391 #if KMP_USE_BGET 4392 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4393 __kmp_initialize_bget(new_thr); 4394 #endif 4395 4396 __kmp_init_random(new_thr); // Initialize random number generator 4397 4398 /* Initialize these only once when thread is grabbed for a team allocation */ 4399 KA_TRACE(20, 4400 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4401 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4402 4403 int b; 4404 kmp_balign_t *balign = new_thr->th.th_bar; 4405 for (b = 0; b < bs_last_barrier; ++b) { 4406 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4407 balign[b].bb.team = NULL; 4408 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4409 balign[b].bb.use_oncore_barrier = 0; 4410 } 4411 4412 new_thr->th.th_spin_here = FALSE; 4413 new_thr->th.th_next_waiting = 0; 4414 #if KMP_OS_UNIX 4415 new_thr->th.th_blocking = false; 4416 #endif 4417 4418 #if KMP_AFFINITY_SUPPORTED 4419 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4420 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4421 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4422 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4423 #endif 4424 new_thr->th.th_def_allocator = __kmp_def_allocator; 4425 new_thr->th.th_prev_level = 0; 4426 new_thr->th.th_prev_num_threads = 1; 4427 4428 TCW_4(new_thr->th.th_in_pool, FALSE); 4429 new_thr->th.th_active_in_pool = FALSE; 4430 TCW_4(new_thr->th.th_active, TRUE); 4431 4432 /* adjust the global counters */ 4433 __kmp_all_nth++; 4434 __kmp_nth++; 4435 4436 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4437 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4438 if (__kmp_adjust_gtid_mode) { 4439 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4440 if (TCR_4(__kmp_gtid_mode) != 2) { 4441 TCW_4(__kmp_gtid_mode, 2); 4442 } 4443 } else { 4444 if (TCR_4(__kmp_gtid_mode) != 1) { 4445 TCW_4(__kmp_gtid_mode, 1); 4446 } 4447 } 4448 } 4449 4450 #ifdef KMP_ADJUST_BLOCKTIME 4451 /* Adjust blocktime back to zero if necessary */ 4452 /* Middle initialization might not have occurred yet */ 4453 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4454 if (__kmp_nth > __kmp_avail_proc) { 4455 __kmp_zero_bt = TRUE; 4456 } 4457 } 4458 #endif /* KMP_ADJUST_BLOCKTIME */ 4459 4460 /* actually fork it and create the new worker thread */ 4461 KF_TRACE( 4462 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4463 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4464 KF_TRACE(10, 4465 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4466 4467 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4468 new_gtid)); 4469 KMP_MB(); 4470 return new_thr; 4471 } 4472 4473 /* Reinitialize team for reuse. 4474 The hot team code calls this case at every fork barrier, so EPCC barrier 4475 test are extremely sensitive to changes in it, esp. writes to the team 4476 struct, which cause a cache invalidation in all threads. 4477 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ 4478 static void __kmp_reinitialize_team(kmp_team_t *team, 4479 kmp_internal_control_t *new_icvs, 4480 ident_t *loc) { 4481 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4482 team->t.t_threads[0], team)); 4483 KMP_DEBUG_ASSERT(team && new_icvs); 4484 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4485 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4486 4487 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4488 // Copy ICVs to the master thread's implicit taskdata 4489 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4490 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4491 4492 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4493 team->t.t_threads[0], team)); 4494 } 4495 4496 /* Initialize the team data structure. 4497 This assumes the t_threads and t_max_nproc are already set. 4498 Also, we don't touch the arguments */ 4499 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4500 kmp_internal_control_t *new_icvs, 4501 ident_t *loc) { 4502 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4503 4504 /* verify */ 4505 KMP_DEBUG_ASSERT(team); 4506 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4507 KMP_DEBUG_ASSERT(team->t.t_threads); 4508 KMP_MB(); 4509 4510 team->t.t_master_tid = 0; /* not needed */ 4511 /* team->t.t_master_bar; not needed */ 4512 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4513 team->t.t_nproc = new_nproc; 4514 4515 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4516 team->t.t_next_pool = NULL; 4517 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4518 * up hot team */ 4519 4520 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4521 team->t.t_invoke = NULL; /* not needed */ 4522 4523 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4524 team->t.t_sched.sched = new_icvs->sched.sched; 4525 4526 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4527 team->t.t_fp_control_saved = FALSE; /* not needed */ 4528 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4529 team->t.t_mxcsr = 0; /* not needed */ 4530 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4531 4532 team->t.t_construct = 0; 4533 4534 team->t.t_ordered.dt.t_value = 0; 4535 team->t.t_master_active = FALSE; 4536 4537 #ifdef KMP_DEBUG 4538 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4539 #endif 4540 #if KMP_OS_WINDOWS 4541 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4542 #endif 4543 4544 team->t.t_control_stack_top = NULL; 4545 4546 __kmp_reinitialize_team(team, new_icvs, loc); 4547 4548 KMP_MB(); 4549 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4550 } 4551 4552 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4553 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4554 static void 4555 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4556 if (KMP_AFFINITY_CAPABLE()) { 4557 int status; 4558 if (old_mask != NULL) { 4559 status = __kmp_get_system_affinity(old_mask, TRUE); 4560 int error = errno; 4561 if (status != 0) { 4562 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4563 __kmp_msg_null); 4564 } 4565 } 4566 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4567 } 4568 } 4569 #endif 4570 4571 #if KMP_AFFINITY_SUPPORTED 4572 4573 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4574 // It calculates the worker + master thread's partition based upon the parent 4575 // thread's partition, and binds each worker to a thread in their partition. 4576 // The master thread's partition should already include its current binding. 4577 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4578 // Copy the master thread's place partition to the team struct 4579 kmp_info_t *master_th = team->t.t_threads[0]; 4580 KMP_DEBUG_ASSERT(master_th != NULL); 4581 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4582 int first_place = master_th->th.th_first_place; 4583 int last_place = master_th->th.th_last_place; 4584 int masters_place = master_th->th.th_current_place; 4585 team->t.t_first_place = first_place; 4586 team->t.t_last_place = last_place; 4587 4588 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4589 "bound to place %d partition = [%d,%d]\n", 4590 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4591 team->t.t_id, masters_place, first_place, last_place)); 4592 4593 switch (proc_bind) { 4594 4595 case proc_bind_default: 4596 // serial teams might have the proc_bind policy set to proc_bind_default. It 4597 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4598 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4599 break; 4600 4601 case proc_bind_master: { 4602 int f; 4603 int n_th = team->t.t_nproc; 4604 for (f = 1; f < n_th; f++) { 4605 kmp_info_t *th = team->t.t_threads[f]; 4606 KMP_DEBUG_ASSERT(th != NULL); 4607 th->th.th_first_place = first_place; 4608 th->th.th_last_place = last_place; 4609 th->th.th_new_place = masters_place; 4610 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4611 team->t.t_display_affinity != 1) { 4612 team->t.t_display_affinity = 1; 4613 } 4614 4615 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4616 "partition = [%d,%d]\n", 4617 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4618 f, masters_place, first_place, last_place)); 4619 } 4620 } break; 4621 4622 case proc_bind_close: { 4623 int f; 4624 int n_th = team->t.t_nproc; 4625 int n_places; 4626 if (first_place <= last_place) { 4627 n_places = last_place - first_place + 1; 4628 } else { 4629 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4630 } 4631 if (n_th <= n_places) { 4632 int place = masters_place; 4633 for (f = 1; f < n_th; f++) { 4634 kmp_info_t *th = team->t.t_threads[f]; 4635 KMP_DEBUG_ASSERT(th != NULL); 4636 4637 if (place == last_place) { 4638 place = first_place; 4639 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4640 place = 0; 4641 } else { 4642 place++; 4643 } 4644 th->th.th_first_place = first_place; 4645 th->th.th_last_place = last_place; 4646 th->th.th_new_place = place; 4647 if (__kmp_display_affinity && place != th->th.th_current_place && 4648 team->t.t_display_affinity != 1) { 4649 team->t.t_display_affinity = 1; 4650 } 4651 4652 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4653 "partition = [%d,%d]\n", 4654 __kmp_gtid_from_thread(team->t.t_threads[f]), 4655 team->t.t_id, f, place, first_place, last_place)); 4656 } 4657 } else { 4658 int S, rem, gap, s_count; 4659 S = n_th / n_places; 4660 s_count = 0; 4661 rem = n_th - (S * n_places); 4662 gap = rem > 0 ? 
n_places / rem : n_places; 4663 int place = masters_place; 4664 int gap_ct = gap; 4665 for (f = 0; f < n_th; f++) { 4666 kmp_info_t *th = team->t.t_threads[f]; 4667 KMP_DEBUG_ASSERT(th != NULL); 4668 4669 th->th.th_first_place = first_place; 4670 th->th.th_last_place = last_place; 4671 th->th.th_new_place = place; 4672 if (__kmp_display_affinity && place != th->th.th_current_place && 4673 team->t.t_display_affinity != 1) { 4674 team->t.t_display_affinity = 1; 4675 } 4676 s_count++; 4677 4678 if ((s_count == S) && rem && (gap_ct == gap)) { 4679 // do nothing, add an extra thread to place on next iteration 4680 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4681 // we added an extra thread to this place; move to next place 4682 if (place == last_place) { 4683 place = first_place; 4684 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4685 place = 0; 4686 } else { 4687 place++; 4688 } 4689 s_count = 0; 4690 gap_ct = 1; 4691 rem--; 4692 } else if (s_count == S) { // place full; don't add extra 4693 if (place == last_place) { 4694 place = first_place; 4695 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4696 place = 0; 4697 } else { 4698 place++; 4699 } 4700 gap_ct++; 4701 s_count = 0; 4702 } 4703 4704 KA_TRACE(100, 4705 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4706 "partition = [%d,%d]\n", 4707 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4708 th->th.th_new_place, first_place, last_place)); 4709 } 4710 KMP_DEBUG_ASSERT(place == masters_place); 4711 } 4712 } break; 4713 4714 case proc_bind_spread: { 4715 int f; 4716 int n_th = team->t.t_nproc; 4717 int n_places; 4718 int thidx; 4719 if (first_place <= last_place) { 4720 n_places = last_place - first_place + 1; 4721 } else { 4722 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4723 } 4724 if (n_th <= n_places) { 4725 int place = -1; 4726 4727 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4728 int S = n_places / n_th; 4729 int s_count, rem, gap, gap_ct; 4730 4731 place = masters_place; 4732 rem = n_places - n_th * S; 4733 gap = rem ? 
n_th / rem : 1; 4734 gap_ct = gap; 4735 thidx = n_th; 4736 if (update_master_only == 1) 4737 thidx = 1; 4738 for (f = 0; f < thidx; f++) { 4739 kmp_info_t *th = team->t.t_threads[f]; 4740 KMP_DEBUG_ASSERT(th != NULL); 4741 4742 th->th.th_first_place = place; 4743 th->th.th_new_place = place; 4744 if (__kmp_display_affinity && place != th->th.th_current_place && 4745 team->t.t_display_affinity != 1) { 4746 team->t.t_display_affinity = 1; 4747 } 4748 s_count = 1; 4749 while (s_count < S) { 4750 if (place == last_place) { 4751 place = first_place; 4752 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4753 place = 0; 4754 } else { 4755 place++; 4756 } 4757 s_count++; 4758 } 4759 if (rem && (gap_ct == gap)) { 4760 if (place == last_place) { 4761 place = first_place; 4762 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4763 place = 0; 4764 } else { 4765 place++; 4766 } 4767 rem--; 4768 gap_ct = 0; 4769 } 4770 th->th.th_last_place = place; 4771 gap_ct++; 4772 4773 if (place == last_place) { 4774 place = first_place; 4775 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4776 place = 0; 4777 } else { 4778 place++; 4779 } 4780 4781 KA_TRACE(100, 4782 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4783 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4784 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4785 f, th->th.th_new_place, th->th.th_first_place, 4786 th->th.th_last_place, __kmp_affinity_num_masks)); 4787 } 4788 } else { 4789 /* Having uniform space of available computation places I can create 4790 T partitions of round(P/T) size and put threads into the first 4791 place of each partition. */ 4792 double current = static_cast<double>(masters_place); 4793 double spacing = 4794 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4795 int first, last; 4796 kmp_info_t *th; 4797 4798 thidx = n_th + 1; 4799 if (update_master_only == 1) 4800 thidx = 1; 4801 for (f = 0; f < thidx; f++) { 4802 first = static_cast<int>(current); 4803 last = static_cast<int>(current + spacing) - 1; 4804 KMP_DEBUG_ASSERT(last >= first); 4805 if (first >= n_places) { 4806 if (masters_place) { 4807 first -= n_places; 4808 last -= n_places; 4809 if (first == (masters_place + 1)) { 4810 KMP_DEBUG_ASSERT(f == n_th); 4811 first--; 4812 } 4813 if (last == masters_place) { 4814 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4815 last--; 4816 } 4817 } else { 4818 KMP_DEBUG_ASSERT(f == n_th); 4819 first = 0; 4820 last = 0; 4821 } 4822 } 4823 if (last >= n_places) { 4824 last = (n_places - 1); 4825 } 4826 place = first; 4827 current += spacing; 4828 if (f < n_th) { 4829 KMP_DEBUG_ASSERT(0 <= first); 4830 KMP_DEBUG_ASSERT(n_places > first); 4831 KMP_DEBUG_ASSERT(0 <= last); 4832 KMP_DEBUG_ASSERT(n_places > last); 4833 KMP_DEBUG_ASSERT(last_place >= first_place); 4834 th = team->t.t_threads[f]; 4835 KMP_DEBUG_ASSERT(th); 4836 th->th.th_first_place = first; 4837 th->th.th_new_place = place; 4838 th->th.th_last_place = last; 4839 if (__kmp_display_affinity && place != th->th.th_current_place && 4840 team->t.t_display_affinity != 1) { 4841 team->t.t_display_affinity = 1; 4842 } 4843 KA_TRACE(100, 4844 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4845 "partition = [%d,%d], spacing = %.4f\n", 4846 __kmp_gtid_from_thread(team->t.t_threads[f]), 4847 team->t.t_id, f, th->th.th_new_place, 4848 th->th.th_first_place, th->th.th_last_place, spacing)); 4849 } 4850 } 4851 } 4852 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4853 } else { 4854 int S, rem, gap, 
s_count; 4855 S = n_th / n_places; 4856 s_count = 0; 4857 rem = n_th - (S * n_places); 4858 gap = rem > 0 ? n_places / rem : n_places; 4859 int place = masters_place; 4860 int gap_ct = gap; 4861 thidx = n_th; 4862 if (update_master_only == 1) 4863 thidx = 1; 4864 for (f = 0; f < thidx; f++) { 4865 kmp_info_t *th = team->t.t_threads[f]; 4866 KMP_DEBUG_ASSERT(th != NULL); 4867 4868 th->th.th_first_place = place; 4869 th->th.th_last_place = place; 4870 th->th.th_new_place = place; 4871 if (__kmp_display_affinity && place != th->th.th_current_place && 4872 team->t.t_display_affinity != 1) { 4873 team->t.t_display_affinity = 1; 4874 } 4875 s_count++; 4876 4877 if ((s_count == S) && rem && (gap_ct == gap)) { 4878 // do nothing, add an extra thread to place on next iteration 4879 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4880 // we added an extra thread to this place; move on to next place 4881 if (place == last_place) { 4882 place = first_place; 4883 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4884 place = 0; 4885 } else { 4886 place++; 4887 } 4888 s_count = 0; 4889 gap_ct = 1; 4890 rem--; 4891 } else if (s_count == S) { // place is full; don't add extra thread 4892 if (place == last_place) { 4893 place = first_place; 4894 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4895 place = 0; 4896 } else { 4897 place++; 4898 } 4899 gap_ct++; 4900 s_count = 0; 4901 } 4902 4903 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4904 "partition = [%d,%d]\n", 4905 __kmp_gtid_from_thread(team->t.t_threads[f]), 4906 team->t.t_id, f, th->th.th_new_place, 4907 th->th.th_first_place, th->th.th_last_place)); 4908 } 4909 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4910 } 4911 } break; 4912 4913 default: 4914 break; 4915 } 4916 4917 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4918 } 4919 4920 #endif // KMP_AFFINITY_SUPPORTED 4921 4922 /* allocate a new team data structure to use. take one off of the free pool if 4923 available */ 4924 kmp_team_t * 4925 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4926 #if OMPT_SUPPORT 4927 ompt_data_t ompt_parallel_data, 4928 #endif 4929 kmp_proc_bind_t new_proc_bind, 4930 kmp_internal_control_t *new_icvs, 4931 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4932 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4933 int f; 4934 kmp_team_t *team; 4935 int use_hot_team = !root->r.r_active; 4936 int level = 0; 4937 4938 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4939 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4940 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4941 KMP_MB(); 4942 4943 #if KMP_NESTED_HOT_TEAMS 4944 kmp_hot_team_ptr_t *hot_teams; 4945 if (master) { 4946 team = master->th.th_team; 4947 level = team->t.t_active_level; 4948 if (master->th.th_teams_microtask) { // in teams construct? 
4949 if (master->th.th_teams_size.nteams > 1 && 4950 ( // #teams > 1 4951 team->t.t_pkfn == 4952 (microtask_t)__kmp_teams_master || // inner fork of the teams 4953 master->th.th_teams_level < 4954 team->t.t_level)) { // or nested parallel inside the teams 4955 ++level; // not increment if #teams==1, or for outer fork of the teams; 4956 // increment otherwise 4957 } 4958 } 4959 hot_teams = master->th.th_hot_teams; 4960 if (level < __kmp_hot_teams_max_level && hot_teams && 4961 hot_teams[level].hot_team) { 4962 // hot team has already been allocated for given level 4963 use_hot_team = 1; 4964 } else { 4965 use_hot_team = 0; 4966 } 4967 } else { 4968 // check we won't access uninitialized hot_teams, just in case 4969 KMP_DEBUG_ASSERT(new_nproc == 1); 4970 } 4971 #endif 4972 // Optimization to use a "hot" team 4973 if (use_hot_team && new_nproc > 1) { 4974 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4975 #if KMP_NESTED_HOT_TEAMS 4976 team = hot_teams[level].hot_team; 4977 #else 4978 team = root->r.r_hot_team; 4979 #endif 4980 #if KMP_DEBUG 4981 if (__kmp_tasking_mode != tskm_immediate_exec) { 4982 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4983 "task_team[1] = %p before reinit\n", 4984 team->t.t_task_team[0], team->t.t_task_team[1])); 4985 } 4986 #endif 4987 4988 // Has the number of threads changed? 4989 /* Let's assume the most common case is that the number of threads is 4990 unchanged, and put that case first. */ 4991 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4992 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4993 // This case can mean that omp_set_num_threads() was called and the hot 4994 // team size was already reduced, so we check the special flag 4995 if (team->t.t_size_changed == -1) { 4996 team->t.t_size_changed = 1; 4997 } else { 4998 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4999 } 5000 5001 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5002 kmp_r_sched_t new_sched = new_icvs->sched; 5003 // set master's schedule as new run-time schedule 5004 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5005 5006 __kmp_reinitialize_team(team, new_icvs, 5007 root->r.r_uber_thread->th.th_ident); 5008 5009 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5010 team->t.t_threads[0], team)); 5011 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5012 5013 #if KMP_AFFINITY_SUPPORTED 5014 if ((team->t.t_size_changed == 0) && 5015 (team->t.t_proc_bind == new_proc_bind)) { 5016 if (new_proc_bind == proc_bind_spread) { 5017 __kmp_partition_places( 5018 team, 1); // add flag to update only master for spread 5019 } 5020 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5021 "proc_bind = %d, partition = [%d,%d]\n", 5022 team->t.t_id, new_proc_bind, team->t.t_first_place, 5023 team->t.t_last_place)); 5024 } else { 5025 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5026 __kmp_partition_places(team); 5027 } 5028 #else 5029 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5030 #endif /* KMP_AFFINITY_SUPPORTED */ 5031 } else if (team->t.t_nproc > new_nproc) { 5032 KA_TRACE(20, 5033 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5034 new_nproc)); 5035 5036 team->t.t_size_changed = 1; 5037 #if KMP_NESTED_HOT_TEAMS 5038 if (__kmp_hot_teams_mode == 0) { 5039 // AC: saved number of threads should correspond to team's value in this 5040 // mode, can be bigger in mode 1, when hot team has threads in reserve 5041 
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5042 hot_teams[level].hot_team_nth = new_nproc; 5043 #endif // KMP_NESTED_HOT_TEAMS 5044 /* release the extra threads we don't need any more */ 5045 for (f = new_nproc; f < team->t.t_nproc; f++) { 5046 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5047 if (__kmp_tasking_mode != tskm_immediate_exec) { 5048 // When decreasing team size, threads no longer in the team should 5049 // unref task team. 5050 team->t.t_threads[f]->th.th_task_team = NULL; 5051 } 5052 __kmp_free_thread(team->t.t_threads[f]); 5053 team->t.t_threads[f] = NULL; 5054 } 5055 #if KMP_NESTED_HOT_TEAMS 5056 } // (__kmp_hot_teams_mode == 0) 5057 else { 5058 // When keeping extra threads in team, switch threads to wait on own 5059 // b_go flag 5060 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5061 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5062 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5063 for (int b = 0; b < bs_last_barrier; ++b) { 5064 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5065 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5066 } 5067 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5068 } 5069 } 5070 } 5071 #endif // KMP_NESTED_HOT_TEAMS 5072 team->t.t_nproc = new_nproc; 5073 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5074 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5075 __kmp_reinitialize_team(team, new_icvs, 5076 root->r.r_uber_thread->th.th_ident); 5077 5078 // Update remaining threads 5079 for (f = 0; f < new_nproc; ++f) { 5080 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5081 } 5082 5083 // restore the current task state of the master thread: should be the 5084 // implicit task 5085 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5086 team->t.t_threads[0], team)); 5087 5088 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5089 5090 #ifdef KMP_DEBUG 5091 for (f = 0; f < team->t.t_nproc; f++) { 5092 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5093 team->t.t_threads[f]->th.th_team_nproc == 5094 team->t.t_nproc); 5095 } 5096 #endif 5097 5098 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5099 #if KMP_AFFINITY_SUPPORTED 5100 __kmp_partition_places(team); 5101 #endif 5102 } else { // team->t.t_nproc < new_nproc 5103 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5104 kmp_affin_mask_t *old_mask; 5105 if (KMP_AFFINITY_CAPABLE()) { 5106 KMP_CPU_ALLOC(old_mask); 5107 } 5108 #endif 5109 5110 KA_TRACE(20, 5111 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5112 new_nproc)); 5113 5114 team->t.t_size_changed = 1; 5115 5116 #if KMP_NESTED_HOT_TEAMS 5117 int avail_threads = hot_teams[level].hot_team_nth; 5118 if (new_nproc < avail_threads) 5119 avail_threads = new_nproc; 5120 kmp_info_t **other_threads = team->t.t_threads; 5121 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5122 // Adjust barrier data of reserved threads (if any) of the team 5123 // Other data will be set in __kmp_initialize_info() below. 
5124 int b; 5125 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5126 for (b = 0; b < bs_last_barrier; ++b) { 5127 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5128 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5129 #if USE_DEBUGGER 5130 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5131 #endif 5132 } 5133 } 5134 if (hot_teams[level].hot_team_nth >= new_nproc) { 5135 // we have all needed threads in reserve, no need to allocate any 5136 // this only possible in mode 1, cannot have reserved threads in mode 0 5137 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5138 team->t.t_nproc = new_nproc; // just get reserved threads involved 5139 } else { 5140 // we may have some threads in reserve, but not enough 5141 team->t.t_nproc = 5142 hot_teams[level] 5143 .hot_team_nth; // get reserved threads involved if any 5144 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5145 #endif // KMP_NESTED_HOT_TEAMS 5146 if (team->t.t_max_nproc < new_nproc) { 5147 /* reallocate larger arrays */ 5148 __kmp_reallocate_team_arrays(team, new_nproc); 5149 __kmp_reinitialize_team(team, new_icvs, NULL); 5150 } 5151 5152 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5153 /* Temporarily set full mask for master thread before creation of 5154 workers. The reason is that workers inherit the affinity from master, 5155 so if a lot of workers are created on the single core quickly, they 5156 don't get a chance to set their own affinity for a long time. */ 5157 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5158 #endif 5159 5160 /* allocate new threads for the hot team */ 5161 for (f = team->t.t_nproc; f < new_nproc; f++) { 5162 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5163 KMP_DEBUG_ASSERT(new_worker); 5164 team->t.t_threads[f] = new_worker; 5165 5166 KA_TRACE(20, 5167 ("__kmp_allocate_team: team %d init T#%d arrived: " 5168 "join=%llu, plain=%llu\n", 5169 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5170 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5171 team->t.t_bar[bs_plain_barrier].b_arrived)); 5172 5173 { // Initialize barrier data for new threads. 5174 int b; 5175 kmp_balign_t *balign = new_worker->th.th_bar; 5176 for (b = 0; b < bs_last_barrier; ++b) { 5177 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5178 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5179 KMP_BARRIER_PARENT_FLAG); 5180 #if USE_DEBUGGER 5181 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5182 #endif 5183 } 5184 } 5185 } 5186 5187 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5188 if (KMP_AFFINITY_CAPABLE()) { 5189 /* Restore initial master thread's affinity mask */ 5190 __kmp_set_system_affinity(old_mask, TRUE); 5191 KMP_CPU_FREE(old_mask); 5192 } 5193 #endif 5194 #if KMP_NESTED_HOT_TEAMS 5195 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5196 #endif // KMP_NESTED_HOT_TEAMS 5197 /* make sure everyone is syncronized */ 5198 int old_nproc = team->t.t_nproc; // save old value and use to update only 5199 // new threads below 5200 __kmp_initialize_team(team, new_nproc, new_icvs, 5201 root->r.r_uber_thread->th.th_ident); 5202 5203 /* reinitialize the threads */ 5204 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5205 for (f = 0; f < team->t.t_nproc; ++f) 5206 __kmp_initialize_info(team->t.t_threads[f], team, f, 5207 __kmp_gtid_from_tid(f, team)); 5208 5209 if (level) { // set th_task_state for new threads in nested hot team 5210 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5211 // only need to set the th_task_state for the new threads. th_task_state 5212 // for master thread will not be accurate until after this in 5213 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5214 // correct value. 5215 for (f = old_nproc; f < team->t.t_nproc; ++f) 5216 team->t.t_threads[f]->th.th_task_state = 5217 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5218 } else { // set th_task_state for new threads in non-nested hot team 5219 kmp_uint8 old_state = 5220 team->t.t_threads[0]->th.th_task_state; // copy master's state 5221 for (f = old_nproc; f < team->t.t_nproc; ++f) 5222 team->t.t_threads[f]->th.th_task_state = old_state; 5223 } 5224 5225 #ifdef KMP_DEBUG 5226 for (f = 0; f < team->t.t_nproc; ++f) { 5227 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5228 team->t.t_threads[f]->th.th_team_nproc == 5229 team->t.t_nproc); 5230 } 5231 #endif 5232 5233 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5234 #if KMP_AFFINITY_SUPPORTED 5235 __kmp_partition_places(team); 5236 #endif 5237 } // Check changes in number of threads 5238 5239 kmp_info_t *master = team->t.t_threads[0]; 5240 if (master->th.th_teams_microtask) { 5241 for (f = 1; f < new_nproc; ++f) { 5242 // propagate teams construct specific info to workers 5243 kmp_info_t *thr = team->t.t_threads[f]; 5244 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5245 thr->th.th_teams_level = master->th.th_teams_level; 5246 thr->th.th_teams_size = master->th.th_teams_size; 5247 } 5248 } 5249 #if KMP_NESTED_HOT_TEAMS 5250 if (level) { 5251 // Sync barrier state for nested hot teams, not needed for outermost hot 5252 // team. 5253 for (f = 1; f < new_nproc; ++f) { 5254 kmp_info_t *thr = team->t.t_threads[f]; 5255 int b; 5256 kmp_balign_t *balign = thr->th.th_bar; 5257 for (b = 0; b < bs_last_barrier; ++b) { 5258 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5259 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5260 #if USE_DEBUGGER 5261 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5262 #endif 5263 } 5264 } 5265 } 5266 #endif // KMP_NESTED_HOT_TEAMS 5267 5268 /* reallocate space for arguments if necessary */ 5269 __kmp_alloc_argv_entries(argc, team, TRUE); 5270 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5271 // The hot team re-uses the previous task team, 5272 // if untouched during the previous release->gather phase. 
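    // The KMP_CHECK_UPDATE() calls above follow a write-avoidance idiom: a
    // field is stored to only when its value actually changes, so re-forking
    // a hot team with unchanged settings does not needlessly dirty cache
    // lines shared with the workers.  A minimal sketch of the idiom (the
    // real macro lives in kmp.h and may differ in detail):
#if 0
#define KMP_CHECK_UPDATE_SKETCH(dst, src)                                      \
  do {                                                                         \
    if ((dst) != (src))                                                        \
      (dst) = (src); /* write only when the value really changed */            \
  } while (0)
// e.g. KMP_CHECK_UPDATE_SKETCH(team->t.t_argc, argc);
#endif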
5273 5274 KF_TRACE(10, (" hot_team = %p\n", team)); 5275 5276 #if KMP_DEBUG 5277 if (__kmp_tasking_mode != tskm_immediate_exec) { 5278 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5279 "task_team[1] = %p after reinit\n", 5280 team->t.t_task_team[0], team->t.t_task_team[1])); 5281 } 5282 #endif 5283 5284 #if OMPT_SUPPORT 5285 __ompt_team_assign_id(team, ompt_parallel_data); 5286 #endif 5287 5288 KMP_MB(); 5289 5290 return team; 5291 } 5292 5293 /* next, let's try to take one from the team pool */ 5294 KMP_MB(); 5295 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5296 /* TODO: consider resizing undersized teams instead of reaping them, now 5297 that we have a resizing mechanism */ 5298 if (team->t.t_max_nproc >= max_nproc) { 5299 /* take this team from the team pool */ 5300 __kmp_team_pool = team->t.t_next_pool; 5301 5302 /* setup the team for fresh use */ 5303 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5304 5305 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5306 "task_team[1] %p to NULL\n", 5307 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5308 team->t.t_task_team[0] = NULL; 5309 team->t.t_task_team[1] = NULL; 5310 5311 /* reallocate space for arguments if necessary */ 5312 __kmp_alloc_argv_entries(argc, team, TRUE); 5313 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5314 5315 KA_TRACE( 5316 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5317 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5318 { // Initialize barrier data. 5319 int b; 5320 for (b = 0; b < bs_last_barrier; ++b) { 5321 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5322 #if USE_DEBUGGER 5323 team->t.t_bar[b].b_master_arrived = 0; 5324 team->t.t_bar[b].b_team_arrived = 0; 5325 #endif 5326 } 5327 } 5328 5329 team->t.t_proc_bind = new_proc_bind; 5330 5331 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5332 team->t.t_id)); 5333 5334 #if OMPT_SUPPORT 5335 __ompt_team_assign_id(team, ompt_parallel_data); 5336 #endif 5337 5338 KMP_MB(); 5339 5340 return team; 5341 } 5342 5343 /* reap team if it is too small, then loop back and check the next one */ 5344 // not sure if this is wise, but, will be redone during the hot-teams 5345 // rewrite. 5346 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5347 team = __kmp_reap_team(team); 5348 __kmp_team_pool = team; 5349 } 5350 5351 /* nothing available in the pool, no matter, make a new team! 
*/ 5352 KMP_MB(); 5353 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5354 5355 /* and set it up */ 5356 team->t.t_max_nproc = max_nproc; 5357 /* NOTE well, for some reason allocating one big buffer and dividing it up 5358 seems to really hurt performance a lot on the P4, so, let's not use this */ 5359 __kmp_allocate_team_arrays(team, max_nproc); 5360 5361 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5362 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5363 5364 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5365 "%p to NULL\n", 5366 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5367 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5368 // memory, no need to duplicate 5369 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5370 // memory, no need to duplicate 5371 5372 if (__kmp_storage_map) { 5373 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5374 } 5375 5376 /* allocate space for arguments */ 5377 __kmp_alloc_argv_entries(argc, team, FALSE); 5378 team->t.t_argc = argc; 5379 5380 KA_TRACE(20, 5381 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5382 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5383 { // Initialize barrier data. 5384 int b; 5385 for (b = 0; b < bs_last_barrier; ++b) { 5386 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5387 #if USE_DEBUGGER 5388 team->t.t_bar[b].b_master_arrived = 0; 5389 team->t.t_bar[b].b_team_arrived = 0; 5390 #endif 5391 } 5392 } 5393 5394 team->t.t_proc_bind = new_proc_bind; 5395 5396 #if OMPT_SUPPORT 5397 __ompt_team_assign_id(team, ompt_parallel_data); 5398 team->t.ompt_serialized_team_info = NULL; 5399 #endif 5400 5401 KMP_MB(); 5402 5403 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5404 team->t.t_id)); 5405 5406 return team; 5407 } 5408 5409 /* TODO implement hot-teams at all levels */ 5410 /* TODO implement lazy thread release on demand (disband request) */ 5411 5412 /* free the team. return it to the team pool. release all the threads 5413 * associated with it */ 5414 void __kmp_free_team(kmp_root_t *root, 5415 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5416 int f; 5417 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5418 team->t.t_id)); 5419 5420 /* verify state */ 5421 KMP_DEBUG_ASSERT(root); 5422 KMP_DEBUG_ASSERT(team); 5423 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5424 KMP_DEBUG_ASSERT(team->t.t_threads); 5425 5426 int use_hot_team = team == root->r.r_hot_team; 5427 #if KMP_NESTED_HOT_TEAMS 5428 int level; 5429 kmp_hot_team_ptr_t *hot_teams; 5430 if (master) { 5431 level = team->t.t_active_level - 1; 5432 if (master->th.th_teams_microtask) { // in teams construct? 
5433 if (master->th.th_teams_size.nteams > 1) { 5434 ++level; // level was not increased in teams construct for 5435 // team_of_masters 5436 } 5437 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5438 master->th.th_teams_level == team->t.t_level) { 5439 ++level; // level was not increased in teams construct for 5440 // team_of_workers before the parallel 5441 } // team->t.t_level will be increased inside parallel 5442 } 5443 hot_teams = master->th.th_hot_teams; 5444 if (level < __kmp_hot_teams_max_level) { 5445 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5446 use_hot_team = 1; 5447 } 5448 } 5449 #endif // KMP_NESTED_HOT_TEAMS 5450 5451 /* team is done working */ 5452 TCW_SYNC_PTR(team->t.t_pkfn, 5453 NULL); // Important for Debugging Support Library. 5454 #if KMP_OS_WINDOWS 5455 team->t.t_copyin_counter = 0; // init counter for possible reuse 5456 #endif 5457 // Do not reset pointer to parent team to NULL for hot teams. 5458 5459 /* if we are non-hot team, release our threads */ 5460 if (!use_hot_team) { 5461 if (__kmp_tasking_mode != tskm_immediate_exec) { 5462 // Wait for threads to reach reapable state 5463 for (f = 1; f < team->t.t_nproc; ++f) { 5464 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5465 kmp_info_t *th = team->t.t_threads[f]; 5466 volatile kmp_uint32 *state = &th->th.th_reap_state; 5467 while (*state != KMP_SAFE_TO_REAP) { 5468 #if KMP_OS_WINDOWS 5469 // On Windows a thread can be killed at any time, check this 5470 DWORD ecode; 5471 if (!__kmp_is_thread_alive(th, &ecode)) { 5472 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5473 break; 5474 } 5475 #endif 5476 // first check if thread is sleeping 5477 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5478 if (fl.is_sleeping()) 5479 fl.resume(__kmp_gtid_from_thread(th)); 5480 KMP_CPU_PAUSE(); 5481 } 5482 } 5483 5484 // Delete task teams 5485 int tt_idx; 5486 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5487 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5488 if (task_team != NULL) { 5489 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5490 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5491 team->t.t_threads[f]->th.th_task_team = NULL; 5492 } 5493 KA_TRACE( 5494 20, 5495 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5496 __kmp_get_gtid(), task_team, team->t.t_id)); 5497 #if KMP_NESTED_HOT_TEAMS 5498 __kmp_free_task_team(master, task_team); 5499 #endif 5500 team->t.t_task_team[tt_idx] = NULL; 5501 } 5502 } 5503 } 5504 5505 // Reset pointer to parent team only for non-hot teams. 
5506 team->t.t_parent = NULL; 5507 team->t.t_level = 0; 5508 team->t.t_active_level = 0; 5509 5510 /* free the worker threads */ 5511 for (f = 1; f < team->t.t_nproc; ++f) { 5512 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5513 __kmp_free_thread(team->t.t_threads[f]); 5514 team->t.t_threads[f] = NULL; 5515 } 5516 5517 /* put the team back in the team pool */ 5518 /* TODO limit size of team pool, call reap_team if pool too large */ 5519 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5520 __kmp_team_pool = (volatile kmp_team_t *)team; 5521 } else { // Check if team was created for the masters in a teams construct 5522 // See if first worker is a CG root 5523 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5524 team->t.t_threads[1]->th.th_cg_roots); 5525 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5526 // Clean up the CG root nodes on workers so that this team can be re-used 5527 for (f = 1; f < team->t.t_nproc; ++f) { 5528 kmp_info_t *thr = team->t.t_threads[f]; 5529 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5530 thr->th.th_cg_roots->cg_root == thr); 5531 // Pop current CG root off list 5532 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5533 thr->th.th_cg_roots = tmp->up; 5534 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5535 " up to node %p. cg_nthreads was %d\n", 5536 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5537 int i = tmp->cg_nthreads--; 5538 if (i == 1) { 5539 __kmp_free(tmp); // free CG if we are the last thread in it 5540 } 5541 // Restore current task's thread_limit from CG root 5542 if (thr->th.th_cg_roots) 5543 thr->th.th_current_task->td_icvs.thread_limit = 5544 thr->th.th_cg_roots->cg_thread_limit; 5545 } 5546 } 5547 } 5548 5549 KMP_MB(); 5550 } 5551 5552 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5553 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5554 kmp_team_t *next_pool = team->t.t_next_pool; 5555 5556 KMP_DEBUG_ASSERT(team); 5557 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5558 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5559 KMP_DEBUG_ASSERT(team->t.t_threads); 5560 KMP_DEBUG_ASSERT(team->t.t_argv); 5561 5562 /* TODO clean the threads that are a part of this? */ 5563 5564 /* free stuff */ 5565 __kmp_free_team_arrays(team); 5566 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5567 __kmp_free((void *)team->t.t_argv); 5568 __kmp_free(team); 5569 5570 KMP_MB(); 5571 return next_pool; 5572 } 5573 5574 // Free the thread. Don't reap it, just place it on the pool of available 5575 // threads. 5576 // 5577 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5578 // binding for the affinity mechanism to be useful. 5579 // 5580 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5581 // However, we want to avoid a potential performance problem by always 5582 // scanning through the list to find the correct point at which to insert 5583 // the thread (potential N**2 behavior). To do this we keep track of the 5584 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5585 // With single-level parallelism, threads will always be added to the tail 5586 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5587 // parallelism, all bets are off and we may need to scan through the entire 5588 // free list. 5589 // 5590 // This change also has a potentially large performance benefit, for some 5591 // applications. 
Previously, as threads were freed from the hot team, they 5592 // would be placed back on the free list in inverse order. If the hot team 5593 // grew back to it's original size, then the freed thread would be placed 5594 // back on the hot team in reverse order. This could cause bad cache 5595 // locality problems on programs where the size of the hot team regularly 5596 // grew and shrunk. 5597 // 5598 // Now, for single-level parallelism, the OMP tid is always == gtid. 5599 void __kmp_free_thread(kmp_info_t *this_th) { 5600 int gtid; 5601 kmp_info_t **scan; 5602 5603 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5604 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5605 5606 KMP_DEBUG_ASSERT(this_th); 5607 5608 // When moving thread to pool, switch thread to wait on own b_go flag, and 5609 // uninitialized (NULL team). 5610 int b; 5611 kmp_balign_t *balign = this_th->th.th_bar; 5612 for (b = 0; b < bs_last_barrier; ++b) { 5613 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5614 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5615 balign[b].bb.team = NULL; 5616 balign[b].bb.leaf_kids = 0; 5617 } 5618 this_th->th.th_task_state = 0; 5619 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5620 5621 /* put thread back on the free pool */ 5622 TCW_PTR(this_th->th.th_team, NULL); 5623 TCW_PTR(this_th->th.th_root, NULL); 5624 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5625 5626 while (this_th->th.th_cg_roots) { 5627 this_th->th.th_cg_roots->cg_nthreads--; 5628 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5629 " %p of thread %p to %d\n", 5630 this_th, this_th->th.th_cg_roots, 5631 this_th->th.th_cg_roots->cg_root, 5632 this_th->th.th_cg_roots->cg_nthreads)); 5633 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5634 if (tmp->cg_root == this_th) { // Thread is a cg_root 5635 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5636 KA_TRACE( 5637 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5638 this_th->th.th_cg_roots = tmp->up; 5639 __kmp_free(tmp); 5640 } else { // Worker thread 5641 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5642 __kmp_free(tmp); 5643 } 5644 this_th->th.th_cg_roots = NULL; 5645 break; 5646 } 5647 } 5648 5649 /* If the implicit task assigned to this thread can be used by other threads 5650 * -> multiple threads can share the data and try to free the task at 5651 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5652 * with higher probability when hot team is disabled but can occurs even when 5653 * the hot team is enabled */ 5654 __kmp_free_implicit_task(this_th); 5655 this_th->th.th_current_task = NULL; 5656 5657 // If the __kmp_thread_pool_insert_pt is already past the new insert 5658 // point, then we need to re-scan the entire list. 5659 gtid = this_th->th.th_info.ds.ds_gtid; 5660 if (__kmp_thread_pool_insert_pt != NULL) { 5661 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5662 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5663 __kmp_thread_pool_insert_pt = NULL; 5664 } 5665 } 5666 5667 // Scan down the list to find the place to insert the thread. 5668 // scan is the address of a link in the list, possibly the address of 5669 // __kmp_thread_pool itself. 5670 // 5671 // In the absence of nested parallelism, the for loop will have 0 iterations. 
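  // A self-contained illustration of the sorted-pool-with-insertion-hint
  // scheme described above, using a hypothetical node type instead of
  // kmp_info_t (all names below are illustrative only, not part of the
  // runtime):
#if 0
struct pool_node_t {
  int gtid;          // sort key: global thread id
  pool_node_t *next; // singly linked, kept in ascending gtid order
};

static pool_node_t *pool = nullptr;        // head of the sorted free list
static pool_node_t *insert_hint = nullptr; // where the last insert happened

static void pool_insert(pool_node_t *n) {
  // If the hint is already past the new key, it cannot be used -- rescan.
  if (insert_hint != nullptr && insert_hint->gtid > n->gtid)
    insert_hint = nullptr;
  pool_node_t **scan = insert_hint ? &insert_hint->next : &pool;
  // With single-level parallelism this loop does zero iterations, because
  // threads come back in ascending gtid order and the hint is the tail.
  while (*scan != nullptr && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan;
  *scan = n;
  insert_hint = n; // remember the insertion point for the next call
}
#endif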
5672 if (__kmp_thread_pool_insert_pt != NULL) { 5673 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5674 } else { 5675 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5676 } 5677 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5678 scan = &((*scan)->th.th_next_pool)) 5679 ; 5680 5681 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5682 // to its address. 5683 TCW_PTR(this_th->th.th_next_pool, *scan); 5684 __kmp_thread_pool_insert_pt = *scan = this_th; 5685 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5686 (this_th->th.th_info.ds.ds_gtid < 5687 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5688 TCW_4(this_th->th.th_in_pool, TRUE); 5689 __kmp_suspend_initialize_thread(this_th); 5690 __kmp_lock_suspend_mx(this_th); 5691 if (this_th->th.th_active == TRUE) { 5692 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5693 this_th->th.th_active_in_pool = TRUE; 5694 } 5695 #if KMP_DEBUG 5696 else { 5697 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5698 } 5699 #endif 5700 __kmp_unlock_suspend_mx(this_th); 5701 5702 TCW_4(__kmp_nth, __kmp_nth - 1); 5703 5704 #ifdef KMP_ADJUST_BLOCKTIME 5705 /* Adjust blocktime back to user setting or default if necessary */ 5706 /* Middle initialization might never have occurred */ 5707 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5708 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5709 if (__kmp_nth <= __kmp_avail_proc) { 5710 __kmp_zero_bt = FALSE; 5711 } 5712 } 5713 #endif /* KMP_ADJUST_BLOCKTIME */ 5714 5715 KMP_MB(); 5716 } 5717 5718 /* ------------------------------------------------------------------------ */ 5719 5720 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5721 #if OMP_PROFILING_SUPPORT 5722 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5723 // TODO: add a configuration option for time granularity 5724 if (ProfileTraceFile) 5725 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5726 #endif 5727 5728 int gtid = this_thr->th.th_info.ds.ds_gtid; 5729 /* void *stack_data;*/ 5730 kmp_team_t **volatile pteam; 5731 5732 KMP_MB(); 5733 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5734 5735 if (__kmp_env_consistency_check) { 5736 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5737 } 5738 5739 #if OMPT_SUPPORT 5740 ompt_data_t *thread_data; 5741 if (ompt_enabled.enabled) { 5742 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5743 *thread_data = ompt_data_none; 5744 5745 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5746 this_thr->th.ompt_thread_info.wait_id = 0; 5747 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5748 this_thr->th.ompt_thread_info.parallel_flags = 0; 5749 if (ompt_enabled.ompt_callback_thread_begin) { 5750 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5751 ompt_thread_worker, thread_data); 5752 } 5753 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5754 } 5755 #endif 5756 5757 /* This is the place where threads wait for work */ 5758 while (!TCR_4(__kmp_global.g.g_done)) { 5759 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5760 KMP_MB(); 5761 5762 /* wait for work to do */ 5763 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5764 5765 /* No tid yet since not part of a team */ 5766 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5767 5768 #if OMPT_SUPPORT 5769 if (ompt_enabled.enabled) { 5770 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5771 } 5772 #endif 5773 5774 pteam = &this_thr->th.th_team; 5775 5776 /* have we been allocated? */ 5777 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5778 /* we were just woken up, so run our new task */ 5779 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5780 int rc; 5781 KA_TRACE(20, 5782 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5783 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5784 (*pteam)->t.t_pkfn)); 5785 5786 updateHWFPControl(*pteam); 5787 5788 #if OMPT_SUPPORT 5789 if (ompt_enabled.enabled) { 5790 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5791 } 5792 #endif 5793 5794 rc = (*pteam)->t.t_invoke(gtid); 5795 KMP_ASSERT(rc); 5796 5797 KMP_MB(); 5798 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5799 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5800 (*pteam)->t.t_pkfn)); 5801 } 5802 #if OMPT_SUPPORT 5803 if (ompt_enabled.enabled) { 5804 /* no frame set while outside task */ 5805 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5806 5807 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5808 } 5809 #endif 5810 /* join barrier after parallel region */ 5811 __kmp_join_barrier(gtid); 5812 } 5813 } 5814 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5815 5816 #if OMPT_SUPPORT 5817 if (ompt_enabled.ompt_callback_thread_end) { 5818 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5819 } 5820 #endif 5821 5822 this_thr->th.th_task_team = NULL; 5823 /* run the destructors for the threadprivate data for this thread */ 5824 __kmp_common_destroy_gtid(gtid); 5825 5826 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5827 KMP_MB(); 5828 5829 #if OMP_PROFILING_SUPPORT 5830 llvm::timeTraceProfilerFinishThread(); 5831 #endif 5832 return this_thr; 5833 } 5834 5835 /* ------------------------------------------------------------------------ */ 5836 5837 void __kmp_internal_end_dest(void *specific_gtid) { 5838 // Make sure no significant bits are lost 5839 int gtid; 5840 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5841 5842 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5843 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5844 * this is because 0 is reserved for the nothing-stored case */ 5845 5846 __kmp_internal_end_thread(gtid); 5847 } 5848 
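// The gtid-plus-one encoding that __kmp_internal_end_dest decodes above can
// be shown in isolation: the OS reports "nothing stored" as 0, so a real
// gtid g is kept in thread-local storage as g + 1 and decoded with a
// matching - 1.  A hedged sketch using plain pthread keys (illustrative
// only; the runtime's actual TLS path differs):
#if 0
#include <pthread.h>
#include <stdint.h>

static pthread_key_t gtid_key; // assume pthread_key_create() was called with
                               // a destructor analogous to the one above

static void store_gtid(int gtid) {
  // store gtid + 1 so that gtid 0 is distinguishable from "no value stored"
  pthread_setspecific(gtid_key, (void *)(intptr_t)(gtid + 1));
}

static int load_gtid(void) {
  intptr_t v = (intptr_t)pthread_getspecific(gtid_key);
  return v ? (int)(v - 1) : -1; // -1: no gtid assigned yet
}
#endif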
5849 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5850 5851 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5852 __kmp_internal_end_atexit(); 5853 } 5854 5855 #endif 5856 5857 /* [Windows] josh: when the atexit handler is called, there may still be more 5858 than one thread alive */ 5859 void __kmp_internal_end_atexit(void) { 5860 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5861 /* [Windows] 5862 josh: ideally, we want to completely shutdown the library in this atexit 5863 handler, but stat code that depends on thread specific data for gtid fails 5864 because that data becomes unavailable at some point during the shutdown, so 5865 we call __kmp_internal_end_thread instead. We should eventually remove the 5866 dependency on __kmp_get_specific_gtid in the stat code and use 5867 __kmp_internal_end_library to cleanly shutdown the library. 5868 5869 // TODO: Can some of this comment about GVS be removed? 5870 I suspect that the offending stat code is executed when the calling thread 5871 tries to clean up a dead root thread's data structures, resulting in GVS 5872 code trying to close the GVS structures for that thread, but since the stat 5873 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5874 the calling thread is cleaning up itself instead of another thread, it get 5875 confused. This happens because allowing a thread to unregister and cleanup 5876 another thread is a recent modification for addressing an issue. 5877 Based on the current design (20050722), a thread may end up 5878 trying to unregister another thread only if thread death does not trigger 5879 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5880 thread specific data destructor function to detect thread death. For 5881 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5882 is nothing. Thus, the workaround is applicable only for Windows static 5883 stat library. */ 5884 __kmp_internal_end_library(-1); 5885 #if KMP_OS_WINDOWS 5886 __kmp_close_console(); 5887 #endif 5888 } 5889 5890 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5891 // It is assumed __kmp_forkjoin_lock is acquired. 5892 5893 int gtid; 5894 5895 KMP_DEBUG_ASSERT(thread != NULL); 5896 5897 gtid = thread->th.th_info.ds.ds_gtid; 5898 5899 if (!is_root) { 5900 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5901 /* Assume the threads are at the fork barrier here */ 5902 KA_TRACE( 5903 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5904 gtid)); 5905 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5906 * (GEH) */ 5907 ANNOTATE_HAPPENS_BEFORE(thread); 5908 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5909 thread); 5910 __kmp_release_64(&flag); 5911 } 5912 5913 // Terminate OS thread. 5914 __kmp_reap_worker(thread); 5915 5916 // The thread was killed asynchronously. If it was actively 5917 // spinning in the thread pool, decrement the global count. 5918 // 5919 // There is a small timing hole here - if the worker thread was just waking 5920 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5921 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5922 // the global counter might not get updated. 5923 // 5924 // Currently, this can only happen as the library is unloaded, 5925 // so there are no harmful side effects. 
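    // The window described above, spelled out as an interleaving
    // (illustrative; the exact statements live in the pool thread's wake-up
    // path and in the check just below):
    //
    //   pool thread waking up               reaper (library unload)
    //   ---------------------               -----------------------
    //   th_active_in_pool = FALSE;
    //                                       if (th.th_active_in_pool)  // FALSE
    //                                         // decrement skipped
    //   <killed before it decrements
    //    __kmp_thread_pool_active_nth>
    //
    // so the counter can be left one too high; this is tolerated because it
    // can only happen while the library is being unloaded.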
5926 if (thread->th.th_active_in_pool) { 5927 thread->th.th_active_in_pool = FALSE; 5928 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5929 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5930 } 5931 } 5932 5933 __kmp_free_implicit_task(thread); 5934 5935 // Free the fast memory for tasking 5936 #if USE_FAST_MEMORY 5937 __kmp_free_fast_memory(thread); 5938 #endif /* USE_FAST_MEMORY */ 5939 5940 __kmp_suspend_uninitialize_thread(thread); 5941 5942 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5943 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5944 5945 --__kmp_all_nth; 5946 // __kmp_nth was decremented when thread is added to the pool. 5947 5948 #ifdef KMP_ADJUST_BLOCKTIME 5949 /* Adjust blocktime back to user setting or default if necessary */ 5950 /* Middle initialization might never have occurred */ 5951 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5952 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5953 if (__kmp_nth <= __kmp_avail_proc) { 5954 __kmp_zero_bt = FALSE; 5955 } 5956 } 5957 #endif /* KMP_ADJUST_BLOCKTIME */ 5958 5959 /* free the memory being used */ 5960 if (__kmp_env_consistency_check) { 5961 if (thread->th.th_cons) { 5962 __kmp_free_cons_stack(thread->th.th_cons); 5963 thread->th.th_cons = NULL; 5964 } 5965 } 5966 5967 if (thread->th.th_pri_common != NULL) { 5968 __kmp_free(thread->th.th_pri_common); 5969 thread->th.th_pri_common = NULL; 5970 } 5971 5972 if (thread->th.th_task_state_memo_stack != NULL) { 5973 __kmp_free(thread->th.th_task_state_memo_stack); 5974 thread->th.th_task_state_memo_stack = NULL; 5975 } 5976 5977 #if KMP_USE_BGET 5978 if (thread->th.th_local.bget_data != NULL) { 5979 __kmp_finalize_bget(thread); 5980 } 5981 #endif 5982 5983 #if KMP_AFFINITY_SUPPORTED 5984 if (thread->th.th_affin_mask != NULL) { 5985 KMP_CPU_FREE(thread->th.th_affin_mask); 5986 thread->th.th_affin_mask = NULL; 5987 } 5988 #endif /* KMP_AFFINITY_SUPPORTED */ 5989 5990 #if KMP_USE_HIER_SCHED 5991 if (thread->th.th_hier_bar_data != NULL) { 5992 __kmp_free(thread->th.th_hier_bar_data); 5993 thread->th.th_hier_bar_data = NULL; 5994 } 5995 #endif 5996 5997 __kmp_reap_team(thread->th.th_serial_team); 5998 thread->th.th_serial_team = NULL; 5999 __kmp_free(thread); 6000 6001 KMP_MB(); 6002 6003 } // __kmp_reap_thread 6004 6005 static void __kmp_internal_end(void) { 6006 int i; 6007 6008 /* First, unregister the library */ 6009 __kmp_unregister_library(); 6010 6011 #if KMP_OS_WINDOWS 6012 /* In Win static library, we can't tell when a root actually dies, so we 6013 reclaim the data structures for any root threads that have died but not 6014 unregistered themselves, in order to shut down cleanly. 6015 In Win dynamic library we also can't tell when a thread dies. */ 6016 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6017 // dead roots 6018 #endif 6019 6020 for (i = 0; i < __kmp_threads_capacity; i++) 6021 if (__kmp_root[i]) 6022 if (__kmp_root[i]->r.r_active) 6023 break; 6024 KMP_MB(); /* Flush all pending memory write invalidates. */ 6025 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6026 6027 if (i < __kmp_threads_capacity) { 6028 #if KMP_USE_MONITOR 6029 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6030 KMP_MB(); /* Flush all pending memory write invalidates. */ 6031 6032 // Need to check that monitor was initialized before reaping it. 
If we are 6033 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6034 // __kmp_monitor will appear to contain valid data, but it is only valid in 6035 // the parent process, not the child. 6036 // New behavior (201008): instead of keying off of the flag 6037 // __kmp_init_parallel, the monitor thread creation is keyed off 6038 // of the new flag __kmp_init_monitor. 6039 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6040 if (TCR_4(__kmp_init_monitor)) { 6041 __kmp_reap_monitor(&__kmp_monitor); 6042 TCW_4(__kmp_init_monitor, 0); 6043 } 6044 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6045 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6046 #endif // KMP_USE_MONITOR 6047 } else { 6048 /* TODO move this to cleanup code */ 6049 #ifdef KMP_DEBUG 6050 /* make sure that everything has properly ended */ 6051 for (i = 0; i < __kmp_threads_capacity; i++) { 6052 if (__kmp_root[i]) { 6053 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6054 // there can be uber threads alive here 6055 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6056 } 6057 } 6058 #endif 6059 6060 KMP_MB(); 6061 6062 // Reap the worker threads. 6063 // This is valid for now, but be careful if threads are reaped sooner. 6064 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6065 // Get the next thread from the pool. 6066 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6067 __kmp_thread_pool = thread->th.th_next_pool; 6068 // Reap it. 6069 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6070 thread->th.th_next_pool = NULL; 6071 thread->th.th_in_pool = FALSE; 6072 __kmp_reap_thread(thread, 0); 6073 } 6074 __kmp_thread_pool_insert_pt = NULL; 6075 6076 // Reap teams. 6077 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6078 // Get the next team from the pool. 6079 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6080 __kmp_team_pool = team->t.t_next_pool; 6081 // Reap it. 6082 team->t.t_next_pool = NULL; 6083 __kmp_reap_team(team); 6084 } 6085 6086 __kmp_reap_task_teams(); 6087 6088 #if KMP_OS_UNIX 6089 // Threads that are not reaped should not access any resources since they 6090 // are going to be deallocated soon, so the shutdown sequence should wait 6091 // until all threads either exit the final spin-waiting loop or begin 6092 // sleeping after the given blocktime. 6093 for (i = 0; i < __kmp_threads_capacity; i++) { 6094 kmp_info_t *thr = __kmp_threads[i]; 6095 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6096 KMP_CPU_PAUSE(); 6097 } 6098 #endif 6099 6100 for (i = 0; i < __kmp_threads_capacity; ++i) { 6101 // TBD: Add some checking... 6102 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6103 } 6104 6105 /* Make sure all threadprivate destructors get run by joining with all 6106 worker threads before resetting this flag */ 6107 TCW_SYNC_4(__kmp_init_common, FALSE); 6108 6109 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6110 KMP_MB(); 6111 6112 #if KMP_USE_MONITOR 6113 // See note above: One of the possible fixes for CQ138434 / CQ140126 6114 // 6115 // FIXME: push both code fragments down and CSE them? 6116 // push them into __kmp_cleanup() ? 
6117 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6118 if (TCR_4(__kmp_init_monitor)) { 6119 __kmp_reap_monitor(&__kmp_monitor); 6120 TCW_4(__kmp_init_monitor, 0); 6121 } 6122 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6123 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6124 #endif 6125 } /* else !__kmp_global.t_active */ 6126 TCW_4(__kmp_init_gtid, FALSE); 6127 KMP_MB(); /* Flush all pending memory write invalidates. */ 6128 6129 __kmp_cleanup(); 6130 #if OMPT_SUPPORT 6131 ompt_fini(); 6132 #endif 6133 } 6134 6135 void __kmp_internal_end_library(int gtid_req) { 6136 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6137 /* this shouldn't be a race condition because __kmp_internal_end() is the 6138 only place to clear __kmp_serial_init */ 6139 /* we'll check this later too, after we get the lock */ 6140 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6141 // redundant, because the next check will work in any case. 6142 if (__kmp_global.g.g_abort) { 6143 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6144 /* TODO abort? */ 6145 return; 6146 } 6147 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6148 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6149 return; 6150 } 6151 6152 KMP_MB(); /* Flush all pending memory write invalidates. */ 6153 /* find out who we are and what we should do */ 6154 { 6155 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6156 KA_TRACE( 6157 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6158 if (gtid == KMP_GTID_SHUTDOWN) { 6159 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6160 "already shutdown\n")); 6161 return; 6162 } else if (gtid == KMP_GTID_MONITOR) { 6163 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6164 "registered, or system shutdown\n")); 6165 return; 6166 } else if (gtid == KMP_GTID_DNE) { 6167 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6168 "shutdown\n")); 6169 /* we don't know who we are, but we may still shutdown the library */ 6170 } else if (KMP_UBER_GTID(gtid)) { 6171 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6172 if (__kmp_root[gtid]->r.r_active) { 6173 __kmp_global.g.g_abort = -1; 6174 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6175 __kmp_unregister_library(); 6176 KA_TRACE(10, 6177 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6178 gtid)); 6179 return; 6180 } else { 6181 KA_TRACE( 6182 10, 6183 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6184 __kmp_unregister_root_current_thread(gtid); 6185 } 6186 } else { 6187 /* worker threads may call this function through the atexit handler, if they 6188 * call exit() */ 6189 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6190 TODO: do a thorough shutdown instead */ 6191 #ifdef DUMP_DEBUG_ON_EXIT 6192 if (__kmp_debug_buf) 6193 __kmp_dump_debug_buffer(); 6194 #endif 6195 // added unregister library call here when we switch to shm linux 6196 // if we don't, it will leave lots of files in /dev/shm 6197 // cleanup shared memory file before exiting. 
6198 __kmp_unregister_library(); 6199 return; 6200 } 6201 } 6202 /* synchronize the termination process */ 6203 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6204 6205 /* have we already finished */ 6206 if (__kmp_global.g.g_abort) { 6207 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6208 /* TODO abort? */ 6209 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6210 return; 6211 } 6212 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6213 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6214 return; 6215 } 6216 6217 /* We need this lock to enforce mutex between this reading of 6218 __kmp_threads_capacity and the writing by __kmp_register_root. 6219 Alternatively, we can use a counter of roots that is atomically updated by 6220 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6221 __kmp_internal_end_*. */ 6222 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6223 6224 /* now we can safely conduct the actual termination */ 6225 __kmp_internal_end(); 6226 6227 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6228 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6229 6230 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6231 6232 #ifdef DUMP_DEBUG_ON_EXIT 6233 if (__kmp_debug_buf) 6234 __kmp_dump_debug_buffer(); 6235 #endif 6236 6237 #if KMP_OS_WINDOWS 6238 __kmp_close_console(); 6239 #endif 6240 6241 __kmp_fini_allocator(); 6242 6243 } // __kmp_internal_end_library 6244 6245 void __kmp_internal_end_thread(int gtid_req) { 6246 int i; 6247 6248 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6249 /* this shouldn't be a race condition because __kmp_internal_end() is the 6250 * only place to clear __kmp_serial_init */ 6251 /* we'll check this later too, after we get the lock */ 6252 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6253 // redundant, because the next check will work in any case. 6254 if (__kmp_global.g.g_abort) { 6255 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6256 /* TODO abort? */ 6257 return; 6258 } 6259 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6260 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6261 return; 6262 } 6263 6264 // If hidden helper team has been initialized, we need to deinit it 6265 if (TCR_4(__kmp_init_hidden_helper)) { 6266 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6267 // First release the main thread to let it continue its work 6268 __kmp_hidden_helper_main_thread_release(); 6269 // Wait until the hidden helper team has been destroyed 6270 __kmp_hidden_helper_threads_deinitz_wait(); 6271 } 6272 6273 KMP_MB(); /* Flush all pending memory write invalidates. */ 6274 6275 /* find out who we are and what we should do */ 6276 { 6277 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6278 KA_TRACE(10, 6279 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6280 if (gtid == KMP_GTID_SHUTDOWN) { 6281 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6282 "already shutdown\n")); 6283 return; 6284 } else if (gtid == KMP_GTID_MONITOR) { 6285 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6286 "registered, or system shutdown\n")); 6287 return; 6288 } else if (gtid == KMP_GTID_DNE) { 6289 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6290 "shutdown\n")); 6291 return; 6292 /* we don't know who we are */ 6293 } else if (KMP_UBER_GTID(gtid)) { 6294 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6295 if (__kmp_root[gtid]->r.r_active) { 6296 __kmp_global.g.g_abort = -1; 6297 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6298 KA_TRACE(10, 6299 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6300 gtid)); 6301 return; 6302 } else { 6303 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6304 gtid)); 6305 __kmp_unregister_root_current_thread(gtid); 6306 } 6307 } else { 6308 /* just a worker thread, let's leave */ 6309 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6310 6311 if (gtid >= 0) { 6312 __kmp_threads[gtid]->th.th_task_team = NULL; 6313 } 6314 6315 KA_TRACE(10, 6316 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6317 gtid)); 6318 return; 6319 } 6320 } 6321 #if KMP_DYNAMIC_LIB 6322 if (__kmp_pause_status != kmp_hard_paused) 6323 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6324 // because we will better shutdown later in the library destructor. 6325 { 6326 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6327 return; 6328 } 6329 #endif 6330 /* synchronize the termination process */ 6331 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6332 6333 /* have we already finished */ 6334 if (__kmp_global.g.g_abort) { 6335 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6336 /* TODO abort? */ 6337 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6338 return; 6339 } 6340 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6341 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6342 return; 6343 } 6344 6345 /* We need this lock to enforce mutex between this reading of 6346 __kmp_threads_capacity and the writing by __kmp_register_root. 6347 Alternatively, we can use a counter of roots that is atomically updated by 6348 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6349 __kmp_internal_end_*. */ 6350 6351 /* should we finish the run-time? are all siblings done? */ 6352 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6353 6354 for (i = 0; i < __kmp_threads_capacity; ++i) { 6355 if (KMP_UBER_GTID(i)) { 6356 KA_TRACE( 6357 10, 6358 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6359 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6360 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6361 return; 6362 } 6363 } 6364 6365 /* now we can safely conduct the actual termination */ 6366 6367 __kmp_internal_end(); 6368 6369 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6370 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6371 6372 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6373 6374 #ifdef DUMP_DEBUG_ON_EXIT 6375 if (__kmp_debug_buf) 6376 __kmp_dump_debug_buffer(); 6377 #endif 6378 } // __kmp_internal_end_thread 6379 6380 // ----------------------------------------------------------------------------- 6381 // Library registration stuff. 6382 6383 static long __kmp_registration_flag = 0; 6384 // Random value used to indicate library initialization. 6385 static char *__kmp_registration_str = NULL; 6386 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6387 6388 static inline char *__kmp_reg_status_name() { 6389 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6390 each thread. If registration and unregistration go in different threads 6391 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6392 env var can not be found, because the name will contain different pid. 
*/ 6393 // macOS* complains about name being too long with additional getuid() 6394 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6395 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6396 (int)getuid()); 6397 #else 6398 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6399 #endif 6400 } // __kmp_reg_status_get 6401 6402 void __kmp_register_library_startup(void) { 6403 6404 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6405 int done = 0; 6406 union { 6407 double dtime; 6408 long ltime; 6409 } time; 6410 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6411 __kmp_initialize_system_tick(); 6412 #endif 6413 __kmp_read_system_time(&time.dtime); 6414 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6415 __kmp_registration_str = 6416 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6417 __kmp_registration_flag, KMP_LIBRARY_FILE); 6418 6419 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6420 __kmp_registration_str)); 6421 6422 while (!done) { 6423 6424 char *value = NULL; // Actual value of the environment variable. 6425 6426 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6427 char *shm_name = __kmp_str_format("/%s", name); 6428 int shm_preexist = 0; 6429 char *data1; 6430 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6431 if ((fd1 == -1) && (errno == EEXIST)) { 6432 // file didn't open because it already exists. 6433 // try opening existing file 6434 fd1 = shm_open(shm_name, O_RDWR, 0666); 6435 if (fd1 == -1) { // file didn't open 6436 // error out here 6437 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6438 __kmp_msg_null); 6439 } else { 6440 // able to open existing file 6441 shm_preexist = 1; 6442 } 6443 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6444 // already exists. 6445 // error out here. 6446 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6447 __kmp_msg_null); 6448 } 6449 if (shm_preexist == 0) { 6450 // we created SHM now set size 6451 if (ftruncate(fd1, SHM_SIZE) == -1) { 6452 // error occured setting size; 6453 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6454 KMP_ERR(errno), __kmp_msg_null); 6455 } 6456 } 6457 data1 = 6458 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6459 if (data1 == MAP_FAILED) { 6460 // failed to map shared memory 6461 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6462 __kmp_msg_null); 6463 } 6464 if (shm_preexist == 0) { // set data to SHM, set value 6465 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6466 } 6467 // Read value from either what we just wrote or existing file. 6468 value = __kmp_str_format("%s", data1); // read value from SHM 6469 munmap(data1, SHM_SIZE); 6470 close(fd1); 6471 #else // Windows and unix with static library 6472 // Set environment variable, but do not overwrite if it is exist. 6473 __kmp_env_set(name, __kmp_registration_str, 0); 6474 // read value to see if it got set 6475 value = __kmp_env_get(name); 6476 #endif 6477 6478 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6479 done = 1; // Ok, environment variable set successfully, exit the loop. 6480 } else { 6481 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6482 // Check whether it alive or dead. 6483 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 
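      // The value parsed below was produced with the "%p-%lx-%s" format used
      // at registration time, so it looks like (made-up values)
      // "0x7f1234560000-cafe01ab-libomp.so": the address of the writer's
      // __kmp_registration_flag, that flag's value, and the library file
      // name.  Splitting on '-' recovers the three fields; the writer is
      // considered alive only if the encoded address is still mapped in this
      // process's address space and still holds the encoded value.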
6484 char *tail = value; 6485 char *flag_addr_str = NULL; 6486 char *flag_val_str = NULL; 6487 char const *file_name = NULL; 6488 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6489 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6490 file_name = tail; 6491 if (tail != NULL) { 6492 long *flag_addr = 0; 6493 unsigned long flag_val = 0; 6494 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6495 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6496 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6497 // First, check whether environment-encoded address is mapped into 6498 // addr space. 6499 // If so, dereference it to see if it still has the right value. 6500 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6501 neighbor = 1; 6502 } else { 6503 // If not, then we know the other copy of the library is no longer 6504 // running. 6505 neighbor = 2; 6506 } 6507 } 6508 } 6509 switch (neighbor) { 6510 case 0: // Cannot parse environment variable -- neighbor status unknown. 6511 // Assume it is the incompatible format of future version of the 6512 // library. Assume the other library is alive. 6513 // WARN( ... ); // TODO: Issue a warning. 6514 file_name = "unknown library"; 6515 KMP_FALLTHROUGH(); 6516 // Attention! Falling to the next case. That's intentional. 6517 case 1: { // Neighbor is alive. 6518 // Check it is allowed. 6519 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6520 if (!__kmp_str_match_true(duplicate_ok)) { 6521 // That's not allowed. Issue fatal error. 6522 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6523 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6524 } 6525 KMP_INTERNAL_FREE(duplicate_ok); 6526 __kmp_duplicate_library_ok = 1; 6527 done = 1; // Exit the loop. 6528 } break; 6529 case 2: { // Neighbor is dead. 6530 6531 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6532 // close shared memory. 6533 shm_unlink(shm_name); // this removes file in /dev/shm 6534 #else 6535 // Clear the variable and try to register library again. 6536 __kmp_env_unset(name); 6537 #endif 6538 } break; 6539 default: { 6540 KMP_DEBUG_ASSERT(0); 6541 } break; 6542 } 6543 } 6544 KMP_INTERNAL_FREE((void *)value); 6545 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6546 KMP_INTERNAL_FREE((void *)shm_name); 6547 #endif 6548 } // while 6549 KMP_INTERNAL_FREE((void *)name); 6550 6551 } // func __kmp_register_library_startup 6552 6553 void __kmp_unregister_library(void) { 6554 6555 char *name = __kmp_reg_status_name(); 6556 char *value = NULL; 6557 6558 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6559 char *shm_name = __kmp_str_format("/%s", name); 6560 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6561 if (fd1 == -1) { 6562 // file did not open. return. 6563 return; 6564 } 6565 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6566 if (data1 != MAP_FAILED) { 6567 value = __kmp_str_format("%s", data1); // read value from SHM 6568 munmap(data1, SHM_SIZE); 6569 } 6570 close(fd1); 6571 #else 6572 value = __kmp_env_get(name); 6573 #endif 6574 6575 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6576 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6577 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6578 // Ok, this is our variable. Delete it. 
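    // Only the copy that recognizes its own registration string removes the
    // marker; a stale or foreign value is left alone so a newer runtime's
    // registration is not destroyed. Illustrative (exact path is
    // platform-dependent): on Linux with a dynamic libomp this unlinks
    // /dev/shm/__KMP_REGISTERED_LIB_<pid>_<uid>; in the other configurations
    // it unsets the environment variable of the same name.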
6579 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6580 shm_unlink(shm_name); // this removes file in /dev/shm 6581 #else 6582 __kmp_env_unset(name); 6583 #endif 6584 } 6585 6586 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6587 KMP_INTERNAL_FREE(shm_name); 6588 #endif 6589 6590 KMP_INTERNAL_FREE(__kmp_registration_str); 6591 KMP_INTERNAL_FREE(value); 6592 KMP_INTERNAL_FREE(name); 6593 6594 __kmp_registration_flag = 0; 6595 __kmp_registration_str = NULL; 6596 6597 } // __kmp_unregister_library 6598 6599 // End of Library registration stuff. 6600 // ----------------------------------------------------------------------------- 6601 6602 #if KMP_MIC_SUPPORTED 6603 6604 static void __kmp_check_mic_type() { 6605 kmp_cpuid_t cpuid_state = {0}; 6606 kmp_cpuid_t *cs_p = &cpuid_state; 6607 __kmp_x86_cpuid(1, 0, cs_p); 6608 // We don't support mic1 at the moment 6609 if ((cs_p->eax & 0xff0) == 0xB10) { 6610 __kmp_mic_type = mic2; 6611 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6612 __kmp_mic_type = mic3; 6613 } else { 6614 __kmp_mic_type = non_mic; 6615 } 6616 } 6617 6618 #endif /* KMP_MIC_SUPPORTED */ 6619 6620 #if KMP_HAVE_UMWAIT 6621 static void __kmp_user_level_mwait_init() { 6622 struct kmp_cpuid buf; 6623 __kmp_x86_cpuid(7, 0, &buf); 6624 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6625 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6626 __kmp_umwait_enabled)); 6627 } 6628 #elif KMP_HAVE_MWAIT 6629 #ifndef AT_INTELPHIUSERMWAIT 6630 // Spurious, non-existent value that should always fail to return anything. 6631 // Will be replaced with the correct value when we know that. 6632 #define AT_INTELPHIUSERMWAIT 10000 6633 #endif 6634 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6635 // earlier OS is used to build the RTL, we'll use the following internal 6636 // function when the entry is not found. 6637 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6638 unsigned long getauxval(unsigned long) { return 0; } 6639 6640 static void __kmp_user_level_mwait_init() { 6641 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6642 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6643 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6644 // KMP_USER_LEVEL_MWAIT was set to TRUE. 
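  // Fallback sketch (hypothetical outcome): on a libc without getauxval(),
  // the weak definition above is the one that resolves and always returns 0,
  // so on a mic3 part the decision reduces to the environment override:
  //   res = getauxval(AT_INTELPHIUSERMWAIT);      // 0 from the weak stub
  //   enabled = (res & 0x1) || __kmp_user_level_mwait;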
6645 if (__kmp_mic_type == mic3) { 6646 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6647 if ((res & 0x1) || __kmp_user_level_mwait) { 6648 __kmp_mwait_enabled = TRUE; 6649 if (__kmp_user_level_mwait) { 6650 KMP_INFORM(EnvMwaitWarn); 6651 } 6652 } else { 6653 __kmp_mwait_enabled = FALSE; 6654 } 6655 } 6656 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6657 "__kmp_mwait_enabled = %d\n", 6658 __kmp_mic_type, __kmp_mwait_enabled)); 6659 } 6660 #endif /* KMP_HAVE_UMWAIT */ 6661 6662 static void __kmp_do_serial_initialize(void) { 6663 int i, gtid; 6664 size_t size; 6665 6666 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6667 6668 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6669 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6670 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6671 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6672 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6673 6674 #if OMPT_SUPPORT 6675 ompt_pre_init(); 6676 #endif 6677 6678 __kmp_validate_locks(); 6679 6680 /* Initialize internal memory allocator */ 6681 __kmp_init_allocator(); 6682 6683 /* Register the library startup via an environment variable and check to see 6684 whether another copy of the library is already registered. */ 6685 6686 __kmp_register_library_startup(); 6687 6688 /* TODO reinitialization of library */ 6689 if (TCR_4(__kmp_global.g.g_done)) { 6690 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6691 } 6692 6693 __kmp_global.g.g_abort = 0; 6694 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6695 6696 /* initialize the locks */ 6697 #if KMP_USE_ADAPTIVE_LOCKS 6698 #if KMP_DEBUG_ADAPTIVE_LOCKS 6699 __kmp_init_speculative_stats(); 6700 #endif 6701 #endif 6702 #if KMP_STATS_ENABLED 6703 __kmp_stats_init(); 6704 #endif 6705 __kmp_init_lock(&__kmp_global_lock); 6706 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6707 __kmp_init_lock(&__kmp_debug_lock); 6708 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6709 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6710 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6711 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6712 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6713 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6714 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6715 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6716 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6717 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6718 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6719 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6720 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6721 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6722 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6723 #if KMP_USE_MONITOR 6724 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6725 #endif 6726 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6727 6728 /* conduct initialization and initial setup of configuration */ 6729 6730 __kmp_runtime_initialize(); 6731 6732 #if KMP_MIC_SUPPORTED 6733 __kmp_check_mic_type(); 6734 #endif 6735 6736 // Some global variable initialization moved here from kmp_env_initialize() 6737 #ifdef KMP_DEBUG 6738 kmp_diag = 0; 6739 #endif 6740 __kmp_abort_delay = 0; 6741 6742 // From __kmp_init_dflt_team_nth() 6743 /* assume the entire machine will be used */ 6744 __kmp_dflt_team_nth_ub = __kmp_xproc; 6745 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6746 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6747 } 6748 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6749 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6750 } 6751 
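  // Worked example (hypothetical machine): with __kmp_xproc == 64 detected
  // procs, the upper bound starts at 64; a 1-proc VM would be raised to
  // KMP_MIN_NTH, and a value above __kmp_sys_max_nth would be clamped down.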
__kmp_max_nth = __kmp_sys_max_nth; 6752 __kmp_cg_max_nth = __kmp_sys_max_nth; 6753 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6754 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6755 __kmp_teams_max_nth = __kmp_sys_max_nth; 6756 } 6757 6758 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6759 // part 6760 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6761 #if KMP_USE_MONITOR 6762 __kmp_monitor_wakeups = 6763 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6764 __kmp_bt_intervals = 6765 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6766 #endif 6767 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6768 __kmp_library = library_throughput; 6769 // From KMP_SCHEDULE initialization 6770 __kmp_static = kmp_sch_static_balanced; 6771 // AC: do not use analytical here, because it is non-monotonous 6772 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6773 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6774 // need to repeat assignment 6775 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6776 // bit control and barrier method control parts 6777 #if KMP_FAST_REDUCTION_BARRIER 6778 #define kmp_reduction_barrier_gather_bb ((int)1) 6779 #define kmp_reduction_barrier_release_bb ((int)1) 6780 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6781 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6782 #endif // KMP_FAST_REDUCTION_BARRIER 6783 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6784 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6785 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6786 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6787 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6788 #if KMP_FAST_REDUCTION_BARRIER 6789 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6790 // lin_64 ): hyper,1 6791 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6792 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6793 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6794 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6795 } 6796 #endif // KMP_FAST_REDUCTION_BARRIER 6797 } 6798 #if KMP_FAST_REDUCTION_BARRIER 6799 #undef kmp_reduction_barrier_release_pat 6800 #undef kmp_reduction_barrier_gather_pat 6801 #undef kmp_reduction_barrier_release_bb 6802 #undef kmp_reduction_barrier_gather_bb 6803 #endif // KMP_FAST_REDUCTION_BARRIER 6804 #if KMP_MIC_SUPPORTED 6805 if (__kmp_mic_type == mic2) { // KNC 6806 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6807 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6808 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6809 1; // forkjoin release 6810 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6811 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6812 } 6813 #if KMP_FAST_REDUCTION_BARRIER 6814 if (__kmp_mic_type == mic2) { // KNC 6815 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6816 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6817 } 6818 #endif // KMP_FAST_REDUCTION_BARRIER 6819 #endif // KMP_MIC_SUPPORTED 6820 6821 // From KMP_CHECKS initialization 6822 #ifdef KMP_DEBUG 6823 __kmp_env_checks = TRUE; /* development versions have the 
extra checks */ 6824 #else 6825 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6826 #endif 6827 6828 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6829 __kmp_foreign_tp = TRUE; 6830 6831 __kmp_global.g.g_dynamic = FALSE; 6832 __kmp_global.g.g_dynamic_mode = dynamic_default; 6833 6834 __kmp_env_initialize(NULL); 6835 6836 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6837 __kmp_user_level_mwait_init(); 6838 #endif 6839 // Print all messages in message catalog for testing purposes. 6840 #ifdef KMP_DEBUG 6841 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6842 if (__kmp_str_match_true(val)) { 6843 kmp_str_buf_t buffer; 6844 __kmp_str_buf_init(&buffer); 6845 __kmp_i18n_dump_catalog(&buffer); 6846 __kmp_printf("%s", buffer.str); 6847 __kmp_str_buf_free(&buffer); 6848 } 6849 __kmp_env_free(&val); 6850 #endif 6851 6852 __kmp_threads_capacity = 6853 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6854 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6855 __kmp_tp_capacity = __kmp_default_tp_capacity( 6856 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6857 6858 // If the library is shut down properly, both pools must be NULL. Just in 6859 // case, set them to NULL -- some memory may leak, but subsequent code will 6860 // work even if pools are not freed. 6861 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6862 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6863 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6864 __kmp_thread_pool = NULL; 6865 __kmp_thread_pool_insert_pt = NULL; 6866 __kmp_team_pool = NULL; 6867 6868 /* Allocate all of the variable sized records */ 6869 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6870 * expandable */ 6871 /* Since allocation is cache-aligned, just add extra padding at the end */ 6872 size = 6873 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6874 CACHE_LINE; 6875 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6876 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6877 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6878 6879 /* init thread counts */ 6880 KMP_DEBUG_ASSERT(__kmp_all_nth == 6881 0); // Asserts fail if the library is reinitializing and 6882 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6883 __kmp_all_nth = 0; 6884 __kmp_nth = 0; 6885 6886 /* setup the uber master thread and hierarchy */ 6887 gtid = __kmp_register_root(TRUE); 6888 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6889 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6890 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6891 6892 KMP_MB(); /* Flush all pending memory write invalidates. */ 6893 6894 __kmp_common_initialize(); 6895 6896 #if KMP_OS_UNIX 6897 /* invoke the child fork handler */ 6898 __kmp_register_atfork(); 6899 #endif 6900 6901 #if !KMP_DYNAMIC_LIB 6902 { 6903 /* Invoke the exit handler when the program finishes, only for static 6904 library. For dynamic library, we already have _fini and DllMain. */ 6905 int rc = atexit(__kmp_internal_end_atexit); 6906 if (rc != 0) { 6907 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6908 __kmp_msg_null); 6909 } 6910 } 6911 #endif 6912 6913 #if KMP_HANDLE_SIGNALS 6914 #if KMP_OS_UNIX 6915 /* NOTE: make sure that this is called before the user installs their own 6916 signal handlers so that the user handlers are called first. this way they 6917 can return false, not call our handler, avoid terminating the library, and 6918 continue execution where they left off. 
*/ 6919 __kmp_install_signals(FALSE); 6920 #endif /* KMP_OS_UNIX */ 6921 #if KMP_OS_WINDOWS 6922 __kmp_install_signals(TRUE); 6923 #endif /* KMP_OS_WINDOWS */ 6924 #endif 6925 6926 /* we have finished the serial initialization */ 6927 __kmp_init_counter++; 6928 6929 __kmp_init_serial = TRUE; 6930 6931 if (__kmp_settings) { 6932 __kmp_env_print(); 6933 } 6934 6935 if (__kmp_display_env || __kmp_display_env_verbose) { 6936 __kmp_env_print_2(); 6937 } 6938 6939 #if OMPT_SUPPORT 6940 ompt_post_init(); 6941 #endif 6942 6943 KMP_MB(); 6944 6945 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6946 } 6947 6948 void __kmp_serial_initialize(void) { 6949 if (__kmp_init_serial) { 6950 return; 6951 } 6952 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6953 if (__kmp_init_serial) { 6954 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6955 return; 6956 } 6957 __kmp_do_serial_initialize(); 6958 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6959 } 6960 6961 static void __kmp_do_middle_initialize(void) { 6962 int i, j; 6963 int prev_dflt_team_nth; 6964 6965 if (!__kmp_init_serial) { 6966 __kmp_do_serial_initialize(); 6967 } 6968 6969 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6970 6971 // Save the previous value for the __kmp_dflt_team_nth so that 6972 // we can avoid some reinitialization if it hasn't changed. 6973 prev_dflt_team_nth = __kmp_dflt_team_nth; 6974 6975 #if KMP_AFFINITY_SUPPORTED 6976 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6977 // number of cores on the machine. 6978 __kmp_affinity_initialize(); 6979 6980 // Run through the __kmp_threads array and set the affinity mask 6981 // for each root thread that is currently registered with the RTL. 6982 for (i = 0; i < __kmp_threads_capacity; i++) { 6983 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6984 __kmp_affinity_set_init_mask(i, TRUE); 6985 } 6986 } 6987 #endif /* KMP_AFFINITY_SUPPORTED */ 6988 6989 KMP_ASSERT(__kmp_xproc > 0); 6990 if (__kmp_avail_proc == 0) { 6991 __kmp_avail_proc = __kmp_xproc; 6992 } 6993 6994 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6995 // correct them now 6996 j = 0; 6997 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6998 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6999 __kmp_avail_proc; 7000 j++; 7001 } 7002 7003 if (__kmp_dflt_team_nth == 0) { 7004 #ifdef KMP_DFLT_NTH_CORES 7005 // Default #threads = #cores 7006 __kmp_dflt_team_nth = __kmp_ncores; 7007 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7008 "__kmp_ncores (%d)\n", 7009 __kmp_dflt_team_nth)); 7010 #else 7011 // Default #threads = #available OS procs 7012 __kmp_dflt_team_nth = __kmp_avail_proc; 7013 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7014 "__kmp_avail_proc(%d)\n", 7015 __kmp_dflt_team_nth)); 7016 #endif /* KMP_DFLT_NTH_CORES */ 7017 } 7018 7019 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7020 __kmp_dflt_team_nth = KMP_MIN_NTH; 7021 } 7022 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7023 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7024 } 7025 7026 // There's no harm in continuing if the following check fails, 7027 // but it indicates an error in the previous logic. 
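  // Illustrative (hypothetical values): with OMP_NUM_THREADS=",,2,3" and
  // __kmp_avail_proc == 8, the loop above rewrites the list to 8,8,2,3 and
  // sets __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 8, so the assertion
  // below holds.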
7028 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7029 7030 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7031 // Run through the __kmp_threads array and set the num threads icv for each 7032 // root thread that is currently registered with the RTL (which has not 7033 // already explicitly set its nthreads-var with a call to 7034 // omp_set_num_threads()). 7035 for (i = 0; i < __kmp_threads_capacity; i++) { 7036 kmp_info_t *thread = __kmp_threads[i]; 7037 if (thread == NULL) 7038 continue; 7039 if (thread->th.th_current_task->td_icvs.nproc != 0) 7040 continue; 7041 7042 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7043 } 7044 } 7045 KA_TRACE( 7046 20, 7047 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7048 __kmp_dflt_team_nth)); 7049 7050 #ifdef KMP_ADJUST_BLOCKTIME 7051 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7052 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7053 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7054 if (__kmp_nth > __kmp_avail_proc) { 7055 __kmp_zero_bt = TRUE; 7056 } 7057 } 7058 #endif /* KMP_ADJUST_BLOCKTIME */ 7059 7060 /* we have finished middle initialization */ 7061 TCW_SYNC_4(__kmp_init_middle, TRUE); 7062 7063 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7064 } 7065 7066 void __kmp_middle_initialize(void) { 7067 if (__kmp_init_middle) { 7068 return; 7069 } 7070 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7071 if (__kmp_init_middle) { 7072 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7073 return; 7074 } 7075 __kmp_do_middle_initialize(); 7076 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7077 } 7078 7079 void __kmp_parallel_initialize(void) { 7080 int gtid = __kmp_entry_gtid(); // this might be a new root 7081 7082 /* synchronize parallel initialization (for sibling) */ 7083 if (TCR_4(__kmp_init_parallel)) 7084 return; 7085 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7086 if (TCR_4(__kmp_init_parallel)) { 7087 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7088 return; 7089 } 7090 7091 /* TODO reinitialization after we have already shut down */ 7092 if (TCR_4(__kmp_global.g.g_done)) { 7093 KA_TRACE( 7094 10, 7095 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7096 __kmp_infinite_loop(); 7097 } 7098 7099 /* jc: The lock __kmp_initz_lock is already held, so calling 7100 __kmp_serial_initialize would cause a deadlock. So we call 7101 __kmp_do_serial_initialize directly. */ 7102 if (!__kmp_init_middle) { 7103 __kmp_do_middle_initialize(); 7104 } 7105 __kmp_resume_if_hard_paused(); 7106 7107 /* begin initialization */ 7108 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7109 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7110 7111 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7112 // Save the FP control regs. 7113 // Worker threads will set theirs to these values at thread startup. 
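  // What is captured here (sketch): the x87 control word and the SSE MXCSR
  // register, i.e. the primary thread's rounding mode and exception masks.
  // The mask with KMP_X86_MXCSR_MASK keeps only the bits the runtime intends
  // to propagate, so workers created later start from a consistent FP
  // environment.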
7114 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7115 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7116 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7117 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7118 7119 #if KMP_OS_UNIX 7120 #if KMP_HANDLE_SIGNALS 7121 /* must be after __kmp_serial_initialize */ 7122 __kmp_install_signals(TRUE); 7123 #endif 7124 #endif 7125 7126 __kmp_suspend_initialize(); 7127 7128 #if defined(USE_LOAD_BALANCE) 7129 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7130 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7131 } 7132 #else 7133 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7134 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7135 } 7136 #endif 7137 7138 if (__kmp_version) { 7139 __kmp_print_version_2(); 7140 } 7141 7142 /* we have finished parallel initialization */ 7143 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7144 7145 KMP_MB(); 7146 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7147 7148 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7149 } 7150 7151 void __kmp_hidden_helper_initialize() { 7152 if (TCR_4(__kmp_init_hidden_helper)) 7153 return; 7154 7155 // __kmp_parallel_initialize is required before we initialize hidden helper 7156 if (!TCR_4(__kmp_init_parallel)) 7157 __kmp_parallel_initialize(); 7158 7159 // Double check. Note that this double check should not be placed before 7160 // __kmp_parallel_initialize as it will cause dead lock. 7161 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7162 if (TCR_4(__kmp_init_hidden_helper)) { 7163 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7164 return; 7165 } 7166 7167 // Set the count of hidden helper tasks to be executed to zero 7168 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7169 7170 // Set the global variable indicating that we're initializing hidden helper 7171 // team/threads 7172 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7173 7174 // Platform independent initialization 7175 __kmp_do_initialize_hidden_helper_threads(); 7176 7177 // Wait here for the finish of initialization of hidden helper teams 7178 __kmp_hidden_helper_threads_initz_wait(); 7179 7180 // We have finished hidden helper initialization 7181 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7182 7183 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7184 } 7185 7186 /* ------------------------------------------------------------------------ */ 7187 7188 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7189 kmp_team_t *team) { 7190 kmp_disp_t *dispatch; 7191 7192 KMP_MB(); 7193 7194 /* none of the threads have encountered any constructs, yet. */ 7195 this_thr->th.th_local.this_construct = 0; 7196 #if KMP_CACHE_MANAGE 7197 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7198 #endif /* KMP_CACHE_MANAGE */ 7199 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7200 KMP_DEBUG_ASSERT(dispatch); 7201 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7202 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7203 // this_thr->th.th_info.ds.ds_tid ] ); 7204 7205 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7206 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7207 if (__kmp_env_consistency_check) 7208 __kmp_push_parallel(gtid, team->t.t_ident); 7209 7210 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7211 } 7212 7213 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7214 kmp_team_t *team) { 7215 if (__kmp_env_consistency_check) 7216 __kmp_pop_parallel(gtid, team->t.t_ident); 7217 7218 __kmp_finish_implicit_task(this_thr); 7219 } 7220 7221 int __kmp_invoke_task_func(int gtid) { 7222 int rc; 7223 int tid = __kmp_tid_from_gtid(gtid); 7224 kmp_info_t *this_thr = __kmp_threads[gtid]; 7225 kmp_team_t *team = this_thr->th.th_team; 7226 7227 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7228 #if USE_ITT_BUILD 7229 if (__itt_stack_caller_create_ptr) { 7230 // inform ittnotify about entering user's code 7231 if (team->t.t_stack_id != NULL) { 7232 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7233 } else { 7234 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7235 __kmp_itt_stack_callee_enter( 7236 (__itt_caller)team->t.t_parent->t.t_stack_id); 7237 } 7238 } 7239 #endif /* USE_ITT_BUILD */ 7240 #if INCLUDE_SSC_MARKS 7241 SSC_MARK_INVOKING(); 7242 #endif 7243 7244 #if OMPT_SUPPORT 7245 void *dummy; 7246 void **exit_frame_p; 7247 ompt_data_t *my_task_data; 7248 ompt_data_t *my_parallel_data; 7249 int ompt_team_size; 7250 7251 if (ompt_enabled.enabled) { 7252 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7253 .ompt_task_info.frame.exit_frame.ptr); 7254 } else { 7255 exit_frame_p = &dummy; 7256 } 7257 7258 my_task_data = 7259 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7260 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7261 if (ompt_enabled.ompt_callback_implicit_task) { 7262 ompt_team_size = team->t.t_nproc; 7263 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7264 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7265 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7266 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7267 } 7268 #endif 7269 7270 #if KMP_STATS_ENABLED 7271 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7272 if (previous_state == stats_state_e::TEAMS_REGION) { 7273 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7274 } else { 7275 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7276 } 7277 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7278 #endif 7279 7280 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7281 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7282 #if OMPT_SUPPORT 7283 , 7284 exit_frame_p 7285 #endif 7286 ); 7287 #if OMPT_SUPPORT 7288 *exit_frame_p = NULL; 7289 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7290 #endif 7291 7292 #if KMP_STATS_ENABLED 7293 if (previous_state == stats_state_e::TEAMS_REGION) { 7294 KMP_SET_THREAD_STATE(previous_state); 7295 } 7296 KMP_POP_PARTITIONED_TIMER(); 7297 #endif 7298 7299 #if USE_ITT_BUILD 7300 if (__itt_stack_caller_create_ptr) { 7301 // inform ittnotify about leaving user's code 7302 if (team->t.t_stack_id != NULL) { 7303 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7304 } else { 7305 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7306 __kmp_itt_stack_callee_leave( 7307 (__itt_caller)team->t.t_parent->t.t_stack_id); 7308 } 7309 } 7310 #endif /* USE_ITT_BUILD */ 7311 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7312 7313 return rc; 7314 } 7315 7316 void __kmp_teams_master(int gtid) { 7317 // This routine is called by all master threads in teams construct 7318 kmp_info_t *thr = __kmp_threads[gtid]; 7319 kmp_team_t *team = thr->th.th_team; 7320 ident_t *loc = team->t.t_ident; 7321 
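  // Big picture (illustrative): for a directive such as
  //   #pragma omp teams num_teams(4) thread_limit(8)
  // __kmp_push_num_teams() below has already recorded
  // th_teams_size.{nteams,nth}; each of the league's primary threads runs this
  // routine, becomes a contention-group root, and forks the inner parallel
  // region with th_teams_size.nth threads.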
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7322 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7323 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7324 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7325 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7326 7327 // This thread is a new CG root. Set up the proper variables. 7328 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7329 tmp->cg_root = thr; // Make thr the CG root 7330 // Init to thread limit that was stored when league masters were forked 7331 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7332 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7333 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7334 " cg_nthreads to 1\n", 7335 thr, tmp)); 7336 tmp->up = thr->th.th_cg_roots; 7337 thr->th.th_cg_roots = tmp; 7338 7339 // Launch league of teams now, but not let workers execute 7340 // (they hang on fork barrier until next parallel) 7341 #if INCLUDE_SSC_MARKS 7342 SSC_MARK_FORKING(); 7343 #endif 7344 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7345 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7346 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7347 #if INCLUDE_SSC_MARKS 7348 SSC_MARK_JOINING(); 7349 #endif 7350 // If the team size was reduced from the limit, set it to the new size 7351 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7352 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7353 // AC: last parameter "1" eliminates join barrier which won't work because 7354 // worker threads are in a fork barrier waiting for more parallel regions 7355 __kmp_join_call(loc, gtid 7356 #if OMPT_SUPPORT 7357 , 7358 fork_context_intel 7359 #endif 7360 , 7361 1); 7362 } 7363 7364 int __kmp_invoke_teams_master(int gtid) { 7365 kmp_info_t *this_thr = __kmp_threads[gtid]; 7366 kmp_team_t *team = this_thr->th.th_team; 7367 #if KMP_DEBUG 7368 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7369 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7370 (void *)__kmp_teams_master); 7371 #endif 7372 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7373 #if OMPT_SUPPORT 7374 int tid = __kmp_tid_from_gtid(gtid); 7375 ompt_data_t *task_data = 7376 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7377 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7378 if (ompt_enabled.ompt_callback_implicit_task) { 7379 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7380 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7381 ompt_task_initial); 7382 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7383 } 7384 #endif 7385 __kmp_teams_master(gtid); 7386 #if OMPT_SUPPORT 7387 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7388 #endif 7389 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7390 return 1; 7391 } 7392 7393 /* this sets the requested number of threads for the next parallel region 7394 encountered by this team. 
since this should be enclosed in the forkjoin 7395 critical section it should avoid race conditions with asymmetrical nested 7396 parallelism */ 7397 7398 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7399 kmp_info_t *thr = __kmp_threads[gtid]; 7400 7401 if (num_threads > 0) 7402 thr->th.th_set_nproc = num_threads; 7403 } 7404 7405 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7406 int num_threads) { 7407 KMP_DEBUG_ASSERT(thr); 7408 // Remember the number of threads for inner parallel regions 7409 if (!TCR_4(__kmp_init_middle)) 7410 __kmp_middle_initialize(); // get internal globals calculated 7411 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7412 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7413 7414 if (num_threads == 0) { 7415 if (__kmp_teams_thread_limit > 0) { 7416 num_threads = __kmp_teams_thread_limit; 7417 } else { 7418 num_threads = __kmp_avail_proc / num_teams; 7419 } 7420 // adjust num_threads w/o warning as it is not user setting 7421 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7422 // no thread_limit clause specified - do not change thread-limit-var ICV 7423 if (num_threads > __kmp_dflt_team_nth) { 7424 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7425 } 7426 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7427 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7428 } // prevent team size to exceed thread-limit-var 7429 if (num_teams * num_threads > __kmp_teams_max_nth) { 7430 num_threads = __kmp_teams_max_nth / num_teams; 7431 } 7432 if (num_threads == 0) { 7433 num_threads = 1; 7434 } 7435 } else { 7436 // This thread will be the master of the league masters 7437 // Store new thread limit; old limit is saved in th_cg_roots list 7438 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7439 // num_threads = min(num_threads, nthreads-var) 7440 if (num_threads > __kmp_dflt_team_nth) { 7441 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7442 } 7443 if (num_teams * num_threads > __kmp_teams_max_nth) { 7444 int new_threads = __kmp_teams_max_nth / num_teams; 7445 if (new_threads == 0) { 7446 new_threads = 1; 7447 } 7448 if (new_threads != num_threads) { 7449 if (!__kmp_reserve_warn) { // user asked for too many threads 7450 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7451 __kmp_msg(kmp_ms_warning, 7452 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7453 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7454 } 7455 } 7456 num_threads = new_threads; 7457 } 7458 } 7459 thr->th.th_teams_size.nth = num_threads; 7460 } 7461 7462 /* this sets the requested number of teams for the teams region and/or 7463 the number of threads for the next parallel region encountered */ 7464 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7465 int num_threads) { 7466 kmp_info_t *thr = __kmp_threads[gtid]; 7467 KMP_DEBUG_ASSERT(num_teams >= 0); 7468 KMP_DEBUG_ASSERT(num_threads >= 0); 7469 7470 if (num_teams == 0) { 7471 if (__kmp_nteams > 0) { 7472 num_teams = __kmp_nteams; 7473 } else { 7474 num_teams = 1; // default number of teams is 1. 7475 } 7476 } 7477 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
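    // Illustrative (hypothetical numbers): num_teams(100000) on a system where
    // __kmp_teams_max_nth is 512 lands here; the request is clamped to 512 and
    // the one-time CantFormThrTeam warning below is issued.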
7478 if (!__kmp_reserve_warn) { 7479 __kmp_reserve_warn = 1; 7480 __kmp_msg(kmp_ms_warning, 7481 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7482 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7483 } 7484 num_teams = __kmp_teams_max_nth; 7485 } 7486 // Set number of teams (number of threads in the outer "parallel" of the 7487 // teams) 7488 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7489 7490 __kmp_push_thread_limit(thr, num_teams, num_threads); 7491 } 7492 7493 /* This sets the requested number of teams for the teams region and/or 7494 the number of threads for the next parallel region encountered */ 7495 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7496 int num_teams_ub, int num_threads) { 7497 kmp_info_t *thr = __kmp_threads[gtid]; 7498 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7499 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7500 KMP_DEBUG_ASSERT(num_threads >= 0); 7501 7502 if (num_teams_lb > num_teams_ub) { 7503 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7504 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7505 } 7506 7507 int num_teams = 1; // defalt number of teams is 1. 7508 7509 if (num_teams_lb == 0 && num_teams_ub > 0) 7510 num_teams_lb = num_teams_ub; 7511 7512 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7513 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7514 if (num_teams > __kmp_teams_max_nth) { 7515 if (!__kmp_reserve_warn) { 7516 __kmp_reserve_warn = 1; 7517 __kmp_msg(kmp_ms_warning, 7518 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7519 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7520 } 7521 num_teams = __kmp_teams_max_nth; 7522 } 7523 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7524 num_teams = num_teams_ub; 7525 } else { // num_teams_lb <= num_teams <= num_teams_ub 7526 if (num_threads == 0) { 7527 if (num_teams_ub > __kmp_teams_max_nth) { 7528 num_teams = num_teams_lb; 7529 } else { 7530 num_teams = num_teams_ub; 7531 } 7532 } else { 7533 num_teams = (num_threads > __kmp_teams_max_nth) 7534 ? num_teams 7535 : __kmp_teams_max_nth / num_threads; 7536 if (num_teams < num_teams_lb) { 7537 num_teams = num_teams_lb; 7538 } else if (num_teams > num_teams_ub) { 7539 num_teams = num_teams_ub; 7540 } 7541 } 7542 } 7543 // Set number of teams (number of threads in the outer "parallel" of the 7544 // teams) 7545 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7546 7547 __kmp_push_thread_limit(thr, num_teams, num_threads); 7548 } 7549 7550 // Set the proc_bind var to use in the following parallel region. 7551 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7552 kmp_info_t *thr = __kmp_threads[gtid]; 7553 thr->th.th_set_proc_bind = proc_bind; 7554 } 7555 7556 /* Launch the worker threads into the microtask. */ 7557 7558 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7559 kmp_info_t *this_thr = __kmp_threads[gtid]; 7560 7561 #ifdef KMP_DEBUG 7562 int f; 7563 #endif /* KMP_DEBUG */ 7564 7565 KMP_DEBUG_ASSERT(team); 7566 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7567 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7568 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7569 7570 team->t.t_construct = 0; /* no single directives seen yet */ 7571 team->t.t_ordered.dt.t_value = 7572 0; /* thread 0 enters the ordered section first */ 7573 7574 /* Reset the identifiers on the dispatch buffer */ 7575 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7576 if (team->t.t_max_nproc > 1) { 7577 int i; 7578 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7579 team->t.t_disp_buffer[i].buffer_index = i; 7580 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7581 } 7582 } else { 7583 team->t.t_disp_buffer[0].buffer_index = 0; 7584 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7585 } 7586 7587 KMP_MB(); /* Flush all pending memory write invalidates. */ 7588 KMP_ASSERT(this_thr->th.th_team == team); 7589 7590 #ifdef KMP_DEBUG 7591 for (f = 0; f < team->t.t_nproc; f++) { 7592 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7593 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7594 } 7595 #endif /* KMP_DEBUG */ 7596 7597 /* release the worker threads so they may begin working */ 7598 __kmp_fork_barrier(gtid, 0); 7599 } 7600 7601 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7602 kmp_info_t *this_thr = __kmp_threads[gtid]; 7603 7604 KMP_DEBUG_ASSERT(team); 7605 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7606 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7607 KMP_MB(); /* Flush all pending memory write invalidates. */ 7608 7609 /* Join barrier after fork */ 7610 7611 #ifdef KMP_DEBUG 7612 if (__kmp_threads[gtid] && 7613 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7614 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7615 __kmp_threads[gtid]); 7616 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7617 "team->t.t_nproc=%d\n", 7618 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7619 team->t.t_nproc); 7620 __kmp_print_structure(); 7621 } 7622 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7623 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7624 #endif /* KMP_DEBUG */ 7625 7626 __kmp_join_barrier(gtid); /* wait for everyone */ 7627 #if OMPT_SUPPORT 7628 if (ompt_enabled.enabled && 7629 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7630 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7631 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7632 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7633 #if OMPT_OPTIONAL 7634 void *codeptr = NULL; 7635 if (KMP_MASTER_TID(ds_tid) && 7636 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7637 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7638 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7639 7640 if (ompt_enabled.ompt_callback_sync_region_wait) { 7641 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7642 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7643 codeptr); 7644 } 7645 if (ompt_enabled.ompt_callback_sync_region) { 7646 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7647 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7648 codeptr); 7649 } 7650 #endif 7651 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7652 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7653 ompt_scope_end, NULL, task_data, 0, ds_tid, 7654 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7655 } 7656 } 7657 #endif 7658 7659 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7660 KMP_ASSERT(this_thr->th.th_team == team); 7661 } 7662 7663 /* ------------------------------------------------------------------------ */ 7664 7665 #ifdef USE_LOAD_BALANCE 7666 7667 // Return the worker threads actively spinning in the hot team, if we 7668 // are at the outermost level of parallelism. Otherwise, return 0. 7669 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7670 int i; 7671 int retval; 7672 kmp_team_t *hot_team; 7673 7674 if (root->r.r_active) { 7675 return 0; 7676 } 7677 hot_team = root->r.r_hot_team; 7678 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7679 return hot_team->t.t_nproc - 1; // Don't count master thread 7680 } 7681 7682 // Skip the master thread - it is accounted for elsewhere. 7683 retval = 0; 7684 for (i = 1; i < hot_team->t.t_nproc; i++) { 7685 if (hot_team->t.t_threads[i]->th.th_active) { 7686 retval++; 7687 } 7688 } 7689 return retval; 7690 } 7691 7692 // Perform an automatic adjustment to the number of 7693 // threads used by the next parallel region. 7694 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7695 int retval; 7696 int pool_active; 7697 int hot_team_active; 7698 int team_curr_active; 7699 int system_active; 7700 7701 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7702 set_nproc)); 7703 KMP_DEBUG_ASSERT(root); 7704 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7705 ->th.th_current_task->td_icvs.dynamic == TRUE); 7706 KMP_DEBUG_ASSERT(set_nproc > 1); 7707 7708 if (set_nproc == 1) { 7709 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7710 return 1; 7711 } 7712 7713 // Threads that are active in the thread pool, active in the hot team for this 7714 // particular root (if we are at the outer par level), and the currently 7715 // executing thread (to become the master) are available to add to the new 7716 // team, but are currently contributing to the system load, and must be 7717 // accounted for. 7718 pool_active = __kmp_thread_pool_active_nth; 7719 hot_team_active = __kmp_active_hot_team_nproc(root); 7720 team_curr_active = pool_active + hot_team_active + 1; 7721 7722 // Check the system load. 7723 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7724 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7725 "hot team active = %d\n", 7726 system_active, pool_active, hot_team_active)); 7727 7728 if (system_active < 0) { 7729 // There was an error reading the necessary info from /proc, so use the 7730 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7731 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7732 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7733 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7734 7735 // Make this call behave like the thread limit algorithm. 7736 retval = __kmp_avail_proc - __kmp_nth + 7737 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7738 if (retval > set_nproc) { 7739 retval = set_nproc; 7740 } 7741 if (retval < KMP_MIN_NTH) { 7742 retval = KMP_MIN_NTH; 7743 } 7744 7745 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7746 retval)); 7747 return retval; 7748 } 7749 7750 // There is a slight delay in the load balance algorithm in detecting new 7751 // running procs. The real system load at this instant should be at least as 7752 // large as the #active omp thread that are available to add to the team. 
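  // Worked example (hypothetical numbers): __kmp_avail_proc == 16,
  // pool_active == 2, hot_team_active == 3  =>  team_curr_active == 6.
  // If __kmp_get_load_balance() reports system_active == 10, the code below
  // yields retval = 16 - 10 + 6 = 12, which is then clamped to
  // [KMP_MIN_NTH, set_nproc].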
7753 if (system_active < team_curr_active) { 7754 system_active = team_curr_active; 7755 } 7756 retval = __kmp_avail_proc - system_active + team_curr_active; 7757 if (retval > set_nproc) { 7758 retval = set_nproc; 7759 } 7760 if (retval < KMP_MIN_NTH) { 7761 retval = KMP_MIN_NTH; 7762 } 7763 7764 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7765 return retval; 7766 } // __kmp_load_balance_nproc() 7767 7768 #endif /* USE_LOAD_BALANCE */ 7769 7770 /* ------------------------------------------------------------------------ */ 7771 7772 /* NOTE: this is called with the __kmp_init_lock held */ 7773 void __kmp_cleanup(void) { 7774 int f; 7775 7776 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7777 7778 if (TCR_4(__kmp_init_parallel)) { 7779 #if KMP_HANDLE_SIGNALS 7780 __kmp_remove_signals(); 7781 #endif 7782 TCW_4(__kmp_init_parallel, FALSE); 7783 } 7784 7785 if (TCR_4(__kmp_init_middle)) { 7786 #if KMP_AFFINITY_SUPPORTED 7787 __kmp_affinity_uninitialize(); 7788 #endif /* KMP_AFFINITY_SUPPORTED */ 7789 __kmp_cleanup_hierarchy(); 7790 TCW_4(__kmp_init_middle, FALSE); 7791 } 7792 7793 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7794 7795 if (__kmp_init_serial) { 7796 __kmp_runtime_destroy(); 7797 __kmp_init_serial = FALSE; 7798 } 7799 7800 __kmp_cleanup_threadprivate_caches(); 7801 7802 for (f = 0; f < __kmp_threads_capacity; f++) { 7803 if (__kmp_root[f] != NULL) { 7804 __kmp_free(__kmp_root[f]); 7805 __kmp_root[f] = NULL; 7806 } 7807 } 7808 __kmp_free(__kmp_threads); 7809 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7810 // there is no need in freeing __kmp_root. 7811 __kmp_threads = NULL; 7812 __kmp_root = NULL; 7813 __kmp_threads_capacity = 0; 7814 7815 #if KMP_USE_DYNAMIC_LOCK 7816 __kmp_cleanup_indirect_user_locks(); 7817 #else 7818 __kmp_cleanup_user_locks(); 7819 #endif 7820 7821 #if KMP_AFFINITY_SUPPORTED 7822 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7823 __kmp_cpuinfo_file = NULL; 7824 #endif /* KMP_AFFINITY_SUPPORTED */ 7825 7826 #if KMP_USE_ADAPTIVE_LOCKS 7827 #if KMP_DEBUG_ADAPTIVE_LOCKS 7828 __kmp_print_speculative_stats(); 7829 #endif 7830 #endif 7831 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7832 __kmp_nested_nth.nth = NULL; 7833 __kmp_nested_nth.size = 0; 7834 __kmp_nested_nth.used = 0; 7835 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7836 __kmp_nested_proc_bind.bind_types = NULL; 7837 __kmp_nested_proc_bind.size = 0; 7838 __kmp_nested_proc_bind.used = 0; 7839 if (__kmp_affinity_format) { 7840 KMP_INTERNAL_FREE(__kmp_affinity_format); 7841 __kmp_affinity_format = NULL; 7842 } 7843 7844 __kmp_i18n_catclose(); 7845 7846 #if KMP_USE_HIER_SCHED 7847 __kmp_hier_scheds.deallocate(); 7848 #endif 7849 7850 #if KMP_STATS_ENABLED 7851 __kmp_stats_fini(); 7852 #endif 7853 7854 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7855 } 7856 7857 /* ------------------------------------------------------------------------ */ 7858 7859 int __kmp_ignore_mppbeg(void) { 7860 char *env; 7861 7862 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7863 if (__kmp_str_match_false(env)) 7864 return FALSE; 7865 } 7866 // By default __kmpc_begin() is no-op. 7867 return TRUE; 7868 } 7869 7870 int __kmp_ignore_mppend(void) { 7871 char *env; 7872 7873 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7874 if (__kmp_str_match_false(env)) 7875 return FALSE; 7876 } 7877 // By default __kmpc_end() is no-op. 
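  // Illustrative: running with KMP_IGNORE_MPPEND="false" makes this function
  // return FALSE, so a compiler-inserted __kmpc_end() is no longer treated as
  // a no-op.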
7878 return TRUE; 7879 } 7880 7881 void __kmp_internal_begin(void) { 7882 int gtid; 7883 kmp_root_t *root; 7884 7885 /* this is a very important step as it will register new sibling threads 7886 and assign these new uber threads a new gtid */ 7887 gtid = __kmp_entry_gtid(); 7888 root = __kmp_threads[gtid]->th.th_root; 7889 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7890 7891 if (root->r.r_begin) 7892 return; 7893 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7894 if (root->r.r_begin) { 7895 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7896 return; 7897 } 7898 7899 root->r.r_begin = TRUE; 7900 7901 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7902 } 7903 7904 /* ------------------------------------------------------------------------ */ 7905 7906 void __kmp_user_set_library(enum library_type arg) { 7907 int gtid; 7908 kmp_root_t *root; 7909 kmp_info_t *thread; 7910 7911 /* first, make sure we are initialized so we can get our gtid */ 7912 7913 gtid = __kmp_entry_gtid(); 7914 thread = __kmp_threads[gtid]; 7915 7916 root = thread->th.th_root; 7917 7918 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7919 library_serial)); 7920 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7921 thread */ 7922 KMP_WARNING(SetLibraryIncorrectCall); 7923 return; 7924 } 7925 7926 switch (arg) { 7927 case library_serial: 7928 thread->th.th_set_nproc = 0; 7929 set__nproc(thread, 1); 7930 break; 7931 case library_turnaround: 7932 thread->th.th_set_nproc = 0; 7933 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7934 : __kmp_dflt_team_nth_ub); 7935 break; 7936 case library_throughput: 7937 thread->th.th_set_nproc = 0; 7938 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7939 : __kmp_dflt_team_nth_ub); 7940 break; 7941 default: 7942 KMP_FATAL(UnknownLibraryType, arg); 7943 } 7944 7945 __kmp_aux_set_library(arg); 7946 } 7947 7948 void __kmp_aux_set_stacksize(size_t arg) { 7949 if (!__kmp_init_serial) 7950 __kmp_serial_initialize(); 7951 7952 #if KMP_OS_DARWIN 7953 if (arg & (0x1000 - 1)) { 7954 arg &= ~(0x1000 - 1); 7955 if (arg + 0x1000) /* check for overflow if we round up */ 7956 arg += 0x1000; 7957 } 7958 #endif 7959 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7960 7961 /* only change the default stacksize before the first parallel region */ 7962 if (!TCR_4(__kmp_init_parallel)) { 7963 size_t value = arg; /* argument is in bytes */ 7964 7965 if (value < __kmp_sys_min_stksize) 7966 value = __kmp_sys_min_stksize; 7967 else if (value > KMP_MAX_STKSIZE) 7968 value = KMP_MAX_STKSIZE; 7969 7970 __kmp_stksize = value; 7971 7972 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7973 } 7974 7975 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7976 } 7977 7978 /* set the behaviour of the runtime library */ 7979 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7980 void __kmp_aux_set_library(enum library_type arg) { 7981 __kmp_library = arg; 7982 7983 switch (__kmp_library) { 7984 case library_serial: { 7985 KMP_INFORM(LibraryIsSerial); 7986 } break; 7987 case library_turnaround: 7988 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 7989 __kmp_use_yield = 2; // only yield when oversubscribed 7990 break; 7991 case library_throughput: 7992 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 7993 __kmp_dflt_blocktime = 200; 7994 break; 7995 default: 7996 KMP_FATAL(UnknownLibraryType, arg); 7997 } 7998 } 7999 8000 /* Getting team information common for all team API */ 8001 // Returns NULL if not in teams construct 8002 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8003 kmp_info_t *thr = __kmp_entry_thread(); 8004 teams_serialized = 0; 8005 if (thr->th.th_teams_microtask) { 8006 kmp_team_t *team = thr->th.th_team; 8007 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8008 int ii = team->t.t_level; 8009 teams_serialized = team->t.t_serialized; 8010 int level = tlevel + 1; 8011 KMP_DEBUG_ASSERT(ii >= tlevel); 8012 while (ii > level) { 8013 for (teams_serialized = team->t.t_serialized; 8014 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8015 } 8016 if (team->t.t_serialized && (!teams_serialized)) { 8017 team = team->t.t_parent; 8018 continue; 8019 } 8020 if (ii > level) { 8021 team = team->t.t_parent; 8022 ii--; 8023 } 8024 } 8025 return team; 8026 } 8027 return NULL; 8028 } 8029 8030 int __kmp_aux_get_team_num() { 8031 int serialized; 8032 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8033 if (team) { 8034 if (serialized > 1) { 8035 return 0; // teams region is serialized ( 1 team of 1 thread ). 8036 } else { 8037 return team->t.t_master_tid; 8038 } 8039 } 8040 return 0; 8041 } 8042 8043 int __kmp_aux_get_num_teams() { 8044 int serialized; 8045 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8046 if (team) { 8047 if (serialized > 1) { 8048 return 1; 8049 } else { 8050 return team->t.t_parent->t.t_nproc; 8051 } 8052 } 8053 return 1; 8054 } 8055 8056 /* ------------------------------------------------------------------------ */ 8057 8058 /* 8059 * Affinity Format Parser 8060 * 8061 * Field is in form of: %[[[0].]size]type 8062 * % and type are required (%% means print a literal '%') 8063 * type is either single char or long name surrounded by {}, 8064 * e.g., N or {num_threads} 8065 * 0 => leading zeros 8066 * . => right justified when size is specified 8067 * by default output is left justified 8068 * size is the *minimum* field length 8069 * All other characters are printed as is 8070 * 8071 * Available field types: 8072 * L {thread_level} - omp_get_level() 8073 * n {thread_num} - omp_get_thread_num() 8074 * h {host} - name of host machine 8075 * P {process_id} - process id (integer) 8076 * T {thread_identifier} - native thread identifier (integer) 8077 * N {num_threads} - omp_get_num_threads() 8078 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8079 * a {thread_affinity} - comma separated list of integers or integer ranges 8080 * (values of affinity mask) 8081 * 8082 * Implementation-specific field types can be added 8083 * If a type is unknown, print "undefined" 8084 */ 8085 8086 // Structure holding the short name, long name, and corresponding data type 8087 // for snprintf. A table of these will represent the entire valid keyword 8088 // field types. 
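// Example expansion of the grammar above (hypothetical host/thread values):
//   format:  "host=%H tid=%0.4n aff=%{thread_affinity}"
//   output:  "host=node17 tid=0003 aff=0-3"
// i.e. %H prints the host name, "0.4" requests a zero-padded, right-justified
// minimum width of 4 for the thread number, and the long-name field prints the
// current affinity mask as a list of integers or ranges.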
8089 typedef struct kmp_affinity_format_field_t { 8090 char short_name; // from spec e.g., L -> thread level 8091 const char *long_name; // from spec thread_level -> thread level 8092 char field_format; // data type for snprintf (typically 'd' or 's' 8093 // for integer or string) 8094 } kmp_affinity_format_field_t; 8095 8096 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8097 #if KMP_AFFINITY_SUPPORTED 8098 {'A', "thread_affinity", 's'}, 8099 #endif 8100 {'t', "team_num", 'd'}, 8101 {'T', "num_teams", 'd'}, 8102 {'L', "nesting_level", 'd'}, 8103 {'n', "thread_num", 'd'}, 8104 {'N', "num_threads", 'd'}, 8105 {'a', "ancestor_tnum", 'd'}, 8106 {'H', "host", 's'}, 8107 {'P', "process_id", 'd'}, 8108 {'i', "native_thread_id", 'd'}}; 8109 8110 // Return the number of characters it takes to hold field 8111 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8112 const char **ptr, 8113 kmp_str_buf_t *field_buffer) { 8114 int rc, format_index, field_value; 8115 const char *width_left, *width_right; 8116 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8117 static const int FORMAT_SIZE = 20; 8118 char format[FORMAT_SIZE] = {0}; 8119 char absolute_short_name = 0; 8120 8121 KMP_DEBUG_ASSERT(gtid >= 0); 8122 KMP_DEBUG_ASSERT(th); 8123 KMP_DEBUG_ASSERT(**ptr == '%'); 8124 KMP_DEBUG_ASSERT(field_buffer); 8125 8126 __kmp_str_buf_clear(field_buffer); 8127 8128 // Skip the initial % 8129 (*ptr)++; 8130 8131 // Check for %% first 8132 if (**ptr == '%') { 8133 __kmp_str_buf_cat(field_buffer, "%", 1); 8134 (*ptr)++; // skip over the second % 8135 return 1; 8136 } 8137 8138 // Parse field modifiers if they are present 8139 pad_zeros = false; 8140 if (**ptr == '0') { 8141 pad_zeros = true; 8142 (*ptr)++; // skip over 0 8143 } 8144 right_justify = false; 8145 if (**ptr == '.') { 8146 right_justify = true; 8147 (*ptr)++; // skip over . 8148 } 8149 // Parse width of field: [width_left, width_right) 8150 width_left = width_right = NULL; 8151 if (**ptr >= '0' && **ptr <= '9') { 8152 width_left = *ptr; 8153 SKIP_DIGITS(*ptr); 8154 width_right = *ptr; 8155 } 8156 8157 // Create the format for KMP_SNPRINTF based on flags parsed above 8158 format_index = 0; 8159 format[format_index++] = '%'; 8160 if (!right_justify) 8161 format[format_index++] = '-'; 8162 if (pad_zeros) 8163 format[format_index++] = '0'; 8164 if (width_left && width_right) { 8165 int i = 0; 8166 // Only allow 8 digit number widths. 
    // This also prevents overflowing format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to the spec, if an implementation does not have info for the
    // field type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
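/*
 * Illustrative walk-through (not compiled; buffer contents are examples only):
 * for the field specification "%0.8L" the routine above consumes '%', sets
 * pad_zeros and right_justify, copies the width "8", matches 'L' in
 * __kmp_affinity_format_table (field_format 'd'), and hands KMP_SNPRINTF the
 * format string "%08d". The long-name form "%0.8{nesting_level}" canonicalizes
 * to the same result, while an unrecognized field such as "%q" takes the
 * default branch and prints "undefined". From user code this path is reached
 * through omp_capture_affinity(), e.g.:
 *
 *   char buf[64];
 *   size_t n = omp_capture_affinity(buf, sizeof(buf), "L=%0.8L N=%N");
 *   // buf might now hold "L=00000001 N=8"; n is the number of characters the
 *   // full string requires, not counting the terminating null byte.
 */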
/*
 * Return number of characters needed to hold the affinity string
 * (not including null byte character)
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or zero-length string, then we use
  // affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Set whether blocktime has been set to "TRUE" */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
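/*
 * Illustrative sketch (not compiled; the settings string is an example only):
 * the two entry points above back the kmp_set_blocktime() and
 * kmp_set_defaults() extensions exposed through omp.h, which a user program
 * can call along the lines of:
 *
 *   #include <omp.h>
 *   ...
 *   kmp_set_defaults("KMP_LIBRARY=throughput"); // parsed like env settings,
 *                                               // intended before the first
 *                                               // parallel region
 *   kmp_set_blocktime(0); // clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME];
 *                         // 0 makes the calling thread's workers sleep at
 *                         // barriers instead of spin-waiting
 *   #pragma omp parallel
 *   { ... }
 *
 * kmp_set_blocktime() mirrors the KMP_BLOCKTIME environment variable but only
 * updates the internal controls of the calling thread's teams, as the code
 * above shows.
 */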

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in current
  // PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
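/*
 * Illustrative sketch (not compiled; the directive below is an example only):
 * for a construct such as
 *
 *   #pragma omp parallel for reduction(+ : sum)
 *
 * the compiler emits a call into __kmpc_reduce()/__kmpc_reduce_nowait() and
 * passes reduce_data/reduce_func when it generated tree-reduction code and
 * sets KMP_IDENT_ATOMIC_REDUCE in loc->flags when it generated atomic
 * reduction code. __kmp_determine_reduction_method() then picks
 * empty_reduce_block for a team of one, the atomic or tree block according to
 * the per-architecture tuning above, or falls back to the critical-section
 * block; KMP_FORCE_REDUCTION can override the choice for teams larger than
 * one, as handled in the forced-method branch above.
 */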

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
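/*
 * Illustrative sketch (not compiled; do_serial_io() is a hypothetical user
 * routine): __kmp_pause_resource() backs the OpenMP 5.0
 * omp_pause_resource()/omp_pause_resource_all() entry points, which let a
 * program release runtime resources between phases, e.g.:
 *
 *   #include <omp.h>
 *   ...
 *   #pragma omp parallel
 *   { ... }                                  // first parallel phase
 *   omp_pause_resource_all(omp_pause_soft);  // workers go to sleep, state kept
 *   do_serial_io();                          // long serial phase
 *   #pragma omp parallel                     // soft-paused runtime resumes on
 *   { ... }                                  // the next OpenMP use
 *
 * A hard pause (omp_pause_hard) additionally shuts the runtime down via
 * __kmp_internal_end_thread(); it is re-initialized on the next OpenMP call.
 * As coded above, the return value is 0 on success and 1 when the request is
 * invalid for the current pause state.
 */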

// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads, in case
  // a regular thread pushes a hidden helper task to a hidden helper thread
  // that has not yet been awakened since the main thread released the helpers
  // after creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
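/*
 * Illustrative sketch (not compiled; assumes an offloading-capable compiler
 * and device): hidden helper threads exist to execute hidden helper tasks,
 * which this runtime uses for asynchronous offloading constructs such as
 *
 *   #pragma omp target nowait map(to : a[:n]) map(from : b[:n])
 *   { ... }               // deferred target task, run by a hidden helper thread
 *   // ... unrelated host work ...
 *   #pragma omp taskwait  // synchronizes with the deferred target task
 *
 * The initialization above registers a separate root, forks a team of
 * __kmp_hidden_helper_threads_num threads running
 * __kmp_hidden_helper_wrapper_fn(), and uses the
 * __kmp_hit_hidden_helper_threads_num counter so that every helper is known
 * to be running before the initial thread is released to push tasks at them.
 */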