/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}
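
#if 0
// Illustrative sketch (not compiled into the runtime): the internal algorithm
// above identifies the calling thread by checking whether the address of a
// local variable falls inside a registered thread's stack window
// [stackbase - stacksize, stackbase]. The helper name below is hypothetical
// and exists only to make that membership test explicit.
static bool __kmp_addr_in_stack_window(char *addr, char *stack_base,
                                       size_t stack_size) {
  // Stacks grow down, so a live automatic variable of the owning thread lies
  // at or below the recorded base and within stack_size bytes of it.
  return addr <= stack_base && (size_t)(stack_base - addr) <= stack_size;
}
#endif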

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}
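
#if 0
// Illustrative sketch (not compiled): __kmp_get_global_thread_id_reg() above
// follows a check / lock / re-check pattern so a thread with no gtid either
// triggers serial initialization or registers itself as a new root exactly
// once. The function name ending in _example is hypothetical.
static int __kmp_register_self_example() {
  int gtid = __kmp_get_global_thread_id();
  if (gtid == KMP_GTID_DNE) {
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize(); // registers this thread as the first root
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE); // new uber master sibling thread
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
  }
  return gtid;
}
#endif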
"initial" : "actual"); 288 } 289 } 290 291 /* No point in checking ubermaster threads since they use refinement and 292 * cannot overlap */ 293 gtid = __kmp_gtid_from_thread(th); 294 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 295 KA_TRACE(10, 296 ("__kmp_check_stack_overlap: performing extensive checking\n")); 297 if (stack_beg == NULL) { 298 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 299 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 300 } 301 302 for (f = 0; f < __kmp_threads_capacity; f++) { 303 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 304 305 if (f_th && f_th != th) { 306 char *other_stack_end = 307 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 308 char *other_stack_beg = 309 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 310 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 311 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 312 313 /* Print the other stack values before the abort */ 314 if (__kmp_storage_map) 315 __kmp_print_storage_map_gtid( 316 -1, other_stack_beg, other_stack_end, 317 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 318 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 319 320 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 321 __kmp_msg_null); 322 } 323 } 324 } 325 } 326 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 327 } 328 329 /* ------------------------------------------------------------------------ */ 330 331 void __kmp_infinite_loop(void) { 332 static int done = FALSE; 333 334 while (!done) { 335 KMP_YIELD(TRUE); 336 } 337 } 338 339 #define MAX_MESSAGE 512 340 341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 342 char const *format, ...) { 343 char buffer[MAX_MESSAGE]; 344 va_list ap; 345 346 va_start(ap, format); 347 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 348 p2, (unsigned long)size, format); 349 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 350 __kmp_vprintf(kmp_err, buffer, ap); 351 #if KMP_PRINT_DATA_PLACEMENT 352 int node; 353 if (gtid >= 0) { 354 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 355 if (__kmp_storage_map_verbose) { 356 node = __kmp_get_host_node(p1); 357 if (node < 0) /* doesn't work, so don't try this next time */ 358 __kmp_storage_map_verbose = FALSE; 359 else { 360 char *last; 361 int lastNode; 362 int localProc = __kmp_get_cpu_from_gtid(gtid); 363 364 const int page_size = KMP_GET_PAGE_SIZE(); 365 366 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 367 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 368 if (localProc >= 0) 369 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 370 localProc >> 1); 371 else 372 __kmp_printf_no_lock(" GTID %d\n", gtid); 373 #if KMP_USE_PRCTL 374 /* The more elaborate format is disabled for now because of the prctl 375 * hanging bug. */ 376 do { 377 last = p1; 378 lastNode = node; 379 /* This loop collates adjacent pages with the same host node. 

/* ------------------------------------------------------------------------ */

void __kmp_infinite_loop(void) {
  static int done = FALSE;

  while (!done) {
    KMP_YIELD(TRUE);
  }
}

#define MAX_MESSAGE 512

void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* doesn't work, so don't try this next time */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);

          const int page_size = KMP_GET_PAGE_SIZE();

          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock("  GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock("  GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("    %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock("    %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("    %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}
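
#if 0
// Illustrative sketch (not compiled): __kmp_warn() and
// __kmp_print_storage_map_gtid() above wrap the caller's format string in a
// runtime-specific prefix and then forward the untouched va_list to a
// vprintf-style sink under the stdio bootstrap lock. A minimal stdio-only
// version of the same pattern (warn_example is hypothetical):
#include <stdarg.h>
#include <stdio.h>
static void warn_example(const char *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;
  va_start(ap, format);
  // Embed the caller's format inside the prefixed message ...
  snprintf(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  // ... so the forwarded arguments still match the embedded specifiers.
  vfprintf(stderr, buffer, ap);
  va_end(ap);
}
#endif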

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS by default abort() causes pop-up error box, which stalls
       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for DLL, but it is a problem for
       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
       help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    __kmp_unregister_library();
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() { __kmp_init_memkind(); }
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
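
#if 0
// Illustrative sketch (not compiled): the deo/dxo pair above implements a
// ticket-style handoff for ordered sections. Each thread waits until the
// team's shared counter equals its own tid, runs its ordered chunk, then
// publishes (tid + 1) % nproc so the next thread may proceed. A simplified,
// hypothetical restatement of that protocol:
static void ordered_handoff_example(volatile kmp_int32 *turn, int tid,
                                    int nproc) {
  while (*turn != tid) { // wait until it is our turn
    KMP_YIELD(TRUE);
  }
  /* ... execute the ordered region ... */
  *turn = (tid + 1) % nproc; // signal the next thread in the team
}
#endif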

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}
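
#if 0
// Illustrative sketch (not compiled): in __kmp_enter_single() every thread
// bumps its private this_construct counter, and the first thread that moves
// the team's shared t_construct counter forward with a compare-and-swap wins
// the single region; the others see status == 0 and skip the block. The
// helper name is hypothetical.
static int enter_single_example(kmp_team_t *team, kmp_info_t *th) {
  kmp_int32 old_this = th->th.th_local.this_construct;
  ++th->th.th_local.this_construct;
  if (team->t.t_construct == old_this &&
      __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                     th->th.th_local.this_construct))
    return 1; // this thread executes the single block
  return 0; // some other thread won the race
}
#endif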

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nthreads is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}
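
#if 0
// Illustrative sketch (not compiled): each limit check in
// __kmp_reserve_threads() above uses the same arithmetic. The master (or the
// whole hot team, when the root is not active) already counts toward the
// global / contention-group totals, so the head-room for the new team is
//   limit - currently_counted + already_owned
// clamped to at least 1. Hypothetical helper with plain ints:
static int clamp_to_limit_example(int requested, int limit, int counted,
                                  int already_owned) {
  int headroom = limit - counted + already_owned;
  if (headroom <= 0)
    headroom = 1;
  return requested < headroom ? requested : headroom;
}
#endif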

/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked that
   earlier inside the forkjoin critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration
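
#if 0
// Illustrative sketch (not compiled): propagateFPControl() relies on the
// compare-before-store idiom behind KMP_CHECK_UPDATE, so that storing an
// unchanged value never flips the shared team cache line into a modified
// state that every worker would then have to re-read. The example macro is
// hypothetical and only restates that idiom:
#define CHECK_UPDATE_EXAMPLE(dst, src)                                         \
  do {                                                                         \
    if ((dst) != (src))                                                        \
      (dst) = (src); /* write only when the value actually changes */          \
  } while (0)
#endif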

/* Run a parallel region that has been serialized, so runs only in a team of the
   single master thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  ompt_data_t *implicit_task_data;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1174 global_tid, serial_team)); 1175 1176 /* TODO the above breaks the requirement that if we run out of resources, 1177 then we can still guarantee that serialized teams are ok, since we may 1178 need to allocate a new one */ 1179 } else { 1180 KF_TRACE( 1181 10, 1182 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1183 global_tid, serial_team)); 1184 } 1185 1186 /* we have to initialize this serial team */ 1187 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1188 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1189 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1190 serial_team->t.t_ident = loc; 1191 serial_team->t.t_serialized = 1; 1192 serial_team->t.t_nproc = 1; 1193 serial_team->t.t_parent = this_thr->th.th_team; 1194 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1195 this_thr->th.th_team = serial_team; 1196 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1197 1198 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1199 this_thr->th.th_current_task)); 1200 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1201 this_thr->th.th_current_task->td_flags.executing = 0; 1202 1203 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1204 1205 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1206 implicit task for each serialized task represented by 1207 team->t.t_serialized? */ 1208 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1209 &this_thr->th.th_current_task->td_parent->td_icvs); 1210 1211 // Thread value exists in the nested nthreads array for the next nested 1212 // level 1213 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1214 this_thr->th.th_current_task->td_icvs.nproc = 1215 __kmp_nested_nth.nth[level + 1]; 1216 } 1217 1218 if (__kmp_nested_proc_bind.used && 1219 (level + 1 < __kmp_nested_proc_bind.used)) { 1220 this_thr->th.th_current_task->td_icvs.proc_bind = 1221 __kmp_nested_proc_bind.bind_types[level + 1]; 1222 } 1223 1224 #if USE_DEBUGGER 1225 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped

    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}
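
#if 0
// Illustrative sketch (not compiled): each additional nesting level of a
// serialized parallel region in __kmp_serialized_parallel() pushes a fresh
// dispatch_private_info_t onto the serial team's singly linked th_disp_buffer
// stack (the matching join pops it), so loop-scheduling state of the
// enclosing level is preserved. Hypothetical helper showing just the push:
static void push_disp_buffer_example(kmp_team_t *serial_team) {
  dispatch_private_info_t *disp_buffer =
      (dispatch_private_info_t *)__kmp_allocate(
          sizeof(dispatch_private_info_t));
  disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
  serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; // push
}
#endif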

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        int flags = OMPT_INVOKER(call_context) |
                    ((microtask == (microtask_t)__kmp_teams_master)
                         ? ompt_parallel_league
                         : ompt_parallel_team);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
            return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is start of parallel that is nested inside teams construct.
      // The team is actual (hot), all workers are ready at the fork barrier.
      // No lock needed to initialize the team a bit, then free workers.
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but not increase the serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

        if (call_context == fork_context_gnu) {
          // AC: need to decrement t_serialized for enquiry functions to work
          // correctly, will restore at join time
          parent_team->t.t_serialized--;
          return TRUE;
        }

#if OMPT_SUPPORT
        void *dummy;
        void **exit_frame_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_frame_p = &dummy;
        }
#endif
        // AC: need to decrement t_serialized for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_frame_p
#endif
                                 );
        }

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          *exit_frame_p = NULL;
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }
          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
          __ompt_lw_taskteam_unlink(master_th);
          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context) | ompt_parallel_team,
                return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_lw_taskteam_t lw_taskteam;
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                &ompt_parallel_data, return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
      }
#endif

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel region has a num_threads clause
        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: only can reduce number of threads dynamically, can't increase
          kmp_info_t **other_threads = parent_team->t.t_threads;
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
            other_threads[i]->th.th_team_nproc = master_set_numthreads;
          }
          // Keep extra threads hot in the team for possible next parallels
        }
        master_th->th.th_set_nproc = 0;
      }

#if USE_DEBUGGER
      if (__kmp_debugging) { // Let debugger override number of threads.
        int nth = __kmp_omp_num_threads(loc);
        if (nth > 0) { // 0 means debugger doesn't want to change num threads
          master_set_numthreads = nth;
        }
      }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
           KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode == 3 &&
          parent_team->t.t_active_level == 1 // only report frames at level 1
          && master_th->th.th_teams_size.nteams == 1) {
        kmp_uint64 tmp_time = __itt_get_timestamp();
        master_th->th.th_frame_time = tmp_time;
        parent_team->t.t_region_time = tmp_time;
      }
      if (__itt_stack_caller_create_ptr) {
        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
        // create new stack stitching id before entering fork barrier
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));
      __kmp_internal_fork(loc, gtid, parent_team);
      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));

      if (call_context == fork_context_gnu)
        return TRUE;

      /* Invoke microtask for MASTER thread */
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));

      if (!parent_team->t.t_invoke(gtid)) {
        KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread");
      }
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));
      KMP_MB(); /* Flush all pending memory write invalidates. */

      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));

      return TRUE;
    } // Parallel closely nested in teams construct

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
    }
#endif
master_set_numthreads 1633 : get__nproc_2( 1634 parent_team, 1635 master_tid); // TODO: get nproc directly from current task 1636 1637 // Check if we need to take forkjoin lock? (no need for serialized 1638 // parallel out of teams construct). This code moved here from 1639 // __kmp_reserve_threads() to speedup nested serialized parallels. 1640 if (nthreads > 1) { 1641 if ((get__max_active_levels(master_th) == 1 && 1642 (root->r.r_in_parallel && !enter_teams)) || 1643 (__kmp_library == library_serial)) { 1644 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1645 " threads\n", 1646 gtid, nthreads)); 1647 nthreads = 1; 1648 } 1649 } 1650 if (nthreads > 1) { 1651 /* determine how many new threads we can use */ 1652 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1653 /* AC: If we execute teams from parallel region (on host), then teams 1654 should be created but each can only have 1 thread if nesting is 1655 disabled. If teams called from serial region, then teams and their 1656 threads should be created regardless of the nesting setting. */ 1657 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1658 nthreads, enter_teams); 1659 if (nthreads == 1) { 1660 // Free lock for single thread execution here; for multi-thread 1661 // execution it will be freed later after team of threads created 1662 // and initialized 1663 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1664 } 1665 } 1666 } 1667 KMP_DEBUG_ASSERT(nthreads > 0); 1668 1669 // If we temporarily changed the set number of threads then restore it now 1670 master_th->th.th_set_nproc = 0; 1671 1672 /* create a serialized parallel region? */ 1673 if (nthreads == 1) { 1674 /* josh todo: hypothetical question: what do we do for OS X*? */ 1675 #if KMP_OS_LINUX && \ 1676 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1677 void *args[argc]; 1678 #else 1679 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1680 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1681 KMP_ARCH_AARCH64) */ 1682 1683 KA_TRACE(20, 1684 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1685 1686 __kmpc_serialized_parallel(loc, gtid); 1687 1688 if (call_context == fork_context_intel) { 1689 /* TODO this sucks, use the compiler itself to pass args! :) */ 1690 master_th->th.th_serial_team->t.t_ident = loc; 1691 if (!ap) { 1692 // revert change made in __kmpc_serialized_parallel() 1693 master_th->th.th_serial_team->t.t_level--; 1694 // Get args from parent team for teams construct 1695 1696 #if OMPT_SUPPORT 1697 void *dummy; 1698 void **exit_frame_p; 1699 ompt_task_info_t *task_info; 1700 1701 ompt_lw_taskteam_t lw_taskteam; 1702 1703 if (ompt_enabled.enabled) { 1704 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1705 &ompt_parallel_data, return_address); 1706 1707 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1708 // don't use lw_taskteam after linking. 
content was swaped 1709 1710 task_info = OMPT_CUR_TASK_INFO(master_th); 1711 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1712 if (ompt_enabled.ompt_callback_implicit_task) { 1713 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1714 __kmp_tid_from_gtid(gtid); 1715 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1716 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1717 &(task_info->task_data), 1, 1718 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1719 ompt_task_implicit); 1720 } 1721 1722 /* OMPT state */ 1723 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1724 } else { 1725 exit_frame_p = &dummy; 1726 } 1727 #endif 1728 1729 { 1730 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1731 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1732 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1733 parent_team->t.t_argv 1734 #if OMPT_SUPPORT 1735 , 1736 exit_frame_p 1737 #endif 1738 ); 1739 } 1740 1741 #if OMPT_SUPPORT 1742 if (ompt_enabled.enabled) { 1743 *exit_frame_p = NULL; 1744 if (ompt_enabled.ompt_callback_implicit_task) { 1745 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1746 ompt_scope_end, NULL, &(task_info->task_data), 1, 1747 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1748 ompt_task_implicit); 1749 } 1750 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1751 __ompt_lw_taskteam_unlink(master_th); 1752 if (ompt_enabled.ompt_callback_parallel_end) { 1753 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1754 &ompt_parallel_data, parent_task_data, 1755 OMPT_INVOKER(call_context) | ompt_parallel_team, 1756 return_address); 1757 } 1758 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1759 } 1760 #endif 1761 } else if (microtask == (microtask_t)__kmp_teams_master) { 1762 KMP_DEBUG_ASSERT(master_th->th.th_team == 1763 master_th->th.th_serial_team); 1764 team = master_th->th.th_team; 1765 // team->t.t_pkfn = microtask; 1766 team->t.t_invoke = invoker; 1767 __kmp_alloc_argv_entries(argc, team, TRUE); 1768 team->t.t_argc = argc; 1769 argv = (void **)team->t.t_argv; 1770 if (ap) { 1771 for (i = argc - 1; i >= 0; --i) 1772 *argv++ = va_arg(kmp_va_deref(ap), void *); 1773 } else { 1774 for (i = 0; i < argc; ++i) 1775 // Get args from parent team for teams construct 1776 argv[i] = parent_team->t.t_argv[i]; 1777 } 1778 // AC: revert change made in __kmpc_serialized_parallel() 1779 // because initial code in teams should have level=0 1780 team->t.t_level--; 1781 // AC: call special invoker for outer "parallel" of teams construct 1782 invoker(gtid); 1783 #if OMPT_SUPPORT 1784 if (ompt_enabled.enabled) { 1785 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1786 if (ompt_enabled.ompt_callback_implicit_task) { 1787 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1788 ompt_scope_end, NULL, &(task_info->task_data), 0, 1789 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1790 } 1791 if (ompt_enabled.ompt_callback_parallel_end) { 1792 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1793 &ompt_parallel_data, parent_task_data, 1794 OMPT_INVOKER(call_context) | ompt_parallel_league, 1795 return_address); 1796 } 1797 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1798 } 1799 #endif 1800 } else { 1801 argv = args; 1802 for (i = argc - 1; i >= 0; --i) 1803 *argv++ = va_arg(kmp_va_deref(ap), void *); 1804 KMP_MB(); 1805 1806 #if OMPT_SUPPORT 1807 void *dummy; 1808 void **exit_frame_p; 1809 ompt_task_info_t *task_info; 1810 1811 ompt_lw_taskteam_t lw_taskteam; 1812 1813 if (ompt_enabled.enabled) { 1814 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1815 &ompt_parallel_data, return_address); 1816 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1817 // don't use lw_taskteam after linking. content was swaped 1818 task_info = OMPT_CUR_TASK_INFO(master_th); 1819 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1820 1821 /* OMPT implicit task begin */ 1822 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1823 if (ompt_enabled.ompt_callback_implicit_task) { 1824 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1825 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1826 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1827 ompt_task_implicit); 1828 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1829 __kmp_tid_from_gtid(gtid); 1830 } 1831 1832 /* OMPT state */ 1833 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1834 } else { 1835 exit_frame_p = &dummy; 1836 } 1837 #endif 1838 1839 { 1840 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1841 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1842 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1843 #if OMPT_SUPPORT 1844 , 1845 exit_frame_p 1846 #endif 1847 ); 1848 } 1849 1850 #if OMPT_SUPPORT 1851 if (ompt_enabled.enabled) { 1852 *exit_frame_p = NULL; 1853 if (ompt_enabled.ompt_callback_implicit_task) { 1854 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1855 ompt_scope_end, NULL, &(task_info->task_data), 1, 1856 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1857 ompt_task_implicit); 1858 } 1859 1860 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1861 __ompt_lw_taskteam_unlink(master_th); 1862 if (ompt_enabled.ompt_callback_parallel_end) { 1863 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1864 &ompt_parallel_data, parent_task_data, 1865 OMPT_INVOKER(call_context) | ompt_parallel_team, 1866 return_address); 1867 } 1868 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1869 } 1870 #endif 1871 } 1872 } else if (call_context == fork_context_gnu) { 1873 #if OMPT_SUPPORT 1874 ompt_lw_taskteam_t lwt; 1875 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1876 return_address); 1877 1878 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1879 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1880 // don't use lw_taskteam after linking. 
content was swapped 1881 #endif 1882 1883 // we were called from GNU native code 1884 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1885 return FALSE; 1886 } else { 1887 KMP_ASSERT2(call_context < fork_context_last, 1888 "__kmp_fork_call: unknown fork_context parameter"); 1889 } 1890 1891 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1892 KMP_MB(); 1893 return FALSE; 1894 } // if (nthreads == 1) 1895 1896 // GEH: only modify the executing flag in the case when not serialized 1897 // serialized case is handled in kmpc_serialized_parallel 1898 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1899 "curtask=%p, curtask_max_aclevel=%d\n", 1900 parent_team->t.t_active_level, master_th, 1901 master_th->th.th_current_task, 1902 master_th->th.th_current_task->td_icvs.max_active_levels)); 1903 // TODO: GEH - cannot do this assertion because root thread not set up as 1904 // executing 1905 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1906 master_th->th.th_current_task->td_flags.executing = 0; 1907 1908 if (!master_th->th.th_teams_microtask || level > teams_level) { 1909 /* Increment our nested depth level */ 1910 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1911 } 1912 1913 // See if we need to make a copy of the ICVs. 1914 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1915 if ((level + 1 < __kmp_nested_nth.used) && 1916 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1917 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1918 } else { 1919 nthreads_icv = 0; // don't update 1920 } 1921 1922 // Figure out the proc_bind_policy for the new team. 1923 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1924 kmp_proc_bind_t proc_bind_icv = 1925 proc_bind_default; // proc_bind_default means don't update 1926 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1927 proc_bind = proc_bind_false; 1928 } else { 1929 if (proc_bind == proc_bind_default) { 1930 // No proc_bind clause specified; use current proc-bind-var for this 1931 // parallel region 1932 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1933 } 1934 /* else: The proc_bind policy was specified explicitly on the parallel clause. 1935 This overrides proc-bind-var for this parallel region, but does not 1936 change proc-bind-var. */ 1937 // Figure the value of proc-bind-var for the child threads.
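// Background for the lookup below: __kmp_nested_proc_bind holds the parsed
// OMP_PROC_BIND list, one entry per nesting level. For example, a setting
// along the lines of
//   OMP_PROC_BIND=spread,close
// would typically give bind_types[0] == proc_bind_spread and
// bind_types[1] == proc_bind_close, so a region forked at this level seeds
// the child threads' proc-bind-var from entry [level + 1].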
1938 if ((level + 1 < __kmp_nested_proc_bind.used) && 1939 (__kmp_nested_proc_bind.bind_types[level + 1] != 1940 master_th->th.th_current_task->td_icvs.proc_bind)) { 1941 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1942 } 1943 } 1944 1945 // Reset for next parallel region 1946 master_th->th.th_set_proc_bind = proc_bind_default; 1947 1948 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1949 kmp_internal_control_t new_icvs; 1950 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1951 new_icvs.next = NULL; 1952 if (nthreads_icv > 0) { 1953 new_icvs.nproc = nthreads_icv; 1954 } 1955 if (proc_bind_icv != proc_bind_default) { 1956 new_icvs.proc_bind = proc_bind_icv; 1957 } 1958 1959 /* allocate a new parallel team */ 1960 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1961 team = __kmp_allocate_team(root, nthreads, nthreads, 1962 #if OMPT_SUPPORT 1963 ompt_parallel_data, 1964 #endif 1965 proc_bind, &new_icvs, 1966 argc USE_NESTED_HOT_ARG(master_th)); 1967 } else { 1968 /* allocate a new parallel team */ 1969 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1970 team = __kmp_allocate_team(root, nthreads, nthreads, 1971 #if OMPT_SUPPORT 1972 ompt_parallel_data, 1973 #endif 1974 proc_bind, 1975 &master_th->th.th_current_task->td_icvs, 1976 argc USE_NESTED_HOT_ARG(master_th)); 1977 } 1978 KF_TRACE( 1979 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 1980 1981 /* setup the new team */ 1982 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 1983 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 1984 KMP_CHECK_UPDATE(team->t.t_ident, loc); 1985 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 1986 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 1987 #if OMPT_SUPPORT 1988 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 1989 return_address); 1990 #endif 1991 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 1992 // TODO: parent_team->t.t_level == INT_MAX ??? 1993 if (!master_th->th.th_teams_microtask || level > teams_level) { 1994 int new_level = parent_team->t.t_level + 1; 1995 KMP_CHECK_UPDATE(team->t.t_level, new_level); 1996 new_level = parent_team->t.t_active_level + 1; 1997 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 1998 } else { 1999 // AC: Do not increase parallel level at start of the teams construct 2000 int new_level = parent_team->t.t_level; 2001 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2002 new_level = parent_team->t.t_active_level; 2003 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2004 } 2005 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2006 // set master's schedule as new run-time schedule 2007 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2008 2009 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2010 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2011 2012 // Update the floating point rounding in the team if required. 2013 propagateFPControl(team); 2014 2015 if (__kmp_tasking_mode != tskm_immediate_exec) { 2016 // Set master's task team to team's task team. Unless this is hot team, it 2017 // should be NULL. 
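// Note on the block below: the master keeps a small per-thread memo stack of
// th_task_state values so its task state can be restored at join time. The
// push path grows the stack by doubling when it is full (allocate 2x, copy
// the old entries, zero-fill the tail, then free the old buffer), i.e. a
// routine amortized-growth push rather than anything scheduling-specific.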
2018 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2019 parent_team->t.t_task_team[master_th->th.th_task_state]); 2020 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2021 "%p, new task_team %p / team %p\n", 2022 __kmp_gtid_from_thread(master_th), 2023 master_th->th.th_task_team, parent_team, 2024 team->t.t_task_team[master_th->th.th_task_state], team)); 2025 2026 if (active_level || master_th->th.th_task_team) { 2027 // Take a memo of master's task_state 2028 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2029 if (master_th->th.th_task_state_top >= 2030 master_th->th.th_task_state_stack_sz) { // increase size 2031 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2032 kmp_uint8 *old_stack, *new_stack; 2033 kmp_uint32 i; 2034 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2035 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2036 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2037 } 2038 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2039 ++i) { // zero-init rest of stack 2040 new_stack[i] = 0; 2041 } 2042 old_stack = master_th->th.th_task_state_memo_stack; 2043 master_th->th.th_task_state_memo_stack = new_stack; 2044 master_th->th.th_task_state_stack_sz = new_size; 2045 __kmp_free(old_stack); 2046 } 2047 // Store master's task_state on stack 2048 master_th->th 2049 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2050 master_th->th.th_task_state; 2051 master_th->th.th_task_state_top++; 2052 #if KMP_NESTED_HOT_TEAMS 2053 if (master_th->th.th_hot_teams && 2054 active_level < __kmp_hot_teams_max_level && 2055 team == master_th->th.th_hot_teams[active_level].hot_team) { 2056 // Restore master's nested state if nested hot team 2057 master_th->th.th_task_state = 2058 master_th->th 2059 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2060 } else { 2061 #endif 2062 master_th->th.th_task_state = 0; 2063 #if KMP_NESTED_HOT_TEAMS 2064 } 2065 #endif 2066 } 2067 #if !KMP_NESTED_HOT_TEAMS 2068 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2069 (team == root->r.r_hot_team)); 2070 #endif 2071 } 2072 2073 KA_TRACE( 2074 20, 2075 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2076 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2077 team->t.t_nproc)); 2078 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2079 (team->t.t_master_tid == 0 && 2080 (team->t.t_parent == root->r.r_root_team || 2081 team->t.t_parent->t.t_serialized))); 2082 KMP_MB(); 2083 2084 /* now, setup the arguments */ 2085 argv = (void **)team->t.t_argv; 2086 if (ap) { 2087 for (i = argc - 1; i >= 0; --i) { 2088 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2089 KMP_CHECK_UPDATE(*argv, new_argv); 2090 argv++; 2091 } 2092 } else { 2093 for (i = 0; i < argc; ++i) { 2094 // Get args from parent team for teams construct 2095 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2096 } 2097 } 2098 2099 /* now actually fork the threads */ 2100 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2101 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2102 root->r.r_active = TRUE; 2103 2104 __kmp_fork_team_threads(root, team, master_th, gtid); 2105 __kmp_setup_icv_copy(team, nthreads, 2106 &master_th->th.th_current_task->td_icvs, loc); 2107 2108 #if OMPT_SUPPORT 2109 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2110 #endif 2111 2112 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2113 2114 #if USE_ITT_BUILD 2115 if 
(team->t.t_active_level == 1 // only report frames at level 1 2116 && !master_th->th.th_teams_microtask) { // not in teams construct 2117 #if USE_ITT_NOTIFY 2118 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2119 (__kmp_forkjoin_frames_mode == 3 || 2120 __kmp_forkjoin_frames_mode == 1)) { 2121 kmp_uint64 tmp_time = 0; 2122 if (__itt_get_timestamp_ptr) 2123 tmp_time = __itt_get_timestamp(); 2124 // Internal fork - report frame begin 2125 master_th->th.th_frame_time = tmp_time; 2126 if (__kmp_forkjoin_frames_mode == 3) 2127 team->t.t_region_time = tmp_time; 2128 } else 2129 // only one notification scheme (either "submit" or "forking/joined", not both) 2130 #endif /* USE_ITT_NOTIFY */ 2131 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2132 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2133 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2134 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2135 } 2136 } 2137 #endif /* USE_ITT_BUILD */ 2138 2139 /* now go on and do the work */ 2140 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2141 KMP_MB(); 2142 KF_TRACE(10, 2143 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2144 root, team, master_th, gtid)); 2145 2146 #if USE_ITT_BUILD 2147 if (__itt_stack_caller_create_ptr) { 2148 // create new stack stitching id before entering fork barrier 2149 if (!enter_teams) { 2150 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2151 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2152 } else if (parent_team->t.t_serialized) { 2153 // keep stack stitching id in the serialized parent_team; 2154 // current team will be used for parallel inside the teams; 2155 // if parent_team is active, then it already keeps stack stitching id 2156 // for the league of teams 2157 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2158 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2159 } 2160 } 2161 #endif /* USE_ITT_BUILD */ 2162 2163 // AC: skip __kmp_internal_fork at teams construct, let only master 2164 // threads execute 2165 if (ap) { 2166 __kmp_internal_fork(loc, gtid, team); 2167 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2168 "master_th=%p, gtid=%d\n", 2169 root, team, master_th, gtid)); 2170 } 2171 2172 if (call_context == fork_context_gnu) { 2173 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2174 return TRUE; 2175 } 2176 2177 /* Invoke microtask for MASTER thread */ 2178 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2179 team->t.t_id, team->t.t_pkfn)); 2180 } // END of timer KMP_fork_call block 2181 2182 #if KMP_STATS_ENABLED 2183 // If beginning a teams construct, then change thread state 2184 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2185 if (!ap) { 2186 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2187 } 2188 #endif 2189 2190 if (!team->t.t_invoke(gtid)) { 2191 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2192 } 2193 2194 #if KMP_STATS_ENABLED 2195 // If was beginning of a teams construct, then reset thread state 2196 if (!ap) { 2197 KMP_SET_THREAD_STATE(previous_state); 2198 } 2199 #endif 2200 2201 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2202 team->t.t_id, team->t.t_pkfn)); 2203 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2204 2205 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2206 2207 #if OMPT_SUPPORT 2208 if (ompt_enabled.enabled) { 2209 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2210 } 2211 #endif 2212 2213 return TRUE; 2214 } 2215 2216 #if OMPT_SUPPORT 2217 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2218 kmp_team_t *team) { 2219 // restore state outside the region 2220 thread->th.ompt_thread_info.state = 2221 ((team->t.t_serialized) ? ompt_state_work_serial 2222 : ompt_state_work_parallel); 2223 } 2224 2225 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2226 kmp_team_t *team, ompt_data_t *parallel_data, 2227 int flags, void *codeptr) { 2228 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2229 if (ompt_enabled.ompt_callback_parallel_end) { 2230 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2231 parallel_data, &(task_info->task_data), flags, codeptr); 2232 } 2233 2234 task_info->frame.enter_frame = ompt_data_none; 2235 __kmp_join_restore_state(thread, team); 2236 } 2237 #endif 2238 2239 void __kmp_join_call(ident_t *loc, int gtid 2240 #if OMPT_SUPPORT 2241 , 2242 enum fork_context_e fork_context 2243 #endif 2244 , 2245 int exit_teams) { 2246 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2247 kmp_team_t *team; 2248 kmp_team_t *parent_team; 2249 kmp_info_t *master_th; 2250 kmp_root_t *root; 2251 int master_active; 2252 2253 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2254 2255 /* setup current data */ 2256 master_th = __kmp_threads[gtid]; 2257 root = master_th->th.th_root; 2258 team = master_th->th.th_team; 2259 parent_team = team->t.t_parent; 2260 2261 master_th->th.th_ident = loc; 2262 2263 #if OMPT_SUPPORT 2264 void *team_microtask = (void *)team->t.t_pkfn; 2265 // For GOMP interface with serialized parallel, need the 2266 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2267 // and end-parallel events. 
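// In the serialized GNU (GOMP) case the guard below intentionally skips the
// switch to ompt_state_overhead: __kmpc_end_serialized_parallel, reached via
// the t_serialized branch further down, is the point that raises the OMPT
// end-implicit-task / end-parallel events for that region.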
2268 if (ompt_enabled.enabled && 2269 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2270 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2271 } 2272 #endif 2273 2274 #if KMP_DEBUG 2275 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2276 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2277 "th_task_team = %p\n", 2278 __kmp_gtid_from_thread(master_th), team, 2279 team->t.t_task_team[master_th->th.th_task_state], 2280 master_th->th.th_task_team)); 2281 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2282 team->t.t_task_team[master_th->th.th_task_state]); 2283 } 2284 #endif 2285 2286 if (team->t.t_serialized) { 2287 if (master_th->th.th_teams_microtask) { 2288 // We are in teams construct 2289 int level = team->t.t_level; 2290 int tlevel = master_th->th.th_teams_level; 2291 if (level == tlevel) { 2292 // AC: we haven't incremented it earlier at start of teams construct, 2293 // so do it here - at the end of teams construct 2294 team->t.t_level++; 2295 } else if (level == tlevel + 1) { 2296 // AC: we are exiting parallel inside teams, need to increment 2297 // serialization in order to restore it in the next call to 2298 // __kmpc_end_serialized_parallel 2299 team->t.t_serialized++; 2300 } 2301 } 2302 __kmpc_end_serialized_parallel(loc, gtid); 2303 2304 #if OMPT_SUPPORT 2305 if (ompt_enabled.enabled) { 2306 __kmp_join_restore_state(master_th, parent_team); 2307 } 2308 #endif 2309 2310 return; 2311 } 2312 2313 master_active = team->t.t_master_active; 2314 2315 if (!exit_teams) { 2316 // AC: No barrier for internal teams at exit from teams construct. 2317 // But there is barrier for external team (league). 2318 __kmp_internal_join(loc, gtid, team); 2319 #if USE_ITT_BUILD 2320 if (__itt_stack_caller_create_ptr) { 2321 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2322 // destroy the stack stitching id after join barrier 2323 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2324 team->t.t_stack_id = NULL; 2325 } 2326 #endif 2327 } else { 2328 master_th->th.th_task_state = 2329 0; // AC: no tasking in teams (out of any parallel) 2330 #if USE_ITT_BUILD 2331 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2332 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2333 // destroy the stack stitching id on exit from the teams construct 2334 // if parent_team is active, then the id will be destroyed later on 2335 // by master of the league of teams 2336 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2337 parent_team->t.t_stack_id = NULL; 2338 } 2339 #endif 2340 } 2341 2342 KMP_MB(); 2343 2344 #if OMPT_SUPPORT 2345 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2346 void *codeptr = team->t.ompt_team_info.master_return_address; 2347 #endif 2348 2349 #if USE_ITT_BUILD 2350 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
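// Reporting sketch (assuming the ITT notify pointers are resolved):
// __kmp_forkjoin_frames_mode == 3 submits an explicit frame spanning
// [t_region_time, th_frame_time] via __kmp_itt_frame_submit, whereas mode 0
// with __kmp_forkjoin_frames set falls back to the simpler
// __kmp_itt_region_joined notification; only one scheme fires per join.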
2351 if (team->t.t_active_level == 1 && 2352 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2353 master_th->th.th_teams_size.nteams == 1)) { 2354 master_th->th.th_ident = loc; 2355 // only one notification scheme (either "submit" or "forking/joined", not 2356 // both) 2357 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2358 __kmp_forkjoin_frames_mode == 3) 2359 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2360 master_th->th.th_frame_time, 0, loc, 2361 master_th->th.th_team_nproc, 1); 2362 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2363 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2364 __kmp_itt_region_joined(gtid); 2365 } // active_level == 1 2366 #endif /* USE_ITT_BUILD */ 2367 2368 if (master_th->th.th_teams_microtask && !exit_teams && 2369 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2370 team->t.t_level == master_th->th.th_teams_level + 1) { 2371 // AC: We need to leave the team structure intact at the end of parallel 2372 // inside the teams construct, so that at the next parallel same (hot) team 2373 // works, only adjust nesting levels 2374 #if OMPT_SUPPORT 2375 ompt_data_t ompt_parallel_data = ompt_data_none; 2376 if (ompt_enabled.enabled) { 2377 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2378 if (ompt_enabled.ompt_callback_implicit_task) { 2379 int ompt_team_size = team->t.t_nproc; 2380 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2381 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2382 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2383 } 2384 task_info->frame.exit_frame = ompt_data_none; 2385 task_info->task_data = ompt_data_none; 2386 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2387 __ompt_lw_taskteam_unlink(master_th); 2388 } 2389 #endif 2390 /* Decrement our nested depth level */ 2391 team->t.t_level--; 2392 team->t.t_active_level--; 2393 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2394 2395 // Restore number of threads in the team if needed. This code relies on 2396 // the proper adjustment of th_teams_size.nth after the fork in 2397 // __kmp_teams_master on each teams master in the case that 2398 // __kmp_reserve_threads reduced it. 2399 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2400 int old_num = master_th->th.th_team_nproc; 2401 int new_num = master_th->th.th_teams_size.nth; 2402 kmp_info_t **other_threads = team->t.t_threads; 2403 team->t.t_nproc = new_num; 2404 for (int i = 0; i < old_num; ++i) { 2405 other_threads[i]->th.th_team_nproc = new_num; 2406 } 2407 // Adjust states of non-used threads of the team 2408 for (int i = old_num; i < new_num; ++i) { 2409 // Re-initialize thread's barrier data. 
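// The resync below keeps threads that sat out the narrower inner parallel on
// the current barrier epoch: their per-barrier b_arrived counters are set to
// the team's counters before those threads are used again, and (with tasking
// enabled) their task state is matched to the master's.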
2410 KMP_DEBUG_ASSERT(other_threads[i]); 2411 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2412 for (int b = 0; b < bs_last_barrier; ++b) { 2413 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2414 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2415 #if USE_DEBUGGER 2416 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2417 #endif 2418 } 2419 if (__kmp_tasking_mode != tskm_immediate_exec) { 2420 // Synchronize thread's task state 2421 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2422 } 2423 } 2424 } 2425 2426 #if OMPT_SUPPORT 2427 if (ompt_enabled.enabled) { 2428 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2429 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2430 } 2431 #endif 2432 2433 return; 2434 } 2435 2436 /* do cleanup and restore the parent team */ 2437 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2438 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2439 2440 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2441 2442 /* jc: The following lock has instructions with REL and ACQ semantics, 2443 separating the parallel user code called in this parallel region 2444 from the serial user code called after this function returns. */ 2445 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2446 2447 if (!master_th->th.th_teams_microtask || 2448 team->t.t_level > master_th->th.th_teams_level) { 2449 /* Decrement our nested depth level */ 2450 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2451 } 2452 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2453 2454 #if OMPT_SUPPORT 2455 if (ompt_enabled.enabled) { 2456 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2457 if (ompt_enabled.ompt_callback_implicit_task) { 2458 int flags = (team_microtask == (void *)__kmp_teams_master) 2459 ? ompt_task_initial 2460 : ompt_task_implicit; 2461 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2462 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2463 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2464 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2465 } 2466 task_info->frame.exit_frame = ompt_data_none; 2467 task_info->task_data = ompt_data_none; 2468 } 2469 #endif 2470 2471 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2472 master_th, team)); 2473 __kmp_pop_current_task_from_thread(master_th); 2474 2475 #if KMP_AFFINITY_SUPPORTED 2476 // Restore master thread's partition. 2477 master_th->th.th_first_place = team->t.t_first_place; 2478 master_th->th.th_last_place = team->t.t_last_place; 2479 #endif // KMP_AFFINITY_SUPPORTED 2480 master_th->th.th_def_allocator = team->t.t_def_allocator; 2481 2482 updateHWFPControl(team); 2483 2484 if (root->r.r_active != master_active) 2485 root->r.r_active = master_active; 2486 2487 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2488 master_th)); // this will free worker threads 2489 2490 /* this race was fun to find. make sure the following is in the critical 2491 region otherwise assertions may fail occasionally since the old team may be 2492 reallocated and the hierarchy appears inconsistent. it is actually safe to 2493 run and won't cause any bugs, but will cause those assertion failures. 
it's 2494 only one deref&assign so might as well put this in the critical region */ 2495 master_th->th.th_team = parent_team; 2496 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2497 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2498 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2499 2500 /* restore serialized team, if need be */ 2501 if (parent_team->t.t_serialized && 2502 parent_team != master_th->th.th_serial_team && 2503 parent_team != root->r.r_root_team) { 2504 __kmp_free_team(root, 2505 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2506 master_th->th.th_serial_team = parent_team; 2507 } 2508 2509 if (__kmp_tasking_mode != tskm_immediate_exec) { 2510 if (master_th->th.th_task_state_top > 2511 0) { // Restore task state from memo stack 2512 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2513 // Remember master's state if we re-use this nested hot team 2514 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2515 master_th->th.th_task_state; 2516 --master_th->th.th_task_state_top; // pop 2517 // Now restore state at this level 2518 master_th->th.th_task_state = 2519 master_th->th 2520 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2521 } 2522 // Copy the task team from the parent team to the master thread 2523 master_th->th.th_task_team = 2524 parent_team->t.t_task_team[master_th->th.th_task_state]; 2525 KA_TRACE(20, 2526 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2527 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2528 parent_team)); 2529 } 2530 2531 // TODO: GEH - cannot do this assertion because root thread not set up as 2532 // executing 2533 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2534 master_th->th.th_current_task->td_flags.executing = 1; 2535 2536 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2537 2538 #if OMPT_SUPPORT 2539 int flags = 2540 OMPT_INVOKER(fork_context) | 2541 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2542 : ompt_parallel_team); 2543 if (ompt_enabled.enabled) { 2544 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2545 codeptr); 2546 } 2547 #endif 2548 2549 KMP_MB(); 2550 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2551 } 2552 2553 /* Check whether we should push an internal control record onto the 2554 serial team stack. If so, do it. 
*/ 2555 void __kmp_save_internal_controls(kmp_info_t *thread) { 2556 2557 if (thread->th.th_team != thread->th.th_serial_team) { 2558 return; 2559 } 2560 if (thread->th.th_team->t.t_serialized > 1) { 2561 int push = 0; 2562 2563 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2564 push = 1; 2565 } else { 2566 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2567 thread->th.th_team->t.t_serialized) { 2568 push = 1; 2569 } 2570 } 2571 if (push) { /* push a record on the serial team's stack */ 2572 kmp_internal_control_t *control = 2573 (kmp_internal_control_t *)__kmp_allocate( 2574 sizeof(kmp_internal_control_t)); 2575 2576 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2577 2578 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2579 2580 control->next = thread->th.th_team->t.t_control_stack_top; 2581 thread->th.th_team->t.t_control_stack_top = control; 2582 } 2583 } 2584 } 2585 2586 /* Changes set_nproc */ 2587 void __kmp_set_num_threads(int new_nth, int gtid) { 2588 kmp_info_t *thread; 2589 kmp_root_t *root; 2590 2591 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2592 KMP_DEBUG_ASSERT(__kmp_init_serial); 2593 2594 if (new_nth < 1) 2595 new_nth = 1; 2596 else if (new_nth > __kmp_max_nth) 2597 new_nth = __kmp_max_nth; 2598 2599 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2600 thread = __kmp_threads[gtid]; 2601 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2602 return; // nothing to do 2603 2604 __kmp_save_internal_controls(thread); 2605 2606 set__nproc(thread, new_nth); 2607 2608 // If this omp_set_num_threads() call will cause the hot team size to be 2609 // reduced (in the absence of a num_threads clause), then reduce it now, 2610 // rather than waiting for the next parallel region. 2611 root = thread->th.th_root; 2612 if (__kmp_init_parallel && (!root->r.r_active) && 2613 (root->r.r_hot_team->t.t_nproc > new_nth) 2614 #if KMP_NESTED_HOT_TEAMS 2615 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2616 #endif 2617 ) { 2618 kmp_team_t *hot_team = root->r.r_hot_team; 2619 int f; 2620 2621 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2622 2623 // Release the extra threads we don't need any more. 2624 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2625 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2626 if (__kmp_tasking_mode != tskm_immediate_exec) { 2627 // When decreasing team size, threads no longer in the team should unref 2628 // task team. 2629 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2630 } 2631 __kmp_free_thread(hot_team->t.t_threads[f]); 2632 hot_team->t.t_threads[f] = NULL; 2633 } 2634 hot_team->t.t_nproc = new_nth; 2635 #if KMP_NESTED_HOT_TEAMS 2636 if (thread->th.th_hot_teams) { 2637 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2638 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2639 } 2640 #endif 2641 2642 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2643 2644 // Update the t_nproc field in the threads that are still active. 
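// Illustrative user-level view of this shrink path (sketch, not a test):
//   omp_set_num_threads(8);
//   #pragma omp parallel
//   { /* ... */ }              // hot team holds 8 workers afterwards
//   omp_set_num_threads(2);    // workers 2..7 are released right here,
//                              // instead of at the next parallel region
// The eager trim only happens when the root is not inside an active parallel
// region and the hot team is currently larger than the new value.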
2645 for (f = 0; f < new_nth; f++) { 2646 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2647 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2648 } 2649 // Special flag in case omp_set_num_threads() call 2650 hot_team->t.t_size_changed = -1; 2651 } 2652 } 2653 2654 /* Changes max_active_levels */ 2655 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2656 kmp_info_t *thread; 2657 2658 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2659 "%d = (%d)\n", 2660 gtid, max_active_levels)); 2661 KMP_DEBUG_ASSERT(__kmp_init_serial); 2662 2663 // validate max_active_levels 2664 if (max_active_levels < 0) { 2665 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2666 // We ignore this call if the user has specified a negative value. 2667 // The current setting won't be changed. The last valid setting will be 2668 // used. A warning will be issued (if warnings are allowed as controlled by 2669 // the KMP_WARNINGS env var). 2670 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2671 "max_active_levels for thread %d = (%d)\n", 2672 gtid, max_active_levels)); 2673 return; 2674 } 2675 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2676 // it's OK, the max_active_levels is within the valid range: [ 0; 2677 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2678 // We allow a zero value. (implementation defined behavior) 2679 } else { 2680 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2681 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2682 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2683 // Current upper limit is MAX_INT. (implementation defined behavior) 2684 // If the input exceeds the upper limit, we correct the input to be the 2685 // upper limit. (implementation defined behavior) 2686 // Actually, the flow should never get here until we use MAX_INT limit. 
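// Net effect (example): a negative request such as max_active_levels == -3
// is rejected with a warning and the previous max-active-levels-var is kept,
// while a request above KMP_MAX_ACTIVE_LEVELS_LIMIT is clamped to that limit
// before being stored by set__max_active_levels() below.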
2687 } 2688 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2689 "max_active_levels for thread %d = (%d)\n", 2690 gtid, max_active_levels)); 2691 2692 thread = __kmp_threads[gtid]; 2693 2694 __kmp_save_internal_controls(thread); 2695 2696 set__max_active_levels(thread, max_active_levels); 2697 } 2698 2699 /* Gets max_active_levels */ 2700 int __kmp_get_max_active_levels(int gtid) { 2701 kmp_info_t *thread; 2702 2703 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2704 KMP_DEBUG_ASSERT(__kmp_init_serial); 2705 2706 thread = __kmp_threads[gtid]; 2707 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2708 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2709 "curtask_maxaclevel=%d\n", 2710 gtid, thread->th.th_current_task, 2711 thread->th.th_current_task->td_icvs.max_active_levels)); 2712 return thread->th.th_current_task->td_icvs.max_active_levels; 2713 } 2714 2715 // nteams-var per-device ICV 2716 void __kmp_set_num_teams(int num_teams) { 2717 if (num_teams > 0) 2718 __kmp_nteams = num_teams; 2719 } 2720 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2721 // teams-thread-limit-var per-device ICV 2722 void __kmp_set_teams_thread_limit(int limit) { 2723 if (limit > 0) 2724 __kmp_teams_thread_limit = limit; 2725 } 2726 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2727 2728 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2729 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2730 2731 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2732 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2733 kmp_info_t *thread; 2734 kmp_sched_t orig_kind; 2735 // kmp_team_t *team; 2736 2737 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2738 gtid, (int)kind, chunk)); 2739 KMP_DEBUG_ASSERT(__kmp_init_serial); 2740 2741 // Check if the kind parameter is valid, correct if needed. 2742 // Valid parameters should fit in one of two intervals - standard or extended: 2743 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2744 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2745 orig_kind = kind; 2746 kind = __kmp_sched_without_mods(kind); 2747 2748 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2749 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2750 // TODO: Hint needs attention in case we change the default schedule. 2751 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2752 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2753 __kmp_msg_null); 2754 kind = kmp_sched_default; 2755 chunk = 0; // ignore chunk value in case of bad kind 2756 } 2757 2758 thread = __kmp_threads[gtid]; 2759 2760 __kmp_save_internal_controls(thread); 2761 2762 if (kind < kmp_sched_upper_std) { 2763 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2764 // differ static chunked vs. 
unchunked: chunk should be invalid to 2765 // indicate unchunked schedule (which is the default) 2766 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2767 } else { 2768 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2769 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2770 } 2771 } else { 2772 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2773 // kmp_sched_lower - 2 ]; 2774 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2775 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2776 kmp_sched_lower - 2]; 2777 } 2778 __kmp_sched_apply_mods_intkind( 2779 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2780 if (kind == kmp_sched_auto || chunk < 1) { 2781 // ignore parameter chunk for schedule auto 2782 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2783 } else { 2784 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2785 } 2786 } 2787 2788 /* Gets def_sched_var ICV values */ 2789 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2790 kmp_info_t *thread; 2791 enum sched_type th_type; 2792 2793 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2794 KMP_DEBUG_ASSERT(__kmp_init_serial); 2795 2796 thread = __kmp_threads[gtid]; 2797 2798 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2799 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2800 case kmp_sch_static: 2801 case kmp_sch_static_greedy: 2802 case kmp_sch_static_balanced: 2803 *kind = kmp_sched_static; 2804 __kmp_sched_apply_mods_stdkind(kind, th_type); 2805 *chunk = 0; // chunk was not set, try to show this fact via zero value 2806 return; 2807 case kmp_sch_static_chunked: 2808 *kind = kmp_sched_static; 2809 break; 2810 case kmp_sch_dynamic_chunked: 2811 *kind = kmp_sched_dynamic; 2812 break; 2813 case kmp_sch_guided_chunked: 2814 case kmp_sch_guided_iterative_chunked: 2815 case kmp_sch_guided_analytical_chunked: 2816 *kind = kmp_sched_guided; 2817 break; 2818 case kmp_sch_auto: 2819 *kind = kmp_sched_auto; 2820 break; 2821 case kmp_sch_trapezoidal: 2822 *kind = kmp_sched_trapezoidal; 2823 break; 2824 #if KMP_STATIC_STEAL_ENABLED 2825 case kmp_sch_static_steal: 2826 *kind = kmp_sched_static_steal; 2827 break; 2828 #endif 2829 default: 2830 KMP_FATAL(UnknownSchedulingType, th_type); 2831 } 2832 2833 __kmp_sched_apply_mods_stdkind(kind, th_type); 2834 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2835 } 2836 2837 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2838 2839 int ii, dd; 2840 kmp_team_t *team; 2841 kmp_info_t *thr; 2842 2843 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2844 KMP_DEBUG_ASSERT(__kmp_init_serial); 2845 2846 // validate level 2847 if (level == 0) 2848 return 0; 2849 if (level < 0) 2850 return -1; 2851 thr = __kmp_threads[gtid]; 2852 team = thr->th.th_team; 2853 ii = team->t.t_level; 2854 if (level > ii) 2855 return -1; 2856 2857 if (thr->th.th_teams_microtask) { 2858 // AC: we are in teams region where multiple nested teams have same level 2859 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2860 if (level <= 2861 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2862 KMP_DEBUG_ASSERT(ii >= tlevel); 2863 // AC: As we need to pass by the teams league, we need to artificially 2864 // increase ii 2865 if (ii == tlevel) { 2866 ii += 2; // three teams have same level 2867 } else { 2868 ii++; // two teams have same level 2869 } 2870 } 2871 } 2872 2873 if (ii == 
level) 2874 return __kmp_tid_from_gtid(gtid); 2875 2876 dd = team->t.t_serialized; 2877 level++; 2878 while (ii > level) { 2879 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2880 } 2881 if ((team->t.t_serialized) && (!dd)) { 2882 team = team->t.t_parent; 2883 continue; 2884 } 2885 if (ii > level) { 2886 team = team->t.t_parent; 2887 dd = team->t.t_serialized; 2888 ii--; 2889 } 2890 } 2891 2892 return (dd > 1) ? (0) : (team->t.t_master_tid); 2893 } 2894 2895 int __kmp_get_team_size(int gtid, int level) { 2896 2897 int ii, dd; 2898 kmp_team_t *team; 2899 kmp_info_t *thr; 2900 2901 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2902 KMP_DEBUG_ASSERT(__kmp_init_serial); 2903 2904 // validate level 2905 if (level == 0) 2906 return 1; 2907 if (level < 0) 2908 return -1; 2909 thr = __kmp_threads[gtid]; 2910 team = thr->th.th_team; 2911 ii = team->t.t_level; 2912 if (level > ii) 2913 return -1; 2914 2915 if (thr->th.th_teams_microtask) { 2916 // AC: we are in teams region where multiple nested teams have same level 2917 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2918 if (level <= 2919 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2920 KMP_DEBUG_ASSERT(ii >= tlevel); 2921 // AC: As we need to pass by the teams league, we need to artificially 2922 // increase ii 2923 if (ii == tlevel) { 2924 ii += 2; // three teams have same level 2925 } else { 2926 ii++; // two teams have same level 2927 } 2928 } 2929 } 2930 2931 while (ii > level) { 2932 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2933 } 2934 if (team->t.t_serialized && (!dd)) { 2935 team = team->t.t_parent; 2936 continue; 2937 } 2938 if (ii > level) { 2939 team = team->t.t_parent; 2940 ii--; 2941 } 2942 } 2943 2944 return team->t.t_nproc; 2945 } 2946 2947 kmp_r_sched_t __kmp_get_schedule_global() { 2948 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2949 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2950 // independently. So one can get the updated schedule here. 2951 2952 kmp_r_sched_t r_sched; 2953 2954 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2955 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2956 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2957 // different roots (even in OMP 2.5) 2958 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2959 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2960 if (s == kmp_sch_static) { 2961 // replace STATIC with more detailed schedule (balanced or greedy) 2962 r_sched.r_sched_type = __kmp_static; 2963 } else if (s == kmp_sch_guided_chunked) { 2964 // replace GUIDED with more detailed schedule (iterative or analytical) 2965 r_sched.r_sched_type = __kmp_guided; 2966 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2967 r_sched.r_sched_type = __kmp_sched; 2968 } 2969 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2970 2971 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2972 // __kmp_chunk may be wrong here (if it was not ever set) 2973 r_sched.chunk = KMP_DEFAULT_CHUNK; 2974 } else { 2975 r_sched.chunk = __kmp_chunk; 2976 } 2977 2978 return r_sched; 2979 } 2980 2981 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2982 at least argc number of *t_argv entries for the requested team. 
*/ 2983 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2984 2985 KMP_DEBUG_ASSERT(team); 2986 if (!realloc || argc > team->t.t_max_argc) { 2987 2988 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 2989 "current entries=%d\n", 2990 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 2991 /* if previously allocated heap space for args, free them */ 2992 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 2993 __kmp_free((void *)team->t.t_argv); 2994 2995 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 2996 /* use unused space in the cache line for arguments */ 2997 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 2998 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 2999 "argv entries\n", 3000 team->t.t_id, team->t.t_max_argc)); 3001 team->t.t_argv = &team->t.t_inline_argv[0]; 3002 if (__kmp_storage_map) { 3003 __kmp_print_storage_map_gtid( 3004 -1, &team->t.t_inline_argv[0], 3005 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3006 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3007 team->t.t_id); 3008 } 3009 } else { 3010 /* allocate space for arguments in the heap */ 3011 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3012 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3013 : 2 * argc; 3014 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3015 "argv entries\n", 3016 team->t.t_id, team->t.t_max_argc)); 3017 team->t.t_argv = 3018 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3019 if (__kmp_storage_map) { 3020 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3021 &team->t.t_argv[team->t.t_max_argc], 3022 sizeof(void *) * team->t.t_max_argc, 3023 "team_%d.t_argv", team->t.t_id); 3024 } 3025 } 3026 } 3027 } 3028 3029 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3030 int i; 3031 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3032 team->t.t_threads = 3033 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3034 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3035 sizeof(dispatch_shared_info_t) * num_disp_buff); 3036 team->t.t_dispatch = 3037 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3038 team->t.t_implicit_task_taskdata = 3039 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3040 team->t.t_max_nproc = max_nth; 3041 3042 /* setup dispatch buffers */ 3043 for (i = 0; i < num_disp_buff; ++i) { 3044 team->t.t_disp_buffer[i].buffer_index = i; 3045 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3046 } 3047 } 3048 3049 static void __kmp_free_team_arrays(kmp_team_t *team) { 3050 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3051 int i; 3052 for (i = 0; i < team->t.t_max_nproc; ++i) { 3053 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3054 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3055 team->t.t_dispatch[i].th_disp_buffer = NULL; 3056 } 3057 } 3058 #if KMP_USE_HIER_SCHED 3059 __kmp_dispatch_free_hierarchies(team); 3060 #endif 3061 __kmp_free(team->t.t_threads); 3062 __kmp_free(team->t.t_disp_buffer); 3063 __kmp_free(team->t.t_dispatch); 3064 __kmp_free(team->t.t_implicit_task_taskdata); 3065 team->t.t_threads = NULL; 3066 team->t.t_disp_buffer = NULL; 3067 team->t.t_dispatch = NULL; 3068 team->t.t_implicit_task_taskdata = 0; 3069 } 3070 3071 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3072 kmp_info_t **oldThreads = team->t.t_threads; 3073 3074 __kmp_free(team->t.t_disp_buffer); 3075 __kmp_free(team->t.t_dispatch); 3076 __kmp_free(team->t.t_implicit_task_taskdata); 3077 __kmp_allocate_team_arrays(team, max_nth); 3078 3079 KMP_MEMCPY(team->t.t_threads, oldThreads, 3080 team->t.t_nproc * sizeof(kmp_info_t *)); 3081 3082 __kmp_free(oldThreads); 3083 } 3084 3085 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3086 3087 kmp_r_sched_t r_sched = 3088 __kmp_get_schedule_global(); // get current state of scheduling globals 3089 3090 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3091 3092 kmp_internal_control_t g_icvs = { 3093 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3094 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3095 // adjustment of threads (per thread) 3096 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3097 // whether blocktime is explicitly set 3098 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3099 #if KMP_USE_MONITOR 3100 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3101 // intervals 3102 #endif 3103 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3104 // next parallel region (per thread) 3105 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3106 __kmp_cg_max_nth, // int thread_limit; 3107 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3108 // for max_active_levels 3109 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3110 // {sched,chunk} pair 3111 __kmp_nested_proc_bind.bind_types[0], 3112 __kmp_default_device, 3113 NULL // struct kmp_internal_control *next; 3114 }; 3115 3116 return g_icvs; 3117 } 3118 3119 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3120 3121 kmp_internal_control_t gx_icvs; 3122 gx_icvs.serial_nesting_level = 3123 0; // probably =team->t.t_serial 
like in save_inter_controls 3124 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3125 gx_icvs.next = NULL; 3126 3127 return gx_icvs; 3128 } 3129 3130 static void __kmp_initialize_root(kmp_root_t *root) { 3131 int f; 3132 kmp_team_t *root_team; 3133 kmp_team_t *hot_team; 3134 int hot_team_max_nth; 3135 kmp_r_sched_t r_sched = 3136 __kmp_get_schedule_global(); // get current state of scheduling globals 3137 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3138 KMP_DEBUG_ASSERT(root); 3139 KMP_ASSERT(!root->r.r_begin); 3140 3141 /* setup the root state structure */ 3142 __kmp_init_lock(&root->r.r_begin_lock); 3143 root->r.r_begin = FALSE; 3144 root->r.r_active = FALSE; 3145 root->r.r_in_parallel = 0; 3146 root->r.r_blocktime = __kmp_dflt_blocktime; 3147 3148 /* setup the root team for this task */ 3149 /* allocate the root team structure */ 3150 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3151 3152 root_team = 3153 __kmp_allocate_team(root, 3154 1, // new_nproc 3155 1, // max_nproc 3156 #if OMPT_SUPPORT 3157 ompt_data_none, // root parallel id 3158 #endif 3159 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3160 0 // argc 3161 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3162 ); 3163 #if USE_DEBUGGER 3164 // Non-NULL value should be assigned to make the debugger display the root 3165 // team. 3166 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3167 #endif 3168 3169 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3170 3171 root->r.r_root_team = root_team; 3172 root_team->t.t_control_stack_top = NULL; 3173 3174 /* initialize root team */ 3175 root_team->t.t_threads[0] = NULL; 3176 root_team->t.t_nproc = 1; 3177 root_team->t.t_serialized = 1; 3178 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3179 root_team->t.t_sched.sched = r_sched.sched; 3180 KA_TRACE( 3181 20, 3182 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3183 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3184 3185 /* setup the hot team for this task */ 3186 /* allocate the hot team structure */ 3187 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3188 3189 hot_team = 3190 __kmp_allocate_team(root, 3191 1, // new_nproc 3192 __kmp_dflt_team_nth_ub * 2, // max_nproc 3193 #if OMPT_SUPPORT 3194 ompt_data_none, // root parallel id 3195 #endif 3196 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3197 0 // argc 3198 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3199 ); 3200 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3201 3202 root->r.r_hot_team = hot_team; 3203 root_team->t.t_control_stack_top = NULL; 3204 3205 /* first-time initialization */ 3206 hot_team->t.t_parent = root_team; 3207 3208 /* initialize hot team */ 3209 hot_team_max_nth = hot_team->t.t_max_nproc; 3210 for (f = 0; f < hot_team_max_nth; ++f) { 3211 hot_team->t.t_threads[f] = NULL; 3212 } 3213 hot_team->t.t_nproc = 1; 3214 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3215 hot_team->t.t_sched.sched = r_sched.sched; 3216 hot_team->t.t_size_changed = 0; 3217 } 3218 3219 #ifdef KMP_DEBUG 3220 3221 typedef struct kmp_team_list_item { 3222 kmp_team_p const *entry; 3223 struct kmp_team_list_item *next; 3224 } kmp_team_list_item_t; 3225 typedef kmp_team_list_item_t *kmp_team_list_t; 3226 3227 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3228 kmp_team_list_t list, // List of teams. 
3229 kmp_team_p const *team // Team to add. 3230 ) { 3231 3232 // List must terminate with item where both entry and next are NULL. 3233 // Team is added to the list only once. 3234 // List is sorted in ascending order by team id. 3235 // Team id is *not* a key. 3236 3237 kmp_team_list_t l; 3238 3239 KMP_DEBUG_ASSERT(list != NULL); 3240 if (team == NULL) { 3241 return; 3242 } 3243 3244 __kmp_print_structure_team_accum(list, team->t.t_parent); 3245 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3246 3247 // Search list for the team. 3248 l = list; 3249 while (l->next != NULL && l->entry != team) { 3250 l = l->next; 3251 } 3252 if (l->next != NULL) { 3253 return; // Team has been added before, exit. 3254 } 3255 3256 // Team is not found. Search list again for insertion point. 3257 l = list; 3258 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3259 l = l->next; 3260 } 3261 3262 // Insert team. 3263 { 3264 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3265 sizeof(kmp_team_list_item_t)); 3266 *item = *l; 3267 l->entry = team; 3268 l->next = item; 3269 } 3270 } 3271 3272 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3273 3274 ) { 3275 __kmp_printf("%s", title); 3276 if (team != NULL) { 3277 __kmp_printf("%2x %p\n", team->t.t_id, team); 3278 } else { 3279 __kmp_printf(" - (nil)\n"); 3280 } 3281 } 3282 3283 static void __kmp_print_structure_thread(char const *title, 3284 kmp_info_p const *thread) { 3285 __kmp_printf("%s", title); 3286 if (thread != NULL) { 3287 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3288 } else { 3289 __kmp_printf(" - (nil)\n"); 3290 } 3291 } 3292 3293 void __kmp_print_structure(void) { 3294 3295 kmp_team_list_t list; 3296 3297 // Initialize list of teams. 3298 list = 3299 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3300 list->entry = NULL; 3301 list->next = NULL; 3302 3303 __kmp_printf("\n------------------------------\nGlobal Thread " 3304 "Table\n------------------------------\n"); 3305 { 3306 int gtid; 3307 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3308 __kmp_printf("%2d", gtid); 3309 if (__kmp_threads != NULL) { 3310 __kmp_printf(" %p", __kmp_threads[gtid]); 3311 } 3312 if (__kmp_root != NULL) { 3313 __kmp_printf(" %p", __kmp_root[gtid]); 3314 } 3315 __kmp_printf("\n"); 3316 } 3317 } 3318 3319 // Print out __kmp_threads array. 
3320 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3321 "----------\n"); 3322 if (__kmp_threads != NULL) { 3323 int gtid; 3324 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3325 kmp_info_t const *thread = __kmp_threads[gtid]; 3326 if (thread != NULL) { 3327 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3328 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3329 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3330 __kmp_print_structure_team(" Serial Team: ", 3331 thread->th.th_serial_team); 3332 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3333 __kmp_print_structure_thread(" Master: ", 3334 thread->th.th_team_master); 3335 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3336 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3337 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3338 __kmp_print_structure_thread(" Next in pool: ", 3339 thread->th.th_next_pool); 3340 __kmp_printf("\n"); 3341 __kmp_print_structure_team_accum(list, thread->th.th_team); 3342 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3343 } 3344 } 3345 } else { 3346 __kmp_printf("Threads array is not allocated.\n"); 3347 } 3348 3349 // Print out __kmp_root array. 3350 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3351 "--------\n"); 3352 if (__kmp_root != NULL) { 3353 int gtid; 3354 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3355 kmp_root_t const *root = __kmp_root[gtid]; 3356 if (root != NULL) { 3357 __kmp_printf("GTID %2d %p:\n", gtid, root); 3358 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3359 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3360 __kmp_print_structure_thread(" Uber Thread: ", 3361 root->r.r_uber_thread); 3362 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3363 __kmp_printf(" In Parallel: %2d\n", 3364 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3365 __kmp_printf("\n"); 3366 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3367 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3368 } 3369 } 3370 } else { 3371 __kmp_printf("Ubers array is not allocated.\n"); 3372 } 3373 3374 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3375 "--------\n"); 3376 while (list->next != NULL) { 3377 kmp_team_p const *team = list->entry; 3378 int i; 3379 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3380 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3381 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3382 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3383 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3384 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3385 for (i = 0; i < team->t.t_nproc; ++i) { 3386 __kmp_printf(" Thread %2d: ", i); 3387 __kmp_print_structure_thread("", team->t.t_threads[i]); 3388 } 3389 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3390 __kmp_printf("\n"); 3391 list = list->next; 3392 } 3393 3394 // Print out __kmp_thread_pool and __kmp_team_pool. 3395 __kmp_printf("\n------------------------------\nPools\n----------------------" 3396 "--------\n"); 3397 __kmp_print_structure_thread("Thread pool: ", 3398 CCAST(kmp_info_t *, __kmp_thread_pool)); 3399 __kmp_print_structure_team("Team pool: ", 3400 CCAST(kmp_team_t *, __kmp_team_pool)); 3401 __kmp_printf("\n"); 3402 3403 // Free team list. 
3404 while (list != NULL) { 3405 kmp_team_list_item_t *item = list; 3406 list = list->next; 3407 KMP_INTERNAL_FREE(item); 3408 } 3409 } 3410 3411 #endif 3412 3413 //--------------------------------------------------------------------------- 3414 // Stuff for per-thread fast random number generator 3415 // Table of primes 3416 static const unsigned __kmp_primes[] = { 3417 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3418 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3419 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3420 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3421 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3422 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3423 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3424 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3425 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3426 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3427 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3428 3429 //--------------------------------------------------------------------------- 3430 // __kmp_get_random: Get a random number using a linear congruential method. 3431 unsigned short __kmp_get_random(kmp_info_t *thread) { 3432 unsigned x = thread->th.th_x; 3433 unsigned short r = (unsigned short)(x >> 16); 3434 3435 thread->th.th_x = x * thread->th.th_a + 1; 3436 3437 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3438 thread->th.th_info.ds.ds_tid, r)); 3439 3440 return r; 3441 } 3442 //-------------------------------------------------------- 3443 // __kmp_init_random: Initialize a random number generator 3444 void __kmp_init_random(kmp_info_t *thread) { 3445 unsigned seed = thread->th.th_info.ds.ds_tid; 3446 3447 thread->th.th_a = 3448 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3449 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3450 KA_TRACE(30, 3451 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3452 } 3453 3454 #if KMP_OS_WINDOWS 3455 /* reclaim array entries for root threads that are already dead, returns number 3456 * reclaimed */ 3457 static int __kmp_reclaim_dead_roots(void) { 3458 int i, r = 0; 3459 3460 for (i = 0; i < __kmp_threads_capacity; ++i) { 3461 if (KMP_UBER_GTID(i) && 3462 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3463 !__kmp_root[i] 3464 ->r.r_active) { // AC: reclaim only roots died in non-active state 3465 r += __kmp_unregister_root_other_thread(i); 3466 } 3467 } 3468 return r; 3469 } 3470 #endif 3471 3472 /* This function attempts to create free entries in __kmp_threads and 3473 __kmp_root, and returns the number of free entries generated. 3474 3475 For Windows* OS static library, the first mechanism used is to reclaim array 3476 entries for root threads that are already dead. 3477 3478 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3479 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3480 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3481 threadprivate cache array has been created. Synchronization with 3482 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3483 3484 After any dead root reclamation, if the clipping value allows array expansion 3485 to result in the generation of a total of nNeed free slots, the function does 3486 that expansion. If not, nothing is done beyond the possible initial root 3487 thread reclamation. 3488 3489 If any argument is negative, the behavior is undefined. */ 3490 static int __kmp_expand_threads(int nNeed) { 3491 int added = 0; 3492 int minimumRequiredCapacity; 3493 int newCapacity; 3494 kmp_info_t **newThreads; 3495 kmp_root_t **newRoot; 3496 3497 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3498 // resizing __kmp_threads does not need additional protection if foreign 3499 // threads are present 3500 3501 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3502 /* only for Windows static library */ 3503 /* reclaim array entries for root threads that are already dead */ 3504 added = __kmp_reclaim_dead_roots(); 3505 3506 if (nNeed) { 3507 nNeed -= added; 3508 if (nNeed < 0) 3509 nNeed = 0; 3510 } 3511 #endif 3512 if (nNeed <= 0) 3513 return added; 3514 3515 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3516 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3517 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3518 // > __kmp_max_nth in one of two ways: 3519 // 3520 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3521 // may not be reused by another thread, so we may need to increase 3522 // __kmp_threads_capacity to __kmp_max_nth + 1. 3523 // 3524 // 2) New foreign root(s) are encountered. We always register new foreign 3525 // roots. This may cause a smaller # of threads to be allocated at 3526 // subsequent parallel regions, but the worker threads hang around (and 3527 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3528 // 3529 // Anyway, that is the reason for moving the check to see if 3530 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3531 // instead of having it performed here. -BB 3532 3533 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3534 3535 /* compute expansion headroom to check if we can expand */ 3536 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3537 /* possible expansion too small -- give up */ 3538 return added; 3539 } 3540 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3541 3542 newCapacity = __kmp_threads_capacity; 3543 do { 3544 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1)
3545 : __kmp_sys_max_nth;
3546 } while (newCapacity < minimumRequiredCapacity);
3547 newThreads = (kmp_info_t **)__kmp_allocate(
3548 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3549 newRoot =
3550 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3551 KMP_MEMCPY(newThreads, __kmp_threads,
3552 __kmp_threads_capacity * sizeof(kmp_info_t *));
3553 KMP_MEMCPY(newRoot, __kmp_root,
3554 __kmp_threads_capacity * sizeof(kmp_root_t *));
3555
3556 kmp_info_t **temp_threads = __kmp_threads;
3557 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3558 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3559 __kmp_free(temp_threads);
3560 added += newCapacity - __kmp_threads_capacity;
3561 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3562
3563 if (newCapacity > __kmp_tp_capacity) {
3564 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3565 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3566 __kmp_threadprivate_resize_cache(newCapacity);
3567 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3568 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3569 }
3570 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3571 }
3572
3573 return added;
3574 }
3575
3576 /* Register the current thread as a root thread and obtain our gtid. We must
3577 have the __kmp_initz_lock held at this point. Argument TRUE only if we are the
3578 thread that calls from __kmp_do_serial_initialize() */
3579 int __kmp_register_root(int initial_thread) {
3580 kmp_info_t *root_thread;
3581 kmp_root_t *root;
3582 int gtid;
3583 int capacity;
3584 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3585 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3586 KMP_MB();
3587
3588 /* 2007-03-02:
3589 If initial thread did not invoke OpenMP RTL yet, and this thread is not an
3590 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not
3591 work as expected -- it may return false (that means there is at least one
3592 empty slot in __kmp_threads array), but it is possible the only free slot
3593 is #0, which is reserved for initial thread and so cannot be used for this
3594 one. The following code works around this bug.
3595
3596 However, the right solution seems to be not reserving slot #0 for the initial
3597 thread, because:
3598 (1) there is no magic in slot #0,
3599 (2) we cannot detect initial thread reliably (the first thread which does
3600 serial initialization may not be a real initial thread).
3601 */
3602 capacity = __kmp_threads_capacity;
3603 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3604 --capacity;
3605 }
3606
3607 /* see if there are too many threads */
3608 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
3609 if (__kmp_tp_cached) {
3610 __kmp_fatal(KMP_MSG(CantRegisterNewThread),
3611 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
3612 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
3613 } else {
3614 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
3615 __kmp_msg_null);
3616 }
3617 }
3618
3619 // When hidden helper task is enabled, __kmp_threads is organized as follows:
3620 // 0: initial thread, also a regular OpenMP thread.
3621 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
3622 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
3623 // regular OpenMP threads.
3624 if (TCR_4(__kmp_init_hidden_helper_threads)) {
3625 // Find an available thread slot for hidden helper thread.
Slots for hidden 3626 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3627 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3628 gtid <= __kmp_hidden_helper_threads_num; 3629 gtid++) 3630 ; 3631 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3632 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3633 "hidden helper thread: T#%d\n", 3634 gtid)); 3635 } else { 3636 /* find an available thread slot */ 3637 // Don't reassign the zero slot since we need that to only be used by 3638 // initial thread. Slots for hidden helper threads should also be skipped. 3639 if (initial_thread && __kmp_threads[0] == NULL) { 3640 gtid = 0; 3641 } else { 3642 for (gtid = __kmp_hidden_helper_threads_num + 1; 3643 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3644 ; 3645 } 3646 KA_TRACE( 3647 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3648 KMP_ASSERT(gtid < __kmp_threads_capacity); 3649 } 3650 3651 /* update global accounting */ 3652 __kmp_all_nth++; 3653 TCW_4(__kmp_nth, __kmp_nth + 1); 3654 3655 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3656 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3657 if (__kmp_adjust_gtid_mode) { 3658 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3659 if (TCR_4(__kmp_gtid_mode) != 2) { 3660 TCW_4(__kmp_gtid_mode, 2); 3661 } 3662 } else { 3663 if (TCR_4(__kmp_gtid_mode) != 1) { 3664 TCW_4(__kmp_gtid_mode, 1); 3665 } 3666 } 3667 } 3668 3669 #ifdef KMP_ADJUST_BLOCKTIME 3670 /* Adjust blocktime to zero if necessary */ 3671 /* Middle initialization might not have occurred yet */ 3672 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3673 if (__kmp_nth > __kmp_avail_proc) { 3674 __kmp_zero_bt = TRUE; 3675 } 3676 } 3677 #endif /* KMP_ADJUST_BLOCKTIME */ 3678 3679 /* setup this new hierarchy */ 3680 if (!(root = __kmp_root[gtid])) { 3681 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3682 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3683 } 3684 3685 #if KMP_STATS_ENABLED 3686 // Initialize stats as soon as possible (right after gtid assignment). 
3687 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3688 __kmp_stats_thread_ptr->startLife(); 3689 KMP_SET_THREAD_STATE(SERIAL_REGION); 3690 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3691 #endif 3692 __kmp_initialize_root(root); 3693 3694 /* setup new root thread structure */ 3695 if (root->r.r_uber_thread) { 3696 root_thread = root->r.r_uber_thread; 3697 } else { 3698 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3699 if (__kmp_storage_map) { 3700 __kmp_print_thread_storage_map(root_thread, gtid); 3701 } 3702 root_thread->th.th_info.ds.ds_gtid = gtid; 3703 #if OMPT_SUPPORT 3704 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3705 #endif 3706 root_thread->th.th_root = root; 3707 if (__kmp_env_consistency_check) { 3708 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3709 } 3710 #if USE_FAST_MEMORY 3711 __kmp_initialize_fast_memory(root_thread); 3712 #endif /* USE_FAST_MEMORY */ 3713 3714 #if KMP_USE_BGET 3715 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3716 __kmp_initialize_bget(root_thread); 3717 #endif 3718 __kmp_init_random(root_thread); // Initialize random number generator 3719 } 3720 3721 /* setup the serial team held in reserve by the root thread */ 3722 if (!root_thread->th.th_serial_team) { 3723 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3724 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3725 root_thread->th.th_serial_team = __kmp_allocate_team( 3726 root, 1, 1, 3727 #if OMPT_SUPPORT 3728 ompt_data_none, // root parallel id 3729 #endif 3730 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3731 } 3732 KMP_ASSERT(root_thread->th.th_serial_team); 3733 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3734 root_thread->th.th_serial_team)); 3735 3736 /* drop root_thread into place */ 3737 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3738 3739 root->r.r_root_team->t.t_threads[0] = root_thread; 3740 root->r.r_hot_team->t.t_threads[0] = root_thread; 3741 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3742 // AC: the team created in reserve, not for execution (it is unused for now). 3743 root_thread->th.th_serial_team->t.t_serialized = 0; 3744 root->r.r_uber_thread = root_thread; 3745 3746 /* initialize the thread, get it ready to go */ 3747 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3748 TCW_4(__kmp_init_gtid, TRUE); 3749 3750 /* prepare the master thread for get_gtid() */ 3751 __kmp_gtid_set_specific(gtid); 3752 3753 #if USE_ITT_BUILD 3754 __kmp_itt_thread_name(gtid); 3755 #endif /* USE_ITT_BUILD */ 3756 3757 #ifdef KMP_TDATA_GTID 3758 __kmp_gtid = gtid; 3759 #endif 3760 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3761 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3762 3763 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3764 "plain=%u\n", 3765 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3766 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3767 KMP_INIT_BARRIER_STATE)); 3768 { // Initialize barrier data. 
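// Every barrier type up to bs_last_barrier gets its arrived counter reset to
// KMP_INIT_BARRIER_STATE so the root starts from a clean barrier state; the
// debugger-only per-worker arrived counter is cleared as well.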
3769 int b;
3770 for (b = 0; b < bs_last_barrier; ++b) {
3771 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3772 #if USE_DEBUGGER
3773 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3774 #endif
3775 }
3776 }
3777 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3778 KMP_INIT_BARRIER_STATE);
3779
3780 #if KMP_AFFINITY_SUPPORTED
3781 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3782 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3783 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3784 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3785 if (TCR_4(__kmp_init_middle)) {
3786 __kmp_affinity_set_init_mask(gtid, TRUE);
3787 }
3788 #endif /* KMP_AFFINITY_SUPPORTED */
3789 root_thread->th.th_def_allocator = __kmp_def_allocator;
3790 root_thread->th.th_prev_level = 0;
3791 root_thread->th.th_prev_num_threads = 1;
3792
3793 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3794 tmp->cg_root = root_thread;
3795 tmp->cg_thread_limit = __kmp_cg_max_nth;
3796 tmp->cg_nthreads = 1;
3797 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3798 " cg_nthreads init to 1\n",
3799 root_thread, tmp));
3800 tmp->up = NULL;
3801 root_thread->th.th_cg_roots = tmp;
3802
3803 __kmp_root_counter++;
3804
3805 #if OMPT_SUPPORT
3806 if (!initial_thread && ompt_enabled.enabled) {
3807
3808 kmp_info_t *root_thread = ompt_get_thread();
3809
3810 ompt_set_thread_state(root_thread, ompt_state_overhead);
3811
3812 if (ompt_enabled.ompt_callback_thread_begin) {
3813 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3814 ompt_thread_initial, __ompt_get_thread_data_internal());
3815 }
3816 ompt_data_t *task_data;
3817 ompt_data_t *parallel_data;
3818 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3819 NULL);
3820 if (ompt_enabled.ompt_callback_implicit_task) {
3821 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3822 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3823 }
3824
3825 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3826 }
3827 #endif
3828
3829 KMP_MB();
3830 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3831
3832 return gtid;
3833 }
3834
3835 #if KMP_NESTED_HOT_TEAMS
3836 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3837 const int max_level) {
3838 int i, n, nth;
3839 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3840 if (!hot_teams || !hot_teams[level].hot_team) {
3841 return 0;
3842 }
3843 KMP_DEBUG_ASSERT(level < max_level);
3844 kmp_team_t *team = hot_teams[level].hot_team;
3845 nth = hot_teams[level].hot_team_nth;
3846 n = nth - 1; // master is not freed
3847 if (level < max_level - 1) {
3848 for (i = 0; i < nth; ++i) {
3849 kmp_info_t *th = team->t.t_threads[i];
3850 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3851 if (i > 0 && th->th.th_hot_teams) {
3852 __kmp_free(th->th.th_hot_teams);
3853 th->th.th_hot_teams = NULL;
3854 }
3855 }
3856 }
3857 __kmp_free_team(root, team, NULL);
3858 return n;
3859 }
3860 #endif
3861
3862 // Resets a root thread and clears its root and hot teams.
3863 // Returns the number of __kmp_threads entries directly and indirectly freed.
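// Rough teardown order, as implemented below: free the root team, then any
// nested hot teams and the hot team itself, wait for task teams to drop their
// references, fire the OMPT implicit-task and thread-end callbacks, release
// the contention-group root if this thread was its last member, and finally
// reap the uber thread.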
3864 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3865 kmp_team_t *root_team = root->r.r_root_team;
3866 kmp_team_t *hot_team = root->r.r_hot_team;
3867 int n = hot_team->t.t_nproc;
3868 int i;
3869
3870 KMP_DEBUG_ASSERT(!root->r.r_active);
3871
3872 root->r.r_root_team = NULL;
3873 root->r.r_hot_team = NULL;
3874 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3875 // before the call to __kmp_free_team().
3876 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3877 #if KMP_NESTED_HOT_TEAMS
3878 if (__kmp_hot_teams_max_level >
3879 0) { // need to free nested hot teams and their threads if any
3880 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3881 kmp_info_t *th = hot_team->t.t_threads[i];
3882 if (__kmp_hot_teams_max_level > 1) {
3883 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3884 }
3885 if (th->th.th_hot_teams) {
3886 __kmp_free(th->th.th_hot_teams);
3887 th->th.th_hot_teams = NULL;
3888 }
3889 }
3890 }
3891 #endif
3892 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3893
3894 // Before we can reap the thread, we need to make certain that all other
3895 // threads in the teams that had this root as ancestor have stopped trying to
3896 // steal tasks.
3897 if (__kmp_tasking_mode != tskm_immediate_exec) {
3898 __kmp_wait_to_unref_task_teams();
3899 }
3900
3901 #if KMP_OS_WINDOWS
3902 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3903 KA_TRACE(
3904 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3905 "\n",
3906 (LPVOID) & (root->r.r_uber_thread->th),
3907 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3908 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3909 #endif /* KMP_OS_WINDOWS */
3910
3911 #if OMPT_SUPPORT
3912 ompt_data_t *task_data;
3913 ompt_data_t *parallel_data;
3914 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3915 NULL);
3916 if (ompt_enabled.ompt_callback_implicit_task) {
3917 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3918 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3919 }
3920 if (ompt_enabled.ompt_callback_thread_end) {
3921 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3922 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3923 }
3924 #endif
3925
3926 TCW_4(__kmp_nth,
3927 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3928 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3929 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3930 " to %d\n",
3931 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3932 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3933 if (i == 1) {
3934 // need to free contention group structure
3935 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3936 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3937 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3938 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3939 root->r.r_uber_thread->th.th_cg_roots = NULL;
3940 }
3941 __kmp_reap_thread(root->r.r_uber_thread, 1);
3942
3943 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3944 // it instead of freeing it.
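// Only bookkeeping remains below: detach the uber thread, mark the root as no
// longer in use, and return the number of freed __kmp_threads entries (used by
// __kmp_unregister_root_other_thread(); ignored by
// __kmp_unregister_root_current_thread()).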
3945 root->r.r_uber_thread = NULL; 3946 /* mark root as no longer in use */ 3947 root->r.r_begin = FALSE; 3948 3949 return n; 3950 } 3951 3952 void __kmp_unregister_root_current_thread(int gtid) { 3953 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3954 /* this lock should be ok, since unregister_root_current_thread is never 3955 called during an abort, only during a normal close. furthermore, if you 3956 have the forkjoin lock, you should never try to get the initz lock */ 3957 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3958 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3959 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3960 "exiting T#%d\n", 3961 gtid)); 3962 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3963 return; 3964 } 3965 kmp_root_t *root = __kmp_root[gtid]; 3966 3967 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3968 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3969 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3970 KMP_ASSERT(root->r.r_active == FALSE); 3971 3972 KMP_MB(); 3973 3974 kmp_info_t *thread = __kmp_threads[gtid]; 3975 kmp_team_t *team = thread->th.th_team; 3976 kmp_task_team_t *task_team = thread->th.th_task_team; 3977 3978 // we need to wait for the proxy tasks before finishing the thread 3979 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3980 #if OMPT_SUPPORT 3981 // the runtime is shutting down so we won't report any events 3982 thread->th.ompt_thread_info.state = ompt_state_undefined; 3983 #endif 3984 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3985 } 3986 3987 __kmp_reset_root(gtid, root); 3988 3989 KMP_MB(); 3990 KC_TRACE(10, 3991 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3992 3993 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3994 } 3995 3996 #if KMP_OS_WINDOWS 3997 /* __kmp_forkjoin_lock must be already held 3998 Unregisters a root thread that is not the current thread. Returns the number 3999 of __kmp_threads entries freed as a result. 
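Only compiled on Windows, where __kmp_reclaim_dead_roots() uses it to
recycle the slots of root threads that have already died in a non-active
state.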
*/ 4000 static int __kmp_unregister_root_other_thread(int gtid) { 4001 kmp_root_t *root = __kmp_root[gtid]; 4002 int r; 4003 4004 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4005 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4006 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4007 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4008 KMP_ASSERT(root->r.r_active == FALSE); 4009 4010 r = __kmp_reset_root(gtid, root); 4011 KC_TRACE(10, 4012 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4013 return r; 4014 } 4015 #endif 4016 4017 #if KMP_DEBUG 4018 void __kmp_task_info() { 4019 4020 kmp_int32 gtid = __kmp_entry_gtid(); 4021 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4022 kmp_info_t *this_thr = __kmp_threads[gtid]; 4023 kmp_team_t *steam = this_thr->th.th_serial_team; 4024 kmp_team_t *team = this_thr->th.th_team; 4025 4026 __kmp_printf( 4027 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4028 "ptask=%p\n", 4029 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4030 team->t.t_implicit_task_taskdata[tid].td_parent); 4031 } 4032 #endif // KMP_DEBUG 4033 4034 /* TODO optimize with one big memclr, take out what isn't needed, split 4035 responsibility to workers as much as possible, and delay initialization of 4036 features as much as possible */ 4037 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4038 int tid, int gtid) { 4039 /* this_thr->th.th_info.ds.ds_gtid is setup in 4040 kmp_allocate_thread/create_worker. 4041 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4042 kmp_info_t *master = team->t.t_threads[0]; 4043 KMP_DEBUG_ASSERT(this_thr != NULL); 4044 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4045 KMP_DEBUG_ASSERT(team); 4046 KMP_DEBUG_ASSERT(team->t.t_threads); 4047 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4048 KMP_DEBUG_ASSERT(master); 4049 KMP_DEBUG_ASSERT(master->th.th_root); 4050 4051 KMP_MB(); 4052 4053 TCW_SYNC_PTR(this_thr->th.th_team, team); 4054 4055 this_thr->th.th_info.ds.ds_tid = tid; 4056 this_thr->th.th_set_nproc = 0; 4057 if (__kmp_tasking_mode != tskm_immediate_exec) 4058 // When tasking is possible, threads are not safe to reap until they are 4059 // done tasking; this will be set when tasking code is exited in wait 4060 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4061 else // no tasking --> always safe to reap 4062 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4063 this_thr->th.th_set_proc_bind = proc_bind_default; 4064 #if KMP_AFFINITY_SUPPORTED 4065 this_thr->th.th_new_place = this_thr->th.th_current_place; 4066 #endif 4067 this_thr->th.th_root = master->th.th_root; 4068 4069 /* setup the thread's cache of the team structure */ 4070 this_thr->th.th_team_nproc = team->t.t_nproc; 4071 this_thr->th.th_team_master = master; 4072 this_thr->th.th_team_serialized = team->t.t_serialized; 4073 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4074 4075 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4076 4077 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4078 tid, gtid, this_thr, this_thr->th.th_current_task)); 4079 4080 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4081 team, tid, TRUE); 4082 4083 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4084 tid, gtid, this_thr, this_thr->th.th_current_task)); 4085 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4086 // __kmp_initialize_team()? 
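// The rest of __kmp_initialize_info() wires up the per-thread state that
// depends on the team slot: the dispatch pointer into team->t.t_dispatch[tid],
// the thread's private common table, the contention-group root adopted from
// the master (together with its thread_limit), the dynamic dispatch buffers,
// and the task-state memo stack.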
4087 4088 /* TODO no worksharing in speculative threads */ 4089 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4090 4091 this_thr->th.th_local.this_construct = 0; 4092 4093 if (!this_thr->th.th_pri_common) { 4094 this_thr->th.th_pri_common = 4095 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4096 if (__kmp_storage_map) { 4097 __kmp_print_storage_map_gtid( 4098 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4099 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4100 } 4101 this_thr->th.th_pri_head = NULL; 4102 } 4103 4104 if (this_thr != master && // Master's CG root is initialized elsewhere 4105 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4106 // Make new thread's CG root same as master's 4107 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4108 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4109 if (tmp) { 4110 // worker changes CG, need to check if old CG should be freed 4111 int i = tmp->cg_nthreads--; 4112 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4113 " on node %p of thread %p to %d\n", 4114 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4115 if (i == 1) { 4116 __kmp_free(tmp); // last thread left CG --> free it 4117 } 4118 } 4119 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4120 // Increment new thread's CG root's counter to add the new thread 4121 this_thr->th.th_cg_roots->cg_nthreads++; 4122 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4123 " node %p of thread %p to %d\n", 4124 this_thr, this_thr->th.th_cg_roots, 4125 this_thr->th.th_cg_roots->cg_root, 4126 this_thr->th.th_cg_roots->cg_nthreads)); 4127 this_thr->th.th_current_task->td_icvs.thread_limit = 4128 this_thr->th.th_cg_roots->cg_thread_limit; 4129 } 4130 4131 /* Initialize dynamic dispatch */ 4132 { 4133 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4134 // Use team max_nproc since this will never change for the team. 4135 size_t disp_size = 4136 sizeof(dispatch_private_info_t) * 4137 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4138 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4139 team->t.t_max_nproc)); 4140 KMP_ASSERT(dispatch); 4141 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4142 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4143 4144 dispatch->th_disp_index = 0; 4145 dispatch->th_doacross_buf_idx = 0; 4146 if (!dispatch->th_disp_buffer) { 4147 dispatch->th_disp_buffer = 4148 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4149 4150 if (__kmp_storage_map) { 4151 __kmp_print_storage_map_gtid( 4152 gtid, &dispatch->th_disp_buffer[0], 4153 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4154 ? 
1 4155 : __kmp_dispatch_num_buffers], 4156 disp_size, 4157 "th_%d.th_dispatch.th_disp_buffer " 4158 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4159 gtid, team->t.t_id, gtid); 4160 } 4161 } else { 4162 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4163 } 4164 4165 dispatch->th_dispatch_pr_current = 0; 4166 dispatch->th_dispatch_sh_current = 0; 4167 4168 dispatch->th_deo_fcn = 0; /* ORDERED */ 4169 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4170 } 4171 4172 this_thr->th.th_next_pool = NULL; 4173 4174 if (!this_thr->th.th_task_state_memo_stack) { 4175 size_t i; 4176 this_thr->th.th_task_state_memo_stack = 4177 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4178 this_thr->th.th_task_state_top = 0; 4179 this_thr->th.th_task_state_stack_sz = 4; 4180 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4181 ++i) // zero init the stack 4182 this_thr->th.th_task_state_memo_stack[i] = 0; 4183 } 4184 4185 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4186 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4187 4188 KMP_MB(); 4189 } 4190 4191 /* allocate a new thread for the requesting team. this is only called from 4192 within a forkjoin critical section. we will first try to get an available 4193 thread from the thread pool. if none is available, we will fork a new one 4194 assuming we are able to create a new one. this should be assured, as the 4195 caller should check on this first. */ 4196 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4197 int new_tid) { 4198 kmp_team_t *serial_team; 4199 kmp_info_t *new_thr; 4200 int new_gtid; 4201 4202 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4203 KMP_DEBUG_ASSERT(root && team); 4204 #if !KMP_NESTED_HOT_TEAMS 4205 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4206 #endif 4207 KMP_MB(); 4208 4209 /* first, try to get one from the thread pool */ 4210 if (__kmp_thread_pool) { 4211 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4212 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4213 if (new_thr == __kmp_thread_pool_insert_pt) { 4214 __kmp_thread_pool_insert_pt = NULL; 4215 } 4216 TCW_4(new_thr->th.th_in_pool, FALSE); 4217 __kmp_suspend_initialize_thread(new_thr); 4218 __kmp_lock_suspend_mx(new_thr); 4219 if (new_thr->th.th_active_in_pool == TRUE) { 4220 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4221 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4222 new_thr->th.th_active_in_pool = FALSE; 4223 } 4224 __kmp_unlock_suspend_mx(new_thr); 4225 4226 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4227 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4228 KMP_ASSERT(!new_thr->th.th_team); 4229 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4230 4231 /* setup the thread structure */ 4232 __kmp_initialize_info(new_thr, team, new_tid, 4233 new_thr->th.th_info.ds.ds_gtid); 4234 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4235 4236 TCW_4(__kmp_nth, __kmp_nth + 1); 4237 4238 new_thr->th.th_task_state = 0; 4239 new_thr->th.th_task_state_top = 0; 4240 new_thr->th.th_task_state_stack_sz = 4; 4241 4242 #ifdef KMP_ADJUST_BLOCKTIME 4243 /* Adjust blocktime back to zero if necessary */ 4244 /* Middle initialization might not have occurred yet */ 4245 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4246 if (__kmp_nth > __kmp_avail_proc) { 4247 __kmp_zero_bt = TRUE; 4248 } 4249 } 4250 #endif /* KMP_ADJUST_BLOCKTIME */ 4251 4252 #if KMP_DEBUG 4253 // If thread entered pool via __kmp_free_thread, wait_flag should != 4254 // KMP_BARRIER_PARENT_FLAG. 
4255 int b; 4256 kmp_balign_t *balign = new_thr->th.th_bar; 4257 for (b = 0; b < bs_last_barrier; ++b) 4258 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4259 #endif 4260 4261 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4262 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4263 4264 KMP_MB(); 4265 return new_thr; 4266 } 4267 4268 /* no, well fork a new one */ 4269 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4270 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4271 4272 #if KMP_USE_MONITOR 4273 // If this is the first worker thread the RTL is creating, then also 4274 // launch the monitor thread. We try to do this as early as possible. 4275 if (!TCR_4(__kmp_init_monitor)) { 4276 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4277 if (!TCR_4(__kmp_init_monitor)) { 4278 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4279 TCW_4(__kmp_init_monitor, 1); 4280 __kmp_create_monitor(&__kmp_monitor); 4281 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4282 #if KMP_OS_WINDOWS 4283 // AC: wait until monitor has started. This is a fix for CQ232808. 4284 // The reason is that if the library is loaded/unloaded in a loop with 4285 // small (parallel) work in between, then there is high probability that 4286 // monitor thread started after the library shutdown. At shutdown it is 4287 // too late to cope with the problem, because when the master is in 4288 // DllMain (process detach) the monitor has no chances to start (it is 4289 // blocked), and master has no means to inform the monitor that the 4290 // library has gone, because all the memory which the monitor can access 4291 // is going to be released/reset. 4292 while (TCR_4(__kmp_init_monitor) < 2) { 4293 KMP_YIELD(TRUE); 4294 } 4295 KF_TRACE(10, ("after monitor thread has started\n")); 4296 #endif 4297 } 4298 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4299 } 4300 #endif 4301 4302 KMP_MB(); 4303 4304 { 4305 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4306 ? 1 4307 : __kmp_hidden_helper_threads_num + 1; 4308 4309 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4310 ++new_gtid) { 4311 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4312 } 4313 4314 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4315 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4316 } 4317 } 4318 4319 /* allocate space for it. 
*/ 4320 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4321 4322 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4323 4324 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4325 // suppress race conditions detection on synchronization flags in debug mode 4326 // this helps to analyze library internals eliminating false positives 4327 __itt_suppress_mark_range( 4328 __itt_suppress_range, __itt_suppress_threading_errors, 4329 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4330 __itt_suppress_mark_range( 4331 __itt_suppress_range, __itt_suppress_threading_errors, 4332 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4333 #if KMP_OS_WINDOWS 4334 __itt_suppress_mark_range( 4335 __itt_suppress_range, __itt_suppress_threading_errors, 4336 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4337 #else 4338 __itt_suppress_mark_range(__itt_suppress_range, 4339 __itt_suppress_threading_errors, 4340 &new_thr->th.th_suspend_init_count, 4341 sizeof(new_thr->th.th_suspend_init_count)); 4342 #endif 4343 // TODO: check if we need to also suppress b_arrived flags 4344 __itt_suppress_mark_range(__itt_suppress_range, 4345 __itt_suppress_threading_errors, 4346 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4347 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4348 __itt_suppress_mark_range(__itt_suppress_range, 4349 __itt_suppress_threading_errors, 4350 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4351 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4352 __itt_suppress_mark_range(__itt_suppress_range, 4353 __itt_suppress_threading_errors, 4354 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4355 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4356 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4357 if (__kmp_storage_map) { 4358 __kmp_print_thread_storage_map(new_thr, new_gtid); 4359 } 4360 4361 // add the reserve serialized team, initialized from the team's master thread 4362 { 4363 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4364 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4365 new_thr->th.th_serial_team = serial_team = 4366 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4367 #if OMPT_SUPPORT 4368 ompt_data_none, // root parallel id 4369 #endif 4370 proc_bind_default, &r_icvs, 4371 0 USE_NESTED_HOT_ARG(NULL)); 4372 } 4373 KMP_ASSERT(serial_team); 4374 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4375 // execution (it is unused for now). 
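// Slot 0 of the reserve serial team is the new worker itself; its ICVs were
// copied from the requesting team's master via __kmp_get_x_global_icvs()
// above.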
4376 serial_team->t.t_threads[0] = new_thr;
4377 KF_TRACE(10,
4378 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
4379 new_thr));
4380
4381 /* setup the thread structures */
4382 __kmp_initialize_info(new_thr, team, new_tid, new_gtid);
4383
4384 #if USE_FAST_MEMORY
4385 __kmp_initialize_fast_memory(new_thr);
4386 #endif /* USE_FAST_MEMORY */
4387
4388 #if KMP_USE_BGET
4389 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
4390 __kmp_initialize_bget(new_thr);
4391 #endif
4392
4393 __kmp_init_random(new_thr); // Initialize random number generator
4394
4395 /* Initialize these only once when thread is grabbed for a team allocation */
4396 KA_TRACE(20,
4397 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
4398 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));
4399
4400 int b;
4401 kmp_balign_t *balign = new_thr->th.th_bar;
4402 for (b = 0; b < bs_last_barrier; ++b) {
4403 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
4404 balign[b].bb.team = NULL;
4405 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
4406 balign[b].bb.use_oncore_barrier = 0;
4407 }
4408
4409 new_thr->th.th_spin_here = FALSE;
4410 new_thr->th.th_next_waiting = 0;
4411 #if KMP_OS_UNIX
4412 new_thr->th.th_blocking = false;
4413 #endif
4414
4415 #if KMP_AFFINITY_SUPPORTED
4416 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
4417 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
4418 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
4419 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
4420 #endif
4421 new_thr->th.th_def_allocator = __kmp_def_allocator;
4422 new_thr->th.th_prev_level = 0;
4423 new_thr->th.th_prev_num_threads = 1;
4424
4425 TCW_4(new_thr->th.th_in_pool, FALSE);
4426 new_thr->th.th_active_in_pool = FALSE;
4427 TCW_4(new_thr->th.th_active, TRUE);
4428
4429 /* adjust the global counters */
4430 __kmp_all_nth++;
4431 __kmp_nth++;
4432
4433 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
4434 // numbers of procs, and method #2 (keyed API call) for higher numbers.
4435 if (__kmp_adjust_gtid_mode) {
4436 if (__kmp_all_nth >= __kmp_tls_gtid_min) {
4437 if (TCR_4(__kmp_gtid_mode) != 2) {
4438 TCW_4(__kmp_gtid_mode, 2);
4439 }
4440 } else {
4441 if (TCR_4(__kmp_gtid_mode) != 1) {
4442 TCW_4(__kmp_gtid_mode, 1);
4443 }
4444 }
4445 }
4446
4447 #ifdef KMP_ADJUST_BLOCKTIME
4448 /* Adjust blocktime back to zero if necessary */
4449 /* Middle initialization might not have occurred yet */
4450 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
4451 if (__kmp_nth > __kmp_avail_proc) {
4452 __kmp_zero_bt = TRUE;
4453 }
4454 }
4455 #endif /* KMP_ADJUST_BLOCKTIME */
4456
4457 /* actually fork it and create the new worker thread */
4458 KF_TRACE(
4459 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
4460 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
4461 KF_TRACE(10,
4462 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));
4463
4464 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
4465 new_gtid));
4466 KMP_MB();
4467 return new_thr;
4468 }
4469
4470 /* Reinitialize team for reuse.
4471 The hot team code calls this at every fork barrier, so the EPCC barrier
4472 tests are extremely sensitive to changes in it, esp. writes to the team
4473 struct, which cause a cache invalidation in all threads.
4474 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
*/ 4475 static void __kmp_reinitialize_team(kmp_team_t *team, 4476 kmp_internal_control_t *new_icvs, 4477 ident_t *loc) { 4478 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4479 team->t.t_threads[0], team)); 4480 KMP_DEBUG_ASSERT(team && new_icvs); 4481 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4482 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4483 4484 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4485 // Copy ICVs to the master thread's implicit taskdata 4486 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4487 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4488 4489 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4490 team->t.t_threads[0], team)); 4491 } 4492 4493 /* Initialize the team data structure. 4494 This assumes the t_threads and t_max_nproc are already set. 4495 Also, we don't touch the arguments */ 4496 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4497 kmp_internal_control_t *new_icvs, 4498 ident_t *loc) { 4499 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4500 4501 /* verify */ 4502 KMP_DEBUG_ASSERT(team); 4503 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4504 KMP_DEBUG_ASSERT(team->t.t_threads); 4505 KMP_MB(); 4506 4507 team->t.t_master_tid = 0; /* not needed */ 4508 /* team->t.t_master_bar; not needed */ 4509 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4510 team->t.t_nproc = new_nproc; 4511 4512 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4513 team->t.t_next_pool = NULL; 4514 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4515 * up hot team */ 4516 4517 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4518 team->t.t_invoke = NULL; /* not needed */ 4519 4520 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4521 team->t.t_sched.sched = new_icvs->sched.sched; 4522 4523 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4524 team->t.t_fp_control_saved = FALSE; /* not needed */ 4525 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4526 team->t.t_mxcsr = 0; /* not needed */ 4527 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4528 4529 team->t.t_construct = 0; 4530 4531 team->t.t_ordered.dt.t_value = 0; 4532 team->t.t_master_active = FALSE; 4533 4534 #ifdef KMP_DEBUG 4535 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4536 #endif 4537 #if KMP_OS_WINDOWS 4538 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4539 #endif 4540 4541 team->t.t_control_stack_top = NULL; 4542 4543 __kmp_reinitialize_team(team, new_icvs, loc); 4544 4545 KMP_MB(); 4546 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4547 } 4548 4549 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4550 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4551 static void 4552 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4553 if (KMP_AFFINITY_CAPABLE()) { 4554 int status; 4555 if (old_mask != NULL) { 4556 status = __kmp_get_system_affinity(old_mask, TRUE); 4557 int error = errno; 4558 if (status != 0) { 4559 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4560 __kmp_msg_null); 4561 } 4562 } 4563 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4564 } 4565 } 4566 #endif 4567 4568 #if KMP_AFFINITY_SUPPORTED 4569 4570 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4571 // It calculates the worker + master thread's partition based upon the parent 4572 // thread's partition, and binds each worker to a thread in their partition. 4573 // The master thread's partition should already include its current binding. 4574 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4575 // Copy the master thread's place partition to the team struct 4576 kmp_info_t *master_th = team->t.t_threads[0]; 4577 KMP_DEBUG_ASSERT(master_th != NULL); 4578 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4579 int first_place = master_th->th.th_first_place; 4580 int last_place = master_th->th.th_last_place; 4581 int masters_place = master_th->th.th_current_place; 4582 team->t.t_first_place = first_place; 4583 team->t.t_last_place = last_place; 4584 4585 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4586 "bound to place %d partition = [%d,%d]\n", 4587 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4588 team->t.t_id, masters_place, first_place, last_place)); 4589 4590 switch (proc_bind) { 4591 4592 case proc_bind_default: 4593 // serial teams might have the proc_bind policy set to proc_bind_default. It 4594 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4595 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4596 break; 4597 4598 case proc_bind_master: { 4599 int f; 4600 int n_th = team->t.t_nproc; 4601 for (f = 1; f < n_th; f++) { 4602 kmp_info_t *th = team->t.t_threads[f]; 4603 KMP_DEBUG_ASSERT(th != NULL); 4604 th->th.th_first_place = first_place; 4605 th->th.th_last_place = last_place; 4606 th->th.th_new_place = masters_place; 4607 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4608 team->t.t_display_affinity != 1) { 4609 team->t.t_display_affinity = 1; 4610 } 4611 4612 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4613 "partition = [%d,%d]\n", 4614 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4615 f, masters_place, first_place, last_place)); 4616 } 4617 } break; 4618 4619 case proc_bind_close: { 4620 int f; 4621 int n_th = team->t.t_nproc; 4622 int n_places; 4623 if (first_place <= last_place) { 4624 n_places = last_place - first_place + 1; 4625 } else { 4626 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4627 } 4628 if (n_th <= n_places) { 4629 int place = masters_place; 4630 for (f = 1; f < n_th; f++) { 4631 kmp_info_t *th = team->t.t_threads[f]; 4632 KMP_DEBUG_ASSERT(th != NULL); 4633 4634 if (place == last_place) { 4635 place = first_place; 4636 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4637 place = 0; 4638 } else { 4639 place++; 4640 } 4641 th->th.th_first_place = first_place; 4642 th->th.th_last_place = last_place; 4643 th->th.th_new_place = place; 4644 if (__kmp_display_affinity && place != th->th.th_current_place && 4645 team->t.t_display_affinity != 1) { 4646 team->t.t_display_affinity = 1; 4647 } 4648 4649 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4650 "partition = [%d,%d]\n", 4651 __kmp_gtid_from_thread(team->t.t_threads[f]), 4652 team->t.t_id, f, place, first_place, last_place)); 4653 } 4654 } else { 4655 int S, rem, gap, s_count; 4656 S = n_th / n_places; 4657 s_count = 0; 4658 rem = n_th - (S * n_places); 4659 gap = rem > 0 ? 
n_places / rem : n_places; 4660 int place = masters_place; 4661 int gap_ct = gap; 4662 for (f = 0; f < n_th; f++) { 4663 kmp_info_t *th = team->t.t_threads[f]; 4664 KMP_DEBUG_ASSERT(th != NULL); 4665 4666 th->th.th_first_place = first_place; 4667 th->th.th_last_place = last_place; 4668 th->th.th_new_place = place; 4669 if (__kmp_display_affinity && place != th->th.th_current_place && 4670 team->t.t_display_affinity != 1) { 4671 team->t.t_display_affinity = 1; 4672 } 4673 s_count++; 4674 4675 if ((s_count == S) && rem && (gap_ct == gap)) { 4676 // do nothing, add an extra thread to place on next iteration 4677 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4678 // we added an extra thread to this place; move to next place 4679 if (place == last_place) { 4680 place = first_place; 4681 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4682 place = 0; 4683 } else { 4684 place++; 4685 } 4686 s_count = 0; 4687 gap_ct = 1; 4688 rem--; 4689 } else if (s_count == S) { // place full; don't add extra 4690 if (place == last_place) { 4691 place = first_place; 4692 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4693 place = 0; 4694 } else { 4695 place++; 4696 } 4697 gap_ct++; 4698 s_count = 0; 4699 } 4700 4701 KA_TRACE(100, 4702 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4703 "partition = [%d,%d]\n", 4704 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4705 th->th.th_new_place, first_place, last_place)); 4706 } 4707 KMP_DEBUG_ASSERT(place == masters_place); 4708 } 4709 } break; 4710 4711 case proc_bind_spread: { 4712 int f; 4713 int n_th = team->t.t_nproc; 4714 int n_places; 4715 int thidx; 4716 if (first_place <= last_place) { 4717 n_places = last_place - first_place + 1; 4718 } else { 4719 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4720 } 4721 if (n_th <= n_places) { 4722 int place = -1; 4723 4724 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4725 int S = n_places / n_th; 4726 int s_count, rem, gap, gap_ct; 4727 4728 place = masters_place; 4729 rem = n_places - n_th * S; 4730 gap = rem ? 
n_th / rem : 1; 4731 gap_ct = gap; 4732 thidx = n_th; 4733 if (update_master_only == 1) 4734 thidx = 1; 4735 for (f = 0; f < thidx; f++) { 4736 kmp_info_t *th = team->t.t_threads[f]; 4737 KMP_DEBUG_ASSERT(th != NULL); 4738 4739 th->th.th_first_place = place; 4740 th->th.th_new_place = place; 4741 if (__kmp_display_affinity && place != th->th.th_current_place && 4742 team->t.t_display_affinity != 1) { 4743 team->t.t_display_affinity = 1; 4744 } 4745 s_count = 1; 4746 while (s_count < S) { 4747 if (place == last_place) { 4748 place = first_place; 4749 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4750 place = 0; 4751 } else { 4752 place++; 4753 } 4754 s_count++; 4755 } 4756 if (rem && (gap_ct == gap)) { 4757 if (place == last_place) { 4758 place = first_place; 4759 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4760 place = 0; 4761 } else { 4762 place++; 4763 } 4764 rem--; 4765 gap_ct = 0; 4766 } 4767 th->th.th_last_place = place; 4768 gap_ct++; 4769 4770 if (place == last_place) { 4771 place = first_place; 4772 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4773 place = 0; 4774 } else { 4775 place++; 4776 } 4777 4778 KA_TRACE(100, 4779 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4780 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4781 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4782 f, th->th.th_new_place, th->th.th_first_place, 4783 th->th.th_last_place, __kmp_affinity_num_masks)); 4784 } 4785 } else { 4786 /* Having uniform space of available computation places I can create 4787 T partitions of round(P/T) size and put threads into the first 4788 place of each partition. */ 4789 double current = static_cast<double>(masters_place); 4790 double spacing = 4791 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4792 int first, last; 4793 kmp_info_t *th; 4794 4795 thidx = n_th + 1; 4796 if (update_master_only == 1) 4797 thidx = 1; 4798 for (f = 0; f < thidx; f++) { 4799 first = static_cast<int>(current); 4800 last = static_cast<int>(current + spacing) - 1; 4801 KMP_DEBUG_ASSERT(last >= first); 4802 if (first >= n_places) { 4803 if (masters_place) { 4804 first -= n_places; 4805 last -= n_places; 4806 if (first == (masters_place + 1)) { 4807 KMP_DEBUG_ASSERT(f == n_th); 4808 first--; 4809 } 4810 if (last == masters_place) { 4811 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4812 last--; 4813 } 4814 } else { 4815 KMP_DEBUG_ASSERT(f == n_th); 4816 first = 0; 4817 last = 0; 4818 } 4819 } 4820 if (last >= n_places) { 4821 last = (n_places - 1); 4822 } 4823 place = first; 4824 current += spacing; 4825 if (f < n_th) { 4826 KMP_DEBUG_ASSERT(0 <= first); 4827 KMP_DEBUG_ASSERT(n_places > first); 4828 KMP_DEBUG_ASSERT(0 <= last); 4829 KMP_DEBUG_ASSERT(n_places > last); 4830 KMP_DEBUG_ASSERT(last_place >= first_place); 4831 th = team->t.t_threads[f]; 4832 KMP_DEBUG_ASSERT(th); 4833 th->th.th_first_place = first; 4834 th->th.th_new_place = place; 4835 th->th.th_last_place = last; 4836 if (__kmp_display_affinity && place != th->th.th_current_place && 4837 team->t.t_display_affinity != 1) { 4838 team->t.t_display_affinity = 1; 4839 } 4840 KA_TRACE(100, 4841 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4842 "partition = [%d,%d], spacing = %.4f\n", 4843 __kmp_gtid_from_thread(team->t.t_threads[f]), 4844 team->t.t_id, f, th->th.th_new_place, 4845 th->th.th_first_place, th->th.th_last_place, spacing)); 4846 } 4847 } 4848 } 4849 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4850 } else { 4851 int S, rem, gap, 
s_count; 4852 S = n_th / n_places; 4853 s_count = 0; 4854 rem = n_th - (S * n_places); 4855 gap = rem > 0 ? n_places / rem : n_places; 4856 int place = masters_place; 4857 int gap_ct = gap; 4858 thidx = n_th; 4859 if (update_master_only == 1) 4860 thidx = 1; 4861 for (f = 0; f < thidx; f++) { 4862 kmp_info_t *th = team->t.t_threads[f]; 4863 KMP_DEBUG_ASSERT(th != NULL); 4864 4865 th->th.th_first_place = place; 4866 th->th.th_last_place = place; 4867 th->th.th_new_place = place; 4868 if (__kmp_display_affinity && place != th->th.th_current_place && 4869 team->t.t_display_affinity != 1) { 4870 team->t.t_display_affinity = 1; 4871 } 4872 s_count++; 4873 4874 if ((s_count == S) && rem && (gap_ct == gap)) { 4875 // do nothing, add an extra thread to place on next iteration 4876 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4877 // we added an extra thread to this place; move on to next place 4878 if (place == last_place) { 4879 place = first_place; 4880 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4881 place = 0; 4882 } else { 4883 place++; 4884 } 4885 s_count = 0; 4886 gap_ct = 1; 4887 rem--; 4888 } else if (s_count == S) { // place is full; don't add extra thread 4889 if (place == last_place) { 4890 place = first_place; 4891 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4892 place = 0; 4893 } else { 4894 place++; 4895 } 4896 gap_ct++; 4897 s_count = 0; 4898 } 4899 4900 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4901 "partition = [%d,%d]\n", 4902 __kmp_gtid_from_thread(team->t.t_threads[f]), 4903 team->t.t_id, f, th->th.th_new_place, 4904 th->th.th_first_place, th->th.th_last_place)); 4905 } 4906 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4907 } 4908 } break; 4909 4910 default: 4911 break; 4912 } 4913 4914 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4915 } 4916 4917 #endif // KMP_AFFINITY_SUPPORTED 4918 4919 /* allocate a new team data structure to use. take one off of the free pool if 4920 available */ 4921 kmp_team_t * 4922 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4923 #if OMPT_SUPPORT 4924 ompt_data_t ompt_parallel_data, 4925 #endif 4926 kmp_proc_bind_t new_proc_bind, 4927 kmp_internal_control_t *new_icvs, 4928 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4929 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4930 int f; 4931 kmp_team_t *team; 4932 int use_hot_team = !root->r.r_active; 4933 int level = 0; 4934 4935 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4936 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4937 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4938 KMP_MB(); 4939 4940 #if KMP_NESTED_HOT_TEAMS 4941 kmp_hot_team_ptr_t *hot_teams; 4942 if (master) { 4943 team = master->th.th_team; 4944 level = team->t.t_active_level; 4945 if (master->th.th_teams_microtask) { // in teams construct? 
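      // Roughly: hot teams are kept per nesting level, so `level` must index
      // the construct that is actually being forked.  The check below bumps
      // `level` only when #teams > 1 AND this is either the inner fork of the
      // teams construct (t_pkfn == __kmp_teams_master) or a parallel region
      // nested inside the teams region; with #teams == 1, or for the outer
      // fork of the teams construct, `level` is left unchanged.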
4946 if (master->th.th_teams_size.nteams > 1 && 4947 ( // #teams > 1 4948 team->t.t_pkfn == 4949 (microtask_t)__kmp_teams_master || // inner fork of the teams 4950 master->th.th_teams_level < 4951 team->t.t_level)) { // or nested parallel inside the teams 4952 ++level; // not increment if #teams==1, or for outer fork of the teams; 4953 // increment otherwise 4954 } 4955 } 4956 hot_teams = master->th.th_hot_teams; 4957 if (level < __kmp_hot_teams_max_level && hot_teams && 4958 hot_teams[level].hot_team) { 4959 // hot team has already been allocated for given level 4960 use_hot_team = 1; 4961 } else { 4962 use_hot_team = 0; 4963 } 4964 } else { 4965 // check we won't access uninitialized hot_teams, just in case 4966 KMP_DEBUG_ASSERT(new_nproc == 1); 4967 } 4968 #endif 4969 // Optimization to use a "hot" team 4970 if (use_hot_team && new_nproc > 1) { 4971 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4972 #if KMP_NESTED_HOT_TEAMS 4973 team = hot_teams[level].hot_team; 4974 #else 4975 team = root->r.r_hot_team; 4976 #endif 4977 #if KMP_DEBUG 4978 if (__kmp_tasking_mode != tskm_immediate_exec) { 4979 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4980 "task_team[1] = %p before reinit\n", 4981 team->t.t_task_team[0], team->t.t_task_team[1])); 4982 } 4983 #endif 4984 4985 // Has the number of threads changed? 4986 /* Let's assume the most common case is that the number of threads is 4987 unchanged, and put that case first. */ 4988 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4989 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4990 // This case can mean that omp_set_num_threads() was called and the hot 4991 // team size was already reduced, so we check the special flag 4992 if (team->t.t_size_changed == -1) { 4993 team->t.t_size_changed = 1; 4994 } else { 4995 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4996 } 4997 4998 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4999 kmp_r_sched_t new_sched = new_icvs->sched; 5000 // set master's schedule as new run-time schedule 5001 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5002 5003 __kmp_reinitialize_team(team, new_icvs, 5004 root->r.r_uber_thread->th.th_ident); 5005 5006 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5007 team->t.t_threads[0], team)); 5008 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5009 5010 #if KMP_AFFINITY_SUPPORTED 5011 if ((team->t.t_size_changed == 0) && 5012 (team->t.t_proc_bind == new_proc_bind)) { 5013 if (new_proc_bind == proc_bind_spread) { 5014 __kmp_partition_places( 5015 team, 1); // add flag to update only master for spread 5016 } 5017 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5018 "proc_bind = %d, partition = [%d,%d]\n", 5019 team->t.t_id, new_proc_bind, team->t.t_first_place, 5020 team->t.t_last_place)); 5021 } else { 5022 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5023 __kmp_partition_places(team); 5024 } 5025 #else 5026 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5027 #endif /* KMP_AFFINITY_SUPPORTED */ 5028 } else if (team->t.t_nproc > new_nproc) { 5029 KA_TRACE(20, 5030 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5031 new_nproc)); 5032 5033 team->t.t_size_changed = 1; 5034 #if KMP_NESTED_HOT_TEAMS 5035 if (__kmp_hot_teams_mode == 0) { 5036 // AC: saved number of threads should correspond to team's value in this 5037 // mode, can be bigger in mode 1, when hot team has threads in reserve 5038 
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5039 hot_teams[level].hot_team_nth = new_nproc; 5040 #endif // KMP_NESTED_HOT_TEAMS 5041 /* release the extra threads we don't need any more */ 5042 for (f = new_nproc; f < team->t.t_nproc; f++) { 5043 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5044 if (__kmp_tasking_mode != tskm_immediate_exec) { 5045 // When decreasing team size, threads no longer in the team should 5046 // unref task team. 5047 team->t.t_threads[f]->th.th_task_team = NULL; 5048 } 5049 __kmp_free_thread(team->t.t_threads[f]); 5050 team->t.t_threads[f] = NULL; 5051 } 5052 #if KMP_NESTED_HOT_TEAMS 5053 } // (__kmp_hot_teams_mode == 0) 5054 else { 5055 // When keeping extra threads in team, switch threads to wait on own 5056 // b_go flag 5057 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5058 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5059 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5060 for (int b = 0; b < bs_last_barrier; ++b) { 5061 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5062 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5063 } 5064 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5065 } 5066 } 5067 } 5068 #endif // KMP_NESTED_HOT_TEAMS 5069 team->t.t_nproc = new_nproc; 5070 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5071 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5072 __kmp_reinitialize_team(team, new_icvs, 5073 root->r.r_uber_thread->th.th_ident); 5074 5075 // Update remaining threads 5076 for (f = 0; f < new_nproc; ++f) { 5077 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5078 } 5079 5080 // restore the current task state of the master thread: should be the 5081 // implicit task 5082 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5083 team->t.t_threads[0], team)); 5084 5085 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5086 5087 #ifdef KMP_DEBUG 5088 for (f = 0; f < team->t.t_nproc; f++) { 5089 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5090 team->t.t_threads[f]->th.th_team_nproc == 5091 team->t.t_nproc); 5092 } 5093 #endif 5094 5095 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5096 #if KMP_AFFINITY_SUPPORTED 5097 __kmp_partition_places(team); 5098 #endif 5099 } else { // team->t.t_nproc < new_nproc 5100 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5101 kmp_affin_mask_t *old_mask; 5102 if (KMP_AFFINITY_CAPABLE()) { 5103 KMP_CPU_ALLOC(old_mask); 5104 } 5105 #endif 5106 5107 KA_TRACE(20, 5108 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5109 new_nproc)); 5110 5111 team->t.t_size_changed = 1; 5112 5113 #if KMP_NESTED_HOT_TEAMS 5114 int avail_threads = hot_teams[level].hot_team_nth; 5115 if (new_nproc < avail_threads) 5116 avail_threads = new_nproc; 5117 kmp_info_t **other_threads = team->t.t_threads; 5118 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5119 // Adjust barrier data of reserved threads (if any) of the team 5120 // Other data will be set in __kmp_initialize_info() below. 
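        // That is, copy the team's current b_arrived counters into each
        // reserved thread's per-barrier state, so a thread that sat out the
        // previous region(s) does not appear out of step at the next barrier.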
5121 int b; 5122 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5123 for (b = 0; b < bs_last_barrier; ++b) { 5124 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5125 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5126 #if USE_DEBUGGER 5127 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5128 #endif 5129 } 5130 } 5131 if (hot_teams[level].hot_team_nth >= new_nproc) { 5132 // we have all needed threads in reserve, no need to allocate any 5133 // this only possible in mode 1, cannot have reserved threads in mode 0 5134 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5135 team->t.t_nproc = new_nproc; // just get reserved threads involved 5136 } else { 5137 // we may have some threads in reserve, but not enough 5138 team->t.t_nproc = 5139 hot_teams[level] 5140 .hot_team_nth; // get reserved threads involved if any 5141 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5142 #endif // KMP_NESTED_HOT_TEAMS 5143 if (team->t.t_max_nproc < new_nproc) { 5144 /* reallocate larger arrays */ 5145 __kmp_reallocate_team_arrays(team, new_nproc); 5146 __kmp_reinitialize_team(team, new_icvs, NULL); 5147 } 5148 5149 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5150 /* Temporarily set full mask for master thread before creation of 5151 workers. The reason is that workers inherit the affinity from master, 5152 so if a lot of workers are created on the single core quickly, they 5153 don't get a chance to set their own affinity for a long time. */ 5154 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5155 #endif 5156 5157 /* allocate new threads for the hot team */ 5158 for (f = team->t.t_nproc; f < new_nproc; f++) { 5159 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5160 KMP_DEBUG_ASSERT(new_worker); 5161 team->t.t_threads[f] = new_worker; 5162 5163 KA_TRACE(20, 5164 ("__kmp_allocate_team: team %d init T#%d arrived: " 5165 "join=%llu, plain=%llu\n", 5166 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5167 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5168 team->t.t_bar[bs_plain_barrier].b_arrived)); 5169 5170 { // Initialize barrier data for new threads. 5171 int b; 5172 kmp_balign_t *balign = new_worker->th.th_bar; 5173 for (b = 0; b < bs_last_barrier; ++b) { 5174 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5175 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5176 KMP_BARRIER_PARENT_FLAG); 5177 #if USE_DEBUGGER 5178 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5179 #endif 5180 } 5181 } 5182 } 5183 5184 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5185 if (KMP_AFFINITY_CAPABLE()) { 5186 /* Restore initial master thread's affinity mask */ 5187 __kmp_set_system_affinity(old_mask, TRUE); 5188 KMP_CPU_FREE(old_mask); 5189 } 5190 #endif 5191 #if KMP_NESTED_HOT_TEAMS 5192 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5193 #endif // KMP_NESTED_HOT_TEAMS 5194 /* make sure everyone is syncronized */ 5195 int old_nproc = team->t.t_nproc; // save old value and use to update only 5196 // new threads below 5197 __kmp_initialize_team(team, new_nproc, new_icvs, 5198 root->r.r_uber_thread->th.th_ident); 5199 5200 /* reinitialize the threads */ 5201 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5202 for (f = 0; f < team->t.t_nproc; ++f) 5203 __kmp_initialize_info(team->t.t_threads[f], team, f, 5204 __kmp_gtid_from_tid(f, team)); 5205 5206 if (level) { // set th_task_state for new threads in nested hot team 5207 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5208 // only need to set the th_task_state for the new threads. th_task_state 5209 // for master thread will not be accurate until after this in 5210 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5211 // correct value. 5212 for (f = old_nproc; f < team->t.t_nproc; ++f) 5213 team->t.t_threads[f]->th.th_task_state = 5214 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5215 } else { // set th_task_state for new threads in non-nested hot team 5216 kmp_uint8 old_state = 5217 team->t.t_threads[0]->th.th_task_state; // copy master's state 5218 for (f = old_nproc; f < team->t.t_nproc; ++f) 5219 team->t.t_threads[f]->th.th_task_state = old_state; 5220 } 5221 5222 #ifdef KMP_DEBUG 5223 for (f = 0; f < team->t.t_nproc; ++f) { 5224 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5225 team->t.t_threads[f]->th.th_team_nproc == 5226 team->t.t_nproc); 5227 } 5228 #endif 5229 5230 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5231 #if KMP_AFFINITY_SUPPORTED 5232 __kmp_partition_places(team); 5233 #endif 5234 } // Check changes in number of threads 5235 5236 kmp_info_t *master = team->t.t_threads[0]; 5237 if (master->th.th_teams_microtask) { 5238 for (f = 1; f < new_nproc; ++f) { 5239 // propagate teams construct specific info to workers 5240 kmp_info_t *thr = team->t.t_threads[f]; 5241 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5242 thr->th.th_teams_level = master->th.th_teams_level; 5243 thr->th.th_teams_size = master->th.th_teams_size; 5244 } 5245 } 5246 #if KMP_NESTED_HOT_TEAMS 5247 if (level) { 5248 // Sync barrier state for nested hot teams, not needed for outermost hot 5249 // team. 5250 for (f = 1; f < new_nproc; ++f) { 5251 kmp_info_t *thr = team->t.t_threads[f]; 5252 int b; 5253 kmp_balign_t *balign = thr->th.th_bar; 5254 for (b = 0; b < bs_last_barrier; ++b) { 5255 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5256 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5257 #if USE_DEBUGGER 5258 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5259 #endif 5260 } 5261 } 5262 } 5263 #endif // KMP_NESTED_HOT_TEAMS 5264 5265 /* reallocate space for arguments if necessary */ 5266 __kmp_alloc_argv_entries(argc, team, TRUE); 5267 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5268 // The hot team re-uses the previous task team, 5269 // if untouched during the previous release->gather phase. 
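  // Note: unlike the team-pool and fresh-allocation paths further below, the
  // hot-team path deliberately leaves t_task_team[0] / t_task_team[1] alone
  // here, so they can be re-used if they were untouched during the previous
  // release->gather phase (cf. the task_team trace just below).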
5270 5271 KF_TRACE(10, (" hot_team = %p\n", team)); 5272 5273 #if KMP_DEBUG 5274 if (__kmp_tasking_mode != tskm_immediate_exec) { 5275 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5276 "task_team[1] = %p after reinit\n", 5277 team->t.t_task_team[0], team->t.t_task_team[1])); 5278 } 5279 #endif 5280 5281 #if OMPT_SUPPORT 5282 __ompt_team_assign_id(team, ompt_parallel_data); 5283 #endif 5284 5285 KMP_MB(); 5286 5287 return team; 5288 } 5289 5290 /* next, let's try to take one from the team pool */ 5291 KMP_MB(); 5292 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5293 /* TODO: consider resizing undersized teams instead of reaping them, now 5294 that we have a resizing mechanism */ 5295 if (team->t.t_max_nproc >= max_nproc) { 5296 /* take this team from the team pool */ 5297 __kmp_team_pool = team->t.t_next_pool; 5298 5299 /* setup the team for fresh use */ 5300 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5301 5302 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5303 "task_team[1] %p to NULL\n", 5304 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5305 team->t.t_task_team[0] = NULL; 5306 team->t.t_task_team[1] = NULL; 5307 5308 /* reallocate space for arguments if necessary */ 5309 __kmp_alloc_argv_entries(argc, team, TRUE); 5310 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5311 5312 KA_TRACE( 5313 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5314 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5315 { // Initialize barrier data. 5316 int b; 5317 for (b = 0; b < bs_last_barrier; ++b) { 5318 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5319 #if USE_DEBUGGER 5320 team->t.t_bar[b].b_master_arrived = 0; 5321 team->t.t_bar[b].b_team_arrived = 0; 5322 #endif 5323 } 5324 } 5325 5326 team->t.t_proc_bind = new_proc_bind; 5327 5328 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5329 team->t.t_id)); 5330 5331 #if OMPT_SUPPORT 5332 __ompt_team_assign_id(team, ompt_parallel_data); 5333 #endif 5334 5335 KMP_MB(); 5336 5337 return team; 5338 } 5339 5340 /* reap team if it is too small, then loop back and check the next one */ 5341 // not sure if this is wise, but, will be redone during the hot-teams 5342 // rewrite. 5343 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5344 team = __kmp_reap_team(team); 5345 __kmp_team_pool = team; 5346 } 5347 5348 /* nothing available in the pool, no matter, make a new team! 
*/ 5349 KMP_MB(); 5350 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5351 5352 /* and set it up */ 5353 team->t.t_max_nproc = max_nproc; 5354 /* NOTE well, for some reason allocating one big buffer and dividing it up 5355 seems to really hurt performance a lot on the P4, so, let's not use this */ 5356 __kmp_allocate_team_arrays(team, max_nproc); 5357 5358 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5359 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5360 5361 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5362 "%p to NULL\n", 5363 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5364 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5365 // memory, no need to duplicate 5366 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5367 // memory, no need to duplicate 5368 5369 if (__kmp_storage_map) { 5370 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5371 } 5372 5373 /* allocate space for arguments */ 5374 __kmp_alloc_argv_entries(argc, team, FALSE); 5375 team->t.t_argc = argc; 5376 5377 KA_TRACE(20, 5378 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5379 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5380 { // Initialize barrier data. 5381 int b; 5382 for (b = 0; b < bs_last_barrier; ++b) { 5383 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5384 #if USE_DEBUGGER 5385 team->t.t_bar[b].b_master_arrived = 0; 5386 team->t.t_bar[b].b_team_arrived = 0; 5387 #endif 5388 } 5389 } 5390 5391 team->t.t_proc_bind = new_proc_bind; 5392 5393 #if OMPT_SUPPORT 5394 __ompt_team_assign_id(team, ompt_parallel_data); 5395 team->t.ompt_serialized_team_info = NULL; 5396 #endif 5397 5398 KMP_MB(); 5399 5400 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5401 team->t.t_id)); 5402 5403 return team; 5404 } 5405 5406 /* TODO implement hot-teams at all levels */ 5407 /* TODO implement lazy thread release on demand (disband request) */ 5408 5409 /* free the team. return it to the team pool. release all the threads 5410 * associated with it */ 5411 void __kmp_free_team(kmp_root_t *root, 5412 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5413 int f; 5414 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5415 team->t.t_id)); 5416 5417 /* verify state */ 5418 KMP_DEBUG_ASSERT(root); 5419 KMP_DEBUG_ASSERT(team); 5420 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5421 KMP_DEBUG_ASSERT(team->t.t_threads); 5422 5423 int use_hot_team = team == root->r.r_hot_team; 5424 #if KMP_NESTED_HOT_TEAMS 5425 int level; 5426 kmp_hot_team_ptr_t *hot_teams; 5427 if (master) { 5428 level = team->t.t_active_level - 1; 5429 if (master->th.th_teams_microtask) { // in teams construct? 
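    // Counterpart of the level adjustment in __kmp_allocate_team: starting
    // from t_active_level - 1, bump `level` for the team of masters when
    // #teams > 1, and again for the team of workers before the nested
    // parallel, so that hot_teams[level] below names the same slot this team
    // was taken from (checked by the KMP_DEBUG_ASSERT that follows).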
5430 if (master->th.th_teams_size.nteams > 1) { 5431 ++level; // level was not increased in teams construct for 5432 // team_of_masters 5433 } 5434 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5435 master->th.th_teams_level == team->t.t_level) { 5436 ++level; // level was not increased in teams construct for 5437 // team_of_workers before the parallel 5438 } // team->t.t_level will be increased inside parallel 5439 } 5440 hot_teams = master->th.th_hot_teams; 5441 if (level < __kmp_hot_teams_max_level) { 5442 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5443 use_hot_team = 1; 5444 } 5445 } 5446 #endif // KMP_NESTED_HOT_TEAMS 5447 5448 /* team is done working */ 5449 TCW_SYNC_PTR(team->t.t_pkfn, 5450 NULL); // Important for Debugging Support Library. 5451 #if KMP_OS_WINDOWS 5452 team->t.t_copyin_counter = 0; // init counter for possible reuse 5453 #endif 5454 // Do not reset pointer to parent team to NULL for hot teams. 5455 5456 /* if we are non-hot team, release our threads */ 5457 if (!use_hot_team) { 5458 if (__kmp_tasking_mode != tskm_immediate_exec) { 5459 // Wait for threads to reach reapable state 5460 for (f = 1; f < team->t.t_nproc; ++f) { 5461 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5462 kmp_info_t *th = team->t.t_threads[f]; 5463 volatile kmp_uint32 *state = &th->th.th_reap_state; 5464 while (*state != KMP_SAFE_TO_REAP) { 5465 #if KMP_OS_WINDOWS 5466 // On Windows a thread can be killed at any time, check this 5467 DWORD ecode; 5468 if (!__kmp_is_thread_alive(th, &ecode)) { 5469 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5470 break; 5471 } 5472 #endif 5473 // first check if thread is sleeping 5474 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5475 if (fl.is_sleeping()) 5476 fl.resume(__kmp_gtid_from_thread(th)); 5477 KMP_CPU_PAUSE(); 5478 } 5479 } 5480 5481 // Delete task teams 5482 int tt_idx; 5483 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5484 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5485 if (task_team != NULL) { 5486 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5487 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5488 team->t.t_threads[f]->th.th_task_team = NULL; 5489 } 5490 KA_TRACE( 5491 20, 5492 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5493 __kmp_get_gtid(), task_team, team->t.t_id)); 5494 #if KMP_NESTED_HOT_TEAMS 5495 __kmp_free_task_team(master, task_team); 5496 #endif 5497 team->t.t_task_team[tt_idx] = NULL; 5498 } 5499 } 5500 } 5501 5502 // Reset pointer to parent team only for non-hot teams. 
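    // (Hot teams, by contrast, keep t_parent, t_level and t_active_level,
    // presumably so they can be picked up again unchanged by the same root.)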
5503 team->t.t_parent = NULL; 5504 team->t.t_level = 0; 5505 team->t.t_active_level = 0; 5506 5507 /* free the worker threads */ 5508 for (f = 1; f < team->t.t_nproc; ++f) { 5509 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5510 __kmp_free_thread(team->t.t_threads[f]); 5511 team->t.t_threads[f] = NULL; 5512 } 5513 5514 /* put the team back in the team pool */ 5515 /* TODO limit size of team pool, call reap_team if pool too large */ 5516 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5517 __kmp_team_pool = (volatile kmp_team_t *)team; 5518 } else { // Check if team was created for the masters in a teams construct 5519 // See if first worker is a CG root 5520 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5521 team->t.t_threads[1]->th.th_cg_roots); 5522 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5523 // Clean up the CG root nodes on workers so that this team can be re-used 5524 for (f = 1; f < team->t.t_nproc; ++f) { 5525 kmp_info_t *thr = team->t.t_threads[f]; 5526 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5527 thr->th.th_cg_roots->cg_root == thr); 5528 // Pop current CG root off list 5529 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5530 thr->th.th_cg_roots = tmp->up; 5531 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5532 " up to node %p. cg_nthreads was %d\n", 5533 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5534 int i = tmp->cg_nthreads--; 5535 if (i == 1) { 5536 __kmp_free(tmp); // free CG if we are the last thread in it 5537 } 5538 // Restore current task's thread_limit from CG root 5539 if (thr->th.th_cg_roots) 5540 thr->th.th_current_task->td_icvs.thread_limit = 5541 thr->th.th_cg_roots->cg_thread_limit; 5542 } 5543 } 5544 } 5545 5546 KMP_MB(); 5547 } 5548 5549 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5550 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5551 kmp_team_t *next_pool = team->t.t_next_pool; 5552 5553 KMP_DEBUG_ASSERT(team); 5554 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5555 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5556 KMP_DEBUG_ASSERT(team->t.t_threads); 5557 KMP_DEBUG_ASSERT(team->t.t_argv); 5558 5559 /* TODO clean the threads that are a part of this? */ 5560 5561 /* free stuff */ 5562 __kmp_free_team_arrays(team); 5563 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5564 __kmp_free((void *)team->t.t_argv); 5565 __kmp_free(team); 5566 5567 KMP_MB(); 5568 return next_pool; 5569 } 5570 5571 // Free the thread. Don't reap it, just place it on the pool of available 5572 // threads. 5573 // 5574 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5575 // binding for the affinity mechanism to be useful. 5576 // 5577 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5578 // However, we want to avoid a potential performance problem by always 5579 // scanning through the list to find the correct point at which to insert 5580 // the thread (potential N**2 behavior). To do this we keep track of the 5581 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5582 // With single-level parallelism, threads will always be added to the tail 5583 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5584 // parallelism, all bets are off and we may need to scan through the entire 5585 // free list. 5586 // 5587 // This change also has a potentially large performance benefit, for some 5588 // applications. 
Previously, as threads were freed from the hot team, they 5589 // would be placed back on the free list in inverse order. If the hot team 5590 // grew back to it's original size, then the freed thread would be placed 5591 // back on the hot team in reverse order. This could cause bad cache 5592 // locality problems on programs where the size of the hot team regularly 5593 // grew and shrunk. 5594 // 5595 // Now, for single-level parallelism, the OMP tid is always == gtid. 5596 void __kmp_free_thread(kmp_info_t *this_th) { 5597 int gtid; 5598 kmp_info_t **scan; 5599 5600 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5601 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5602 5603 KMP_DEBUG_ASSERT(this_th); 5604 5605 // When moving thread to pool, switch thread to wait on own b_go flag, and 5606 // uninitialized (NULL team). 5607 int b; 5608 kmp_balign_t *balign = this_th->th.th_bar; 5609 for (b = 0; b < bs_last_barrier; ++b) { 5610 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5611 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5612 balign[b].bb.team = NULL; 5613 balign[b].bb.leaf_kids = 0; 5614 } 5615 this_th->th.th_task_state = 0; 5616 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5617 5618 /* put thread back on the free pool */ 5619 TCW_PTR(this_th->th.th_team, NULL); 5620 TCW_PTR(this_th->th.th_root, NULL); 5621 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5622 5623 while (this_th->th.th_cg_roots) { 5624 this_th->th.th_cg_roots->cg_nthreads--; 5625 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5626 " %p of thread %p to %d\n", 5627 this_th, this_th->th.th_cg_roots, 5628 this_th->th.th_cg_roots->cg_root, 5629 this_th->th.th_cg_roots->cg_nthreads)); 5630 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5631 if (tmp->cg_root == this_th) { // Thread is a cg_root 5632 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5633 KA_TRACE( 5634 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5635 this_th->th.th_cg_roots = tmp->up; 5636 __kmp_free(tmp); 5637 } else { // Worker thread 5638 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5639 __kmp_free(tmp); 5640 } 5641 this_th->th.th_cg_roots = NULL; 5642 break; 5643 } 5644 } 5645 5646 /* If the implicit task assigned to this thread can be used by other threads 5647 * -> multiple threads can share the data and try to free the task at 5648 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5649 * with higher probability when hot team is disabled but can occurs even when 5650 * the hot team is enabled */ 5651 __kmp_free_implicit_task(this_th); 5652 this_th->th.th_current_task = NULL; 5653 5654 // If the __kmp_thread_pool_insert_pt is already past the new insert 5655 // point, then we need to re-scan the entire list. 5656 gtid = this_th->th.th_info.ds.ds_gtid; 5657 if (__kmp_thread_pool_insert_pt != NULL) { 5658 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5659 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5660 __kmp_thread_pool_insert_pt = NULL; 5661 } 5662 } 5663 5664 // Scan down the list to find the place to insert the thread. 5665 // scan is the address of a link in the list, possibly the address of 5666 // __kmp_thread_pool itself. 5667 // 5668 // In the absence of nested parallelism, the for loop will have 0 iterations. 
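// Worked example (illustrative): suppose the pool holds gtids {2, 3, 5, 8},
// __kmp_thread_pool_insert_pt points at 5, and we free gtid 6.  The insert
// point's gtid (5) is not greater than 6, so the scan below starts at 5's
// th_next_pool link, stops at 8 (the first gtid >= 6), and the thread is
// spliced in to give {2, 3, 5, 6, 8}, with the insert point updated to 6.
// Freeing gtid 4 instead would first reset the insert point to NULL (5 > 4)
// and re-scan from the head of the pool.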
5669 if (__kmp_thread_pool_insert_pt != NULL) { 5670 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5671 } else { 5672 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5673 } 5674 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5675 scan = &((*scan)->th.th_next_pool)) 5676 ; 5677 5678 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5679 // to its address. 5680 TCW_PTR(this_th->th.th_next_pool, *scan); 5681 __kmp_thread_pool_insert_pt = *scan = this_th; 5682 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5683 (this_th->th.th_info.ds.ds_gtid < 5684 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5685 TCW_4(this_th->th.th_in_pool, TRUE); 5686 __kmp_suspend_initialize_thread(this_th); 5687 __kmp_lock_suspend_mx(this_th); 5688 if (this_th->th.th_active == TRUE) { 5689 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5690 this_th->th.th_active_in_pool = TRUE; 5691 } 5692 #if KMP_DEBUG 5693 else { 5694 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5695 } 5696 #endif 5697 __kmp_unlock_suspend_mx(this_th); 5698 5699 TCW_4(__kmp_nth, __kmp_nth - 1); 5700 5701 #ifdef KMP_ADJUST_BLOCKTIME 5702 /* Adjust blocktime back to user setting or default if necessary */ 5703 /* Middle initialization might never have occurred */ 5704 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5705 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5706 if (__kmp_nth <= __kmp_avail_proc) { 5707 __kmp_zero_bt = FALSE; 5708 } 5709 } 5710 #endif /* KMP_ADJUST_BLOCKTIME */ 5711 5712 KMP_MB(); 5713 } 5714 5715 /* ------------------------------------------------------------------------ */ 5716 5717 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5718 #if OMP_PROFILING_SUPPORT 5719 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5720 // TODO: add a configuration option for time granularity 5721 if (ProfileTraceFile) 5722 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5723 #endif 5724 5725 int gtid = this_thr->th.th_info.ds.ds_gtid; 5726 /* void *stack_data;*/ 5727 kmp_team_t **volatile pteam; 5728 5729 KMP_MB(); 5730 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5731 5732 if (__kmp_env_consistency_check) { 5733 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5734 } 5735 5736 #if OMPT_SUPPORT 5737 ompt_data_t *thread_data; 5738 if (ompt_enabled.enabled) { 5739 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5740 *thread_data = ompt_data_none; 5741 5742 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5743 this_thr->th.ompt_thread_info.wait_id = 0; 5744 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5745 this_thr->th.ompt_thread_info.parallel_flags = 0; 5746 if (ompt_enabled.ompt_callback_thread_begin) { 5747 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5748 ompt_thread_worker, thread_data); 5749 } 5750 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5751 } 5752 #endif 5753 5754 /* This is the place where threads wait for work */ 5755 while (!TCR_4(__kmp_global.g.g_done)) { 5756 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5757 KMP_MB(); 5758 5759 /* wait for work to do */ 5760 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5761 5762 /* No tid yet since not part of a team */ 5763 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5764 5765 #if OMPT_SUPPORT 5766 if (ompt_enabled.enabled) { 5767 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5768 } 5769 #endif 5770 5771 pteam = &this_thr->th.th_team; 5772 5773 /* have we been allocated? */ 5774 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5775 /* we were just woken up, so run our new task */ 5776 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5777 int rc; 5778 KA_TRACE(20, 5779 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5780 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5781 (*pteam)->t.t_pkfn)); 5782 5783 updateHWFPControl(*pteam); 5784 5785 #if OMPT_SUPPORT 5786 if (ompt_enabled.enabled) { 5787 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5788 } 5789 #endif 5790 5791 rc = (*pteam)->t.t_invoke(gtid); 5792 KMP_ASSERT(rc); 5793 5794 KMP_MB(); 5795 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5796 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5797 (*pteam)->t.t_pkfn)); 5798 } 5799 #if OMPT_SUPPORT 5800 if (ompt_enabled.enabled) { 5801 /* no frame set while outside task */ 5802 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5803 5804 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5805 } 5806 #endif 5807 /* join barrier after parallel region */ 5808 __kmp_join_barrier(gtid); 5809 } 5810 } 5811 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5812 5813 #if OMPT_SUPPORT 5814 if (ompt_enabled.ompt_callback_thread_end) { 5815 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5816 } 5817 #endif 5818 5819 this_thr->th.th_task_team = NULL; 5820 /* run the destructors for the threadprivate data for this thread */ 5821 __kmp_common_destroy_gtid(gtid); 5822 5823 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5824 KMP_MB(); 5825 5826 #if OMP_PROFILING_SUPPORT 5827 llvm::timeTraceProfilerFinishThread(); 5828 #endif 5829 return this_thr; 5830 } 5831 5832 /* ------------------------------------------------------------------------ */ 5833 5834 void __kmp_internal_end_dest(void *specific_gtid) { 5835 // Make sure no significant bits are lost 5836 int gtid; 5837 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5838 5839 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5840 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5841 * this is because 0 is reserved for the nothing-stored case */ 5842 5843 __kmp_internal_end_thread(gtid); 5844 } 5845 
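// In other words, the value kept in thread-local storage is gtid + 1, so that
// 0 remains free to mean "nothing stored".  For example, the initial thread
// (gtid 0) is stored as 1 and decoded back to 0 by the "- 1" above in
// __kmp_internal_end_dest.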
5846 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5847 5848 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5849 __kmp_internal_end_atexit(); 5850 } 5851 5852 #endif 5853 5854 /* [Windows] josh: when the atexit handler is called, there may still be more 5855 than one thread alive */ 5856 void __kmp_internal_end_atexit(void) { 5857 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5858 /* [Windows] 5859 josh: ideally, we want to completely shutdown the library in this atexit 5860 handler, but stat code that depends on thread specific data for gtid fails 5861 because that data becomes unavailable at some point during the shutdown, so 5862 we call __kmp_internal_end_thread instead. We should eventually remove the 5863 dependency on __kmp_get_specific_gtid in the stat code and use 5864 __kmp_internal_end_library to cleanly shutdown the library. 5865 5866 // TODO: Can some of this comment about GVS be removed? 5867 I suspect that the offending stat code is executed when the calling thread 5868 tries to clean up a dead root thread's data structures, resulting in GVS 5869 code trying to close the GVS structures for that thread, but since the stat 5870 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5871 the calling thread is cleaning up itself instead of another thread, it get 5872 confused. This happens because allowing a thread to unregister and cleanup 5873 another thread is a recent modification for addressing an issue. 5874 Based on the current design (20050722), a thread may end up 5875 trying to unregister another thread only if thread death does not trigger 5876 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5877 thread specific data destructor function to detect thread death. For 5878 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5879 is nothing. Thus, the workaround is applicable only for Windows static 5880 stat library. */ 5881 __kmp_internal_end_library(-1); 5882 #if KMP_OS_WINDOWS 5883 __kmp_close_console(); 5884 #endif 5885 } 5886 5887 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5888 // It is assumed __kmp_forkjoin_lock is acquired. 5889 5890 int gtid; 5891 5892 KMP_DEBUG_ASSERT(thread != NULL); 5893 5894 gtid = thread->th.th_info.ds.ds_gtid; 5895 5896 if (!is_root) { 5897 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5898 /* Assume the threads are at the fork barrier here */ 5899 KA_TRACE( 5900 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5901 gtid)); 5902 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5903 * (GEH) */ 5904 ANNOTATE_HAPPENS_BEFORE(thread); 5905 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5906 thread); 5907 __kmp_release_64(&flag); 5908 } 5909 5910 // Terminate OS thread. 5911 __kmp_reap_worker(thread); 5912 5913 // The thread was killed asynchronously. If it was actively 5914 // spinning in the thread pool, decrement the global count. 5915 // 5916 // There is a small timing hole here - if the worker thread was just waking 5917 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5918 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5919 // the global counter might not get updated. 5920 // 5921 // Currently, this can only happen as the library is unloaded, 5922 // so there are no harmful side effects. 
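    // Illustrative timeline of that hole:
    //   worker: wakes from its pool sleep, clears th_active_in_pool ...
    //   reaper: kills the worker, sees th_active_in_pool == FALSE and skips
    //           the decrement below
    //   worker: ... never reaches its own decrement of
    //           __kmp_thread_pool_active_nth
    // leaving the counter one too high; tolerated because it can only happen
    // while the library is being unloaded.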
5923 if (thread->th.th_active_in_pool) { 5924 thread->th.th_active_in_pool = FALSE; 5925 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5926 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5927 } 5928 } 5929 5930 __kmp_free_implicit_task(thread); 5931 5932 // Free the fast memory for tasking 5933 #if USE_FAST_MEMORY 5934 __kmp_free_fast_memory(thread); 5935 #endif /* USE_FAST_MEMORY */ 5936 5937 __kmp_suspend_uninitialize_thread(thread); 5938 5939 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5940 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5941 5942 --__kmp_all_nth; 5943 // __kmp_nth was decremented when thread is added to the pool. 5944 5945 #ifdef KMP_ADJUST_BLOCKTIME 5946 /* Adjust blocktime back to user setting or default if necessary */ 5947 /* Middle initialization might never have occurred */ 5948 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5949 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5950 if (__kmp_nth <= __kmp_avail_proc) { 5951 __kmp_zero_bt = FALSE; 5952 } 5953 } 5954 #endif /* KMP_ADJUST_BLOCKTIME */ 5955 5956 /* free the memory being used */ 5957 if (__kmp_env_consistency_check) { 5958 if (thread->th.th_cons) { 5959 __kmp_free_cons_stack(thread->th.th_cons); 5960 thread->th.th_cons = NULL; 5961 } 5962 } 5963 5964 if (thread->th.th_pri_common != NULL) { 5965 __kmp_free(thread->th.th_pri_common); 5966 thread->th.th_pri_common = NULL; 5967 } 5968 5969 if (thread->th.th_task_state_memo_stack != NULL) { 5970 __kmp_free(thread->th.th_task_state_memo_stack); 5971 thread->th.th_task_state_memo_stack = NULL; 5972 } 5973 5974 #if KMP_USE_BGET 5975 if (thread->th.th_local.bget_data != NULL) { 5976 __kmp_finalize_bget(thread); 5977 } 5978 #endif 5979 5980 #if KMP_AFFINITY_SUPPORTED 5981 if (thread->th.th_affin_mask != NULL) { 5982 KMP_CPU_FREE(thread->th.th_affin_mask); 5983 thread->th.th_affin_mask = NULL; 5984 } 5985 #endif /* KMP_AFFINITY_SUPPORTED */ 5986 5987 #if KMP_USE_HIER_SCHED 5988 if (thread->th.th_hier_bar_data != NULL) { 5989 __kmp_free(thread->th.th_hier_bar_data); 5990 thread->th.th_hier_bar_data = NULL; 5991 } 5992 #endif 5993 5994 __kmp_reap_team(thread->th.th_serial_team); 5995 thread->th.th_serial_team = NULL; 5996 __kmp_free(thread); 5997 5998 KMP_MB(); 5999 6000 } // __kmp_reap_thread 6001 6002 static void __kmp_internal_end(void) { 6003 int i; 6004 6005 /* First, unregister the library */ 6006 __kmp_unregister_library(); 6007 6008 #if KMP_OS_WINDOWS 6009 /* In Win static library, we can't tell when a root actually dies, so we 6010 reclaim the data structures for any root threads that have died but not 6011 unregistered themselves, in order to shut down cleanly. 6012 In Win dynamic library we also can't tell when a thread dies. */ 6013 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6014 // dead roots 6015 #endif 6016 6017 for (i = 0; i < __kmp_threads_capacity; i++) 6018 if (__kmp_root[i]) 6019 if (__kmp_root[i]->r.r_active) 6020 break; 6021 KMP_MB(); /* Flush all pending memory write invalidates. */ 6022 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6023 6024 if (i < __kmp_threads_capacity) { 6025 #if KMP_USE_MONITOR 6026 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6027 KMP_MB(); /* Flush all pending memory write invalidates. */ 6028 6029 // Need to check that monitor was initialized before reaping it. 
If we are 6030 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6031 // __kmp_monitor will appear to contain valid data, but it is only valid in 6032 // the parent process, not the child. 6033 // New behavior (201008): instead of keying off of the flag 6034 // __kmp_init_parallel, the monitor thread creation is keyed off 6035 // of the new flag __kmp_init_monitor. 6036 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6037 if (TCR_4(__kmp_init_monitor)) { 6038 __kmp_reap_monitor(&__kmp_monitor); 6039 TCW_4(__kmp_init_monitor, 0); 6040 } 6041 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6042 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6043 #endif // KMP_USE_MONITOR 6044 } else { 6045 /* TODO move this to cleanup code */ 6046 #ifdef KMP_DEBUG 6047 /* make sure that everything has properly ended */ 6048 for (i = 0; i < __kmp_threads_capacity; i++) { 6049 if (__kmp_root[i]) { 6050 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6051 // there can be uber threads alive here 6052 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6053 } 6054 } 6055 #endif 6056 6057 KMP_MB(); 6058 6059 // Reap the worker threads. 6060 // This is valid for now, but be careful if threads are reaped sooner. 6061 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6062 // Get the next thread from the pool. 6063 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6064 __kmp_thread_pool = thread->th.th_next_pool; 6065 // Reap it. 6066 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6067 thread->th.th_next_pool = NULL; 6068 thread->th.th_in_pool = FALSE; 6069 __kmp_reap_thread(thread, 0); 6070 } 6071 __kmp_thread_pool_insert_pt = NULL; 6072 6073 // Reap teams. 6074 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6075 // Get the next team from the pool. 6076 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6077 __kmp_team_pool = team->t.t_next_pool; 6078 // Reap it. 6079 team->t.t_next_pool = NULL; 6080 __kmp_reap_team(team); 6081 } 6082 6083 __kmp_reap_task_teams(); 6084 6085 #if KMP_OS_UNIX 6086 // Threads that are not reaped should not access any resources since they 6087 // are going to be deallocated soon, so the shutdown sequence should wait 6088 // until all threads either exit the final spin-waiting loop or begin 6089 // sleeping after the given blocktime. 6090 for (i = 0; i < __kmp_threads_capacity; i++) { 6091 kmp_info_t *thr = __kmp_threads[i]; 6092 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6093 KMP_CPU_PAUSE(); 6094 } 6095 #endif 6096 6097 for (i = 0; i < __kmp_threads_capacity; ++i) { 6098 // TBD: Add some checking... 6099 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6100 } 6101 6102 /* Make sure all threadprivate destructors get run by joining with all 6103 worker threads before resetting this flag */ 6104 TCW_SYNC_4(__kmp_init_common, FALSE); 6105 6106 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6107 KMP_MB(); 6108 6109 #if KMP_USE_MONITOR 6110 // See note above: One of the possible fixes for CQ138434 / CQ140126 6111 // 6112 // FIXME: push both code fragments down and CSE them? 6113 // push them into __kmp_cleanup() ? 
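  // (This is the second of the two duplicated monitor-reap fragments the
  // FIXME above refers to; the first appears earlier in this function, in the
  // branch taken when other active roots are still found.)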
6114 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6115 if (TCR_4(__kmp_init_monitor)) { 6116 __kmp_reap_monitor(&__kmp_monitor); 6117 TCW_4(__kmp_init_monitor, 0); 6118 } 6119 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6120 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6121 #endif 6122 } /* else !__kmp_global.t_active */ 6123 TCW_4(__kmp_init_gtid, FALSE); 6124 KMP_MB(); /* Flush all pending memory write invalidates. */ 6125 6126 __kmp_cleanup(); 6127 #if OMPT_SUPPORT 6128 ompt_fini(); 6129 #endif 6130 } 6131 6132 void __kmp_internal_end_library(int gtid_req) { 6133 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6134 /* this shouldn't be a race condition because __kmp_internal_end() is the 6135 only place to clear __kmp_serial_init */ 6136 /* we'll check this later too, after we get the lock */ 6137 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6138 // redundant, because the next check will work in any case. 6139 if (__kmp_global.g.g_abort) { 6140 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6141 /* TODO abort? */ 6142 return; 6143 } 6144 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6145 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6146 return; 6147 } 6148 6149 KMP_MB(); /* Flush all pending memory write invalidates. */ 6150 /* find out who we are and what we should do */ 6151 { 6152 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6153 KA_TRACE( 6154 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6155 if (gtid == KMP_GTID_SHUTDOWN) { 6156 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6157 "already shutdown\n")); 6158 return; 6159 } else if (gtid == KMP_GTID_MONITOR) { 6160 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6161 "registered, or system shutdown\n")); 6162 return; 6163 } else if (gtid == KMP_GTID_DNE) { 6164 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6165 "shutdown\n")); 6166 /* we don't know who we are, but we may still shutdown the library */ 6167 } else if (KMP_UBER_GTID(gtid)) { 6168 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6169 if (__kmp_root[gtid]->r.r_active) { 6170 __kmp_global.g.g_abort = -1; 6171 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6172 __kmp_unregister_library(); 6173 KA_TRACE(10, 6174 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6175 gtid)); 6176 return; 6177 } else { 6178 KA_TRACE( 6179 10, 6180 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6181 __kmp_unregister_root_current_thread(gtid); 6182 } 6183 } else { 6184 /* worker threads may call this function through the atexit handler, if they 6185 * call exit() */ 6186 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6187 TODO: do a thorough shutdown instead */ 6188 #ifdef DUMP_DEBUG_ON_EXIT 6189 if (__kmp_debug_buf) 6190 __kmp_dump_debug_buffer(); 6191 #endif 6192 // added unregister library call here when we switch to shm linux 6193 // if we don't, it will leave lots of files in /dev/shm 6194 // cleanup shared memory file before exiting. 
6195 __kmp_unregister_library(); 6196 return; 6197 } 6198 } 6199 /* synchronize the termination process */ 6200 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6201 6202 /* have we already finished */ 6203 if (__kmp_global.g.g_abort) { 6204 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6205 /* TODO abort? */ 6206 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6207 return; 6208 } 6209 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6210 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6211 return; 6212 } 6213 6214 /* We need this lock to enforce mutex between this reading of 6215 __kmp_threads_capacity and the writing by __kmp_register_root. 6216 Alternatively, we can use a counter of roots that is atomically updated by 6217 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6218 __kmp_internal_end_*. */ 6219 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6220 6221 /* now we can safely conduct the actual termination */ 6222 __kmp_internal_end(); 6223 6224 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6225 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6226 6227 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6228 6229 #ifdef DUMP_DEBUG_ON_EXIT 6230 if (__kmp_debug_buf) 6231 __kmp_dump_debug_buffer(); 6232 #endif 6233 6234 #if KMP_OS_WINDOWS 6235 __kmp_close_console(); 6236 #endif 6237 6238 __kmp_fini_allocator(); 6239 6240 } // __kmp_internal_end_library 6241 6242 void __kmp_internal_end_thread(int gtid_req) { 6243 int i; 6244 6245 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6246 /* this shouldn't be a race condition because __kmp_internal_end() is the 6247 * only place to clear __kmp_serial_init */ 6248 /* we'll check this later too, after we get the lock */ 6249 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6250 // redundant, because the next check will work in any case. 6251 if (__kmp_global.g.g_abort) { 6252 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6253 /* TODO abort? */ 6254 return; 6255 } 6256 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6257 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6258 return; 6259 } 6260 6261 // If hidden helper team has been initialized, we need to deinit it 6262 if (TCR_4(__kmp_init_hidden_helper)) { 6263 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6264 // First release the main thread to let it continue its work 6265 __kmp_hidden_helper_main_thread_release(); 6266 // Wait until the hidden helper team has been destroyed 6267 __kmp_hidden_helper_threads_deinitz_wait(); 6268 } 6269 6270 KMP_MB(); /* Flush all pending memory write invalidates. */ 6271 6272 /* find out who we are and what we should do */ 6273 { 6274 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6275 KA_TRACE(10, 6276 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6277 if (gtid == KMP_GTID_SHUTDOWN) { 6278 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6279 "already shutdown\n")); 6280 return; 6281 } else if (gtid == KMP_GTID_MONITOR) { 6282 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6283 "registered, or system shutdown\n")); 6284 return; 6285 } else if (gtid == KMP_GTID_DNE) { 6286 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6287 "shutdown\n")); 6288 return; 6289 /* we don't know who we are */ 6290 } else if (KMP_UBER_GTID(gtid)) { 6291 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6292 if (__kmp_root[gtid]->r.r_active) { 6293 __kmp_global.g.g_abort = -1; 6294 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6295 KA_TRACE(10, 6296 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6297 gtid)); 6298 return; 6299 } else { 6300 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6301 gtid)); 6302 __kmp_unregister_root_current_thread(gtid); 6303 } 6304 } else { 6305 /* just a worker thread, let's leave */ 6306 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6307 6308 if (gtid >= 0) { 6309 __kmp_threads[gtid]->th.th_task_team = NULL; 6310 } 6311 6312 KA_TRACE(10, 6313 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6314 gtid)); 6315 return; 6316 } 6317 } 6318 #if KMP_DYNAMIC_LIB 6319 if (__kmp_pause_status != kmp_hard_paused) 6320 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6321 // because we will better shutdown later in the library destructor. 6322 { 6323 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6324 return; 6325 } 6326 #endif 6327 /* synchronize the termination process */ 6328 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6329 6330 /* have we already finished */ 6331 if (__kmp_global.g.g_abort) { 6332 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6333 /* TODO abort? */ 6334 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6335 return; 6336 } 6337 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6338 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6339 return; 6340 } 6341 6342 /* We need this lock to enforce mutex between this reading of 6343 __kmp_threads_capacity and the writing by __kmp_register_root. 6344 Alternatively, we can use a counter of roots that is atomically updated by 6345 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6346 __kmp_internal_end_*. */ 6347 6348 /* should we finish the run-time? are all siblings done? */ 6349 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6350 6351 for (i = 0; i < __kmp_threads_capacity; ++i) { 6352 if (KMP_UBER_GTID(i)) { 6353 KA_TRACE( 6354 10, 6355 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6356 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6357 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6358 return; 6359 } 6360 } 6361 6362 /* now we can safely conduct the actual termination */ 6363 6364 __kmp_internal_end(); 6365 6366 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6367 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6368 6369 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6370 6371 #ifdef DUMP_DEBUG_ON_EXIT 6372 if (__kmp_debug_buf) 6373 __kmp_dump_debug_buffer(); 6374 #endif 6375 } // __kmp_internal_end_thread 6376 6377 // ----------------------------------------------------------------------------- 6378 // Library registration stuff. 6379 6380 static long __kmp_registration_flag = 0; 6381 // Random value used to indicate library initialization. 6382 static char *__kmp_registration_str = NULL; 6383 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6384 6385 static inline char *__kmp_reg_status_name() { 6386 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6387 each thread. If registration and unregistration go in different threads 6388 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6389 env var can not be found, because the name will contain different pid. 
*/ 6390 // macOS* complains about name being too long with additional getuid() 6391 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6392 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6393 (int)getuid()); 6394 #else 6395 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6396 #endif 6397 } // __kmp_reg_status_get 6398 6399 void __kmp_register_library_startup(void) { 6400 6401 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6402 int done = 0; 6403 union { 6404 double dtime; 6405 long ltime; 6406 } time; 6407 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6408 __kmp_initialize_system_tick(); 6409 #endif 6410 __kmp_read_system_time(&time.dtime); 6411 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6412 __kmp_registration_str = 6413 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6414 __kmp_registration_flag, KMP_LIBRARY_FILE); 6415 6416 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6417 __kmp_registration_str)); 6418 6419 while (!done) { 6420 6421 char *value = NULL; // Actual value of the environment variable. 6422 6423 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6424 char *shm_name = __kmp_str_format("/%s", name); 6425 int shm_preexist = 0; 6426 char *data1; 6427 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6428 if ((fd1 == -1) && (errno == EEXIST)) { 6429 // file didn't open because it already exists. 6430 // try opening existing file 6431 fd1 = shm_open(shm_name, O_RDWR, 0666); 6432 if (fd1 == -1) { // file didn't open 6433 // error out here 6434 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6435 __kmp_msg_null); 6436 } else { 6437 // able to open existing file 6438 shm_preexist = 1; 6439 } 6440 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6441 // already exists. 6442 // error out here. 6443 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6444 __kmp_msg_null); 6445 } 6446 if (shm_preexist == 0) { 6447 // we created SHM now set size 6448 if (ftruncate(fd1, SHM_SIZE) == -1) { 6449 // error occured setting size; 6450 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6451 KMP_ERR(errno), __kmp_msg_null); 6452 } 6453 } 6454 data1 = 6455 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6456 if (data1 == MAP_FAILED) { 6457 // failed to map shared memory 6458 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6459 __kmp_msg_null); 6460 } 6461 if (shm_preexist == 0) { // set data to SHM, set value 6462 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6463 } 6464 // Read value from either what we just wrote or existing file. 6465 value = __kmp_str_format("%s", data1); // read value from SHM 6466 munmap(data1, SHM_SIZE); 6467 close(fd1); 6468 #else // Windows and unix with static library 6469 // Set environment variable, but do not overwrite if it is exist. 6470 __kmp_env_set(name, __kmp_registration_str, 0); 6471 // read value to see if it got set 6472 value = __kmp_env_get(name); 6473 #endif 6474 6475 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6476 done = 1; // Ok, environment variable set successfully, exit the loop. 6477 } else { 6478 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6479 // Check whether it alive or dead. 6480 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 
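      // The value parsed below was produced by __kmp_str_format("%p-%lx-%s",
      // ...) above, i.e. it has the form
      //   "<address of __kmp_registration_flag>-<flag value in hex>-<library
      //    file name>".
      // If the encoded address is still mapped in this process and still
      // holds the encoded value, the other copy of the runtime is alive
      // (neighbor = 1); otherwise it is considered dead (neighbor = 2) and
      // its stale registration can be cleaned up.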
6481 char *tail = value; 6482 char *flag_addr_str = NULL; 6483 char *flag_val_str = NULL; 6484 char const *file_name = NULL; 6485 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6486 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6487 file_name = tail; 6488 if (tail != NULL) { 6489 long *flag_addr = 0; 6490 unsigned long flag_val = 0; 6491 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6492 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6493 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6494 // First, check whether environment-encoded address is mapped into 6495 // addr space. 6496 // If so, dereference it to see if it still has the right value. 6497 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6498 neighbor = 1; 6499 } else { 6500 // If not, then we know the other copy of the library is no longer 6501 // running. 6502 neighbor = 2; 6503 } 6504 } 6505 } 6506 switch (neighbor) { 6507 case 0: // Cannot parse environment variable -- neighbor status unknown. 6508 // Assume it is the incompatible format of future version of the 6509 // library. Assume the other library is alive. 6510 // WARN( ... ); // TODO: Issue a warning. 6511 file_name = "unknown library"; 6512 KMP_FALLTHROUGH(); 6513 // Attention! Falling to the next case. That's intentional. 6514 case 1: { // Neighbor is alive. 6515 // Check it is allowed. 6516 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6517 if (!__kmp_str_match_true(duplicate_ok)) { 6518 // That's not allowed. Issue fatal error. 6519 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6520 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6521 } 6522 KMP_INTERNAL_FREE(duplicate_ok); 6523 __kmp_duplicate_library_ok = 1; 6524 done = 1; // Exit the loop. 6525 } break; 6526 case 2: { // Neighbor is dead. 6527 6528 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6529 // close shared memory. 6530 shm_unlink(shm_name); // this removes file in /dev/shm 6531 #else 6532 // Clear the variable and try to register library again. 6533 __kmp_env_unset(name); 6534 #endif 6535 } break; 6536 default: { 6537 KMP_DEBUG_ASSERT(0); 6538 } break; 6539 } 6540 } 6541 KMP_INTERNAL_FREE((void *)value); 6542 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6543 KMP_INTERNAL_FREE((void *)shm_name); 6544 #endif 6545 } // while 6546 KMP_INTERNAL_FREE((void *)name); 6547 6548 } // func __kmp_register_library_startup 6549 6550 void __kmp_unregister_library(void) { 6551 6552 char *name = __kmp_reg_status_name(); 6553 char *value = NULL; 6554 6555 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6556 char *shm_name = __kmp_str_format("/%s", name); 6557 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6558 if (fd1 == -1) { 6559 // file did not open. return. 6560 return; 6561 } 6562 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6563 if (data1 != MAP_FAILED) { 6564 value = __kmp_str_format("%s", data1); // read value from SHM 6565 munmap(data1, SHM_SIZE); 6566 } 6567 close(fd1); 6568 #else 6569 value = __kmp_env_get(name); 6570 #endif 6571 6572 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6573 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6574 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6575 // Ok, this is our variable. Delete it. 
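// The record is removed only if it still matches this copy's registration
// string; a record written by some other copy of the runtime is left in
// place for that copy (or a later one) to deal with.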
6576 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6577 shm_unlink(shm_name); // this removes file in /dev/shm 6578 #else 6579 __kmp_env_unset(name); 6580 #endif 6581 } 6582 6583 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6584 KMP_INTERNAL_FREE(shm_name); 6585 #endif 6586 6587 KMP_INTERNAL_FREE(__kmp_registration_str); 6588 KMP_INTERNAL_FREE(value); 6589 KMP_INTERNAL_FREE(name); 6590 6591 __kmp_registration_flag = 0; 6592 __kmp_registration_str = NULL; 6593 6594 } // __kmp_unregister_library 6595 6596 // End of Library registration stuff. 6597 // ----------------------------------------------------------------------------- 6598 6599 #if KMP_MIC_SUPPORTED 6600 6601 static void __kmp_check_mic_type() { 6602 kmp_cpuid_t cpuid_state = {0}; 6603 kmp_cpuid_t *cs_p = &cpuid_state; 6604 __kmp_x86_cpuid(1, 0, cs_p); 6605 // We don't support mic1 at the moment 6606 if ((cs_p->eax & 0xff0) == 0xB10) { 6607 __kmp_mic_type = mic2; 6608 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6609 __kmp_mic_type = mic3; 6610 } else { 6611 __kmp_mic_type = non_mic; 6612 } 6613 } 6614 6615 #endif /* KMP_MIC_SUPPORTED */ 6616 6617 #if KMP_HAVE_UMWAIT 6618 static void __kmp_user_level_mwait_init() { 6619 struct kmp_cpuid buf; 6620 __kmp_x86_cpuid(7, 0, &buf); 6621 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6622 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6623 __kmp_umwait_enabled)); 6624 } 6625 #elif KMP_HAVE_MWAIT 6626 #ifndef AT_INTELPHIUSERMWAIT 6627 // Spurious, non-existent value that should always fail to return anything. 6628 // Will be replaced with the correct value when we know that. 6629 #define AT_INTELPHIUSERMWAIT 10000 6630 #endif 6631 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6632 // earlier OS is used to build the RTL, we'll use the following internal 6633 // function when the entry is not found. 6634 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6635 unsigned long getauxval(unsigned long) { return 0; } 6636 6637 static void __kmp_user_level_mwait_init() { 6638 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6639 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6640 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6641 // KMP_USER_LEVEL_MWAIT was set to TRUE. 
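// Note: with the placeholder AT_INTELPHIUSERMWAIT value and the weak
// getauxval() stub above returning 0, the (res & 0x1) test below is
// effectively always false on systems lacking the real aux-vector entry, so
// enabling mwait there depends solely on __kmp_user_level_mwait
// (i.e. the KMP_USER_LEVEL_MWAIT setting).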
6642 if (__kmp_mic_type == mic3) { 6643 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6644 if ((res & 0x1) || __kmp_user_level_mwait) { 6645 __kmp_mwait_enabled = TRUE; 6646 if (__kmp_user_level_mwait) { 6647 KMP_INFORM(EnvMwaitWarn); 6648 } 6649 } else { 6650 __kmp_mwait_enabled = FALSE; 6651 } 6652 } 6653 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6654 "__kmp_mwait_enabled = %d\n", 6655 __kmp_mic_type, __kmp_mwait_enabled)); 6656 } 6657 #endif /* KMP_HAVE_UMWAIT */ 6658 6659 static void __kmp_do_serial_initialize(void) { 6660 int i, gtid; 6661 size_t size; 6662 6663 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6664 6665 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6666 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6667 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6668 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6669 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6670 6671 #if OMPT_SUPPORT 6672 ompt_pre_init(); 6673 #endif 6674 6675 __kmp_validate_locks(); 6676 6677 /* Initialize internal memory allocator */ 6678 __kmp_init_allocator(); 6679 6680 /* Register the library startup via an environment variable and check to see 6681 whether another copy of the library is already registered. */ 6682 6683 __kmp_register_library_startup(); 6684 6685 /* TODO reinitialization of library */ 6686 if (TCR_4(__kmp_global.g.g_done)) { 6687 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6688 } 6689 6690 __kmp_global.g.g_abort = 0; 6691 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6692 6693 /* initialize the locks */ 6694 #if KMP_USE_ADAPTIVE_LOCKS 6695 #if KMP_DEBUG_ADAPTIVE_LOCKS 6696 __kmp_init_speculative_stats(); 6697 #endif 6698 #endif 6699 #if KMP_STATS_ENABLED 6700 __kmp_stats_init(); 6701 #endif 6702 __kmp_init_lock(&__kmp_global_lock); 6703 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6704 __kmp_init_lock(&__kmp_debug_lock); 6705 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6706 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6707 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6708 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6709 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6710 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6711 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6712 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6713 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6714 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6715 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6716 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6717 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6718 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6719 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6720 #if KMP_USE_MONITOR 6721 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6722 #endif 6723 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6724 6725 /* conduct initialization and initial setup of configuration */ 6726 6727 __kmp_runtime_initialize(); 6728 6729 #if KMP_MIC_SUPPORTED 6730 __kmp_check_mic_type(); 6731 #endif 6732 6733 // Some global variable initialization moved here from kmp_env_initialize() 6734 #ifdef KMP_DEBUG 6735 kmp_diag = 0; 6736 #endif 6737 __kmp_abort_delay = 0; 6738 6739 // From __kmp_init_dflt_team_nth() 6740 /* assume the entire machine will be used */ 6741 __kmp_dflt_team_nth_ub = __kmp_xproc; 6742 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6743 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6744 } 6745 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6746 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6747 } 6748 
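// At this point __kmp_dflt_team_nth_ub is __kmp_xproc clamped to
// [KMP_MIN_NTH, __kmp_sys_max_nth]; e.g. (illustrative) on a 64-processor
// machine whose limits do not interfere it is simply 64.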
__kmp_max_nth = __kmp_sys_max_nth; 6749 __kmp_cg_max_nth = __kmp_sys_max_nth; 6750 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6751 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6752 __kmp_teams_max_nth = __kmp_sys_max_nth; 6753 } 6754 6755 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6756 // part 6757 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6758 #if KMP_USE_MONITOR 6759 __kmp_monitor_wakeups = 6760 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6761 __kmp_bt_intervals = 6762 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6763 #endif 6764 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6765 __kmp_library = library_throughput; 6766 // From KMP_SCHEDULE initialization 6767 __kmp_static = kmp_sch_static_balanced; 6768 // AC: do not use analytical here, because it is non-monotonous 6769 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6770 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6771 // need to repeat assignment 6772 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6773 // bit control and barrier method control parts 6774 #if KMP_FAST_REDUCTION_BARRIER 6775 #define kmp_reduction_barrier_gather_bb ((int)1) 6776 #define kmp_reduction_barrier_release_bb ((int)1) 6777 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6778 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6779 #endif // KMP_FAST_REDUCTION_BARRIER 6780 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6781 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6782 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6783 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6784 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6785 #if KMP_FAST_REDUCTION_BARRIER 6786 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6787 // lin_64 ): hyper,1 6788 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6789 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6790 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6791 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6792 } 6793 #endif // KMP_FAST_REDUCTION_BARRIER 6794 } 6795 #if KMP_FAST_REDUCTION_BARRIER 6796 #undef kmp_reduction_barrier_release_pat 6797 #undef kmp_reduction_barrier_gather_pat 6798 #undef kmp_reduction_barrier_release_bb 6799 #undef kmp_reduction_barrier_gather_bb 6800 #endif // KMP_FAST_REDUCTION_BARRIER 6801 #if KMP_MIC_SUPPORTED 6802 if (__kmp_mic_type == mic2) { // KNC 6803 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6804 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6805 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6806 1; // forkjoin release 6807 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6808 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6809 } 6810 #if KMP_FAST_REDUCTION_BARRIER 6811 if (__kmp_mic_type == mic2) { // KNC 6812 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6813 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6814 } 6815 #endif // KMP_FAST_REDUCTION_BARRIER 6816 #endif // KMP_MIC_SUPPORTED 6817 6818 // From KMP_CHECKS initialization 6819 #ifdef KMP_DEBUG 6820 __kmp_env_checks = TRUE; /* development versions have the 
extra checks */ 6821 #else 6822 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6823 #endif 6824 6825 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6826 __kmp_foreign_tp = TRUE; 6827 6828 __kmp_global.g.g_dynamic = FALSE; 6829 __kmp_global.g.g_dynamic_mode = dynamic_default; 6830 6831 __kmp_env_initialize(NULL); 6832 6833 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6834 __kmp_user_level_mwait_init(); 6835 #endif 6836 // Print all messages in message catalog for testing purposes. 6837 #ifdef KMP_DEBUG 6838 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6839 if (__kmp_str_match_true(val)) { 6840 kmp_str_buf_t buffer; 6841 __kmp_str_buf_init(&buffer); 6842 __kmp_i18n_dump_catalog(&buffer); 6843 __kmp_printf("%s", buffer.str); 6844 __kmp_str_buf_free(&buffer); 6845 } 6846 __kmp_env_free(&val); 6847 #endif 6848 6849 __kmp_threads_capacity = 6850 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6851 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6852 __kmp_tp_capacity = __kmp_default_tp_capacity( 6853 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6854 6855 // If the library is shut down properly, both pools must be NULL. Just in 6856 // case, set them to NULL -- some memory may leak, but subsequent code will 6857 // work even if pools are not freed. 6858 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6859 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6860 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6861 __kmp_thread_pool = NULL; 6862 __kmp_thread_pool_insert_pt = NULL; 6863 __kmp_team_pool = NULL; 6864 6865 /* Allocate all of the variable sized records */ 6866 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6867 * expandable */ 6868 /* Since allocation is cache-aligned, just add extra padding at the end */ 6869 size = 6870 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6871 CACHE_LINE; 6872 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6873 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6874 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6875 6876 /* init thread counts */ 6877 KMP_DEBUG_ASSERT(__kmp_all_nth == 6878 0); // Asserts fail if the library is reinitializing and 6879 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6880 __kmp_all_nth = 0; 6881 __kmp_nth = 0; 6882 6883 /* setup the uber master thread and hierarchy */ 6884 gtid = __kmp_register_root(TRUE); 6885 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6886 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6887 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6888 6889 KMP_MB(); /* Flush all pending memory write invalidates. */ 6890 6891 __kmp_common_initialize(); 6892 6893 #if KMP_OS_UNIX 6894 /* invoke the child fork handler */ 6895 __kmp_register_atfork(); 6896 #endif 6897 6898 #if !KMP_DYNAMIC_LIB 6899 { 6900 /* Invoke the exit handler when the program finishes, only for static 6901 library. For dynamic library, we already have _fini and DllMain. */ 6902 int rc = atexit(__kmp_internal_end_atexit); 6903 if (rc != 0) { 6904 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6905 __kmp_msg_null); 6906 } 6907 } 6908 #endif 6909 6910 #if KMP_HANDLE_SIGNALS 6911 #if KMP_OS_UNIX 6912 /* NOTE: make sure that this is called before the user installs their own 6913 signal handlers so that the user handlers are called first. this way they 6914 can return false, not call our handler, avoid terminating the library, and 6915 continue execution where they left off. 
*/ 6916 __kmp_install_signals(FALSE); 6917 #endif /* KMP_OS_UNIX */ 6918 #if KMP_OS_WINDOWS 6919 __kmp_install_signals(TRUE); 6920 #endif /* KMP_OS_WINDOWS */ 6921 #endif 6922 6923 /* we have finished the serial initialization */ 6924 __kmp_init_counter++; 6925 6926 __kmp_init_serial = TRUE; 6927 6928 if (__kmp_settings) { 6929 __kmp_env_print(); 6930 } 6931 6932 if (__kmp_display_env || __kmp_display_env_verbose) { 6933 __kmp_env_print_2(); 6934 } 6935 6936 #if OMPT_SUPPORT 6937 ompt_post_init(); 6938 #endif 6939 6940 KMP_MB(); 6941 6942 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6943 } 6944 6945 void __kmp_serial_initialize(void) { 6946 if (__kmp_init_serial) { 6947 return; 6948 } 6949 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6950 if (__kmp_init_serial) { 6951 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6952 return; 6953 } 6954 __kmp_do_serial_initialize(); 6955 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6956 } 6957 6958 static void __kmp_do_middle_initialize(void) { 6959 int i, j; 6960 int prev_dflt_team_nth; 6961 6962 if (!__kmp_init_serial) { 6963 __kmp_do_serial_initialize(); 6964 } 6965 6966 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6967 6968 // Save the previous value for the __kmp_dflt_team_nth so that 6969 // we can avoid some reinitialization if it hasn't changed. 6970 prev_dflt_team_nth = __kmp_dflt_team_nth; 6971 6972 #if KMP_AFFINITY_SUPPORTED 6973 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6974 // number of cores on the machine. 6975 __kmp_affinity_initialize(); 6976 6977 // Run through the __kmp_threads array and set the affinity mask 6978 // for each root thread that is currently registered with the RTL. 6979 for (i = 0; i < __kmp_threads_capacity; i++) { 6980 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6981 __kmp_affinity_set_init_mask(i, TRUE); 6982 } 6983 } 6984 #endif /* KMP_AFFINITY_SUPPORTED */ 6985 6986 KMP_ASSERT(__kmp_xproc > 0); 6987 if (__kmp_avail_proc == 0) { 6988 __kmp_avail_proc = __kmp_xproc; 6989 } 6990 6991 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6992 // correct them now 6993 j = 0; 6994 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6995 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6996 __kmp_avail_proc; 6997 j++; 6998 } 6999 7000 if (__kmp_dflt_team_nth == 0) { 7001 #ifdef KMP_DFLT_NTH_CORES 7002 // Default #threads = #cores 7003 __kmp_dflt_team_nth = __kmp_ncores; 7004 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7005 "__kmp_ncores (%d)\n", 7006 __kmp_dflt_team_nth)); 7007 #else 7008 // Default #threads = #available OS procs 7009 __kmp_dflt_team_nth = __kmp_avail_proc; 7010 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7011 "__kmp_avail_proc(%d)\n", 7012 __kmp_dflt_team_nth)); 7013 #endif /* KMP_DFLT_NTH_CORES */ 7014 } 7015 7016 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7017 __kmp_dflt_team_nth = KMP_MIN_NTH; 7018 } 7019 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7020 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7021 } 7022 7023 // There's no harm in continuing if the following check fails, 7024 // but it indicates an error in the previous logic. 
7025 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7026 7027 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7028 // Run through the __kmp_threads array and set the num threads icv for each 7029 // root thread that is currently registered with the RTL (which has not 7030 // already explicitly set its nthreads-var with a call to 7031 // omp_set_num_threads()). 7032 for (i = 0; i < __kmp_threads_capacity; i++) { 7033 kmp_info_t *thread = __kmp_threads[i]; 7034 if (thread == NULL) 7035 continue; 7036 if (thread->th.th_current_task->td_icvs.nproc != 0) 7037 continue; 7038 7039 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7040 } 7041 } 7042 KA_TRACE( 7043 20, 7044 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7045 __kmp_dflt_team_nth)); 7046 7047 #ifdef KMP_ADJUST_BLOCKTIME 7048 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7049 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7050 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7051 if (__kmp_nth > __kmp_avail_proc) { 7052 __kmp_zero_bt = TRUE; 7053 } 7054 } 7055 #endif /* KMP_ADJUST_BLOCKTIME */ 7056 7057 /* we have finished middle initialization */ 7058 TCW_SYNC_4(__kmp_init_middle, TRUE); 7059 7060 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7061 } 7062 7063 void __kmp_middle_initialize(void) { 7064 if (__kmp_init_middle) { 7065 return; 7066 } 7067 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7068 if (__kmp_init_middle) { 7069 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7070 return; 7071 } 7072 __kmp_do_middle_initialize(); 7073 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7074 } 7075 7076 void __kmp_parallel_initialize(void) { 7077 int gtid = __kmp_entry_gtid(); // this might be a new root 7078 7079 /* synchronize parallel initialization (for sibling) */ 7080 if (TCR_4(__kmp_init_parallel)) 7081 return; 7082 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7083 if (TCR_4(__kmp_init_parallel)) { 7084 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7085 return; 7086 } 7087 7088 /* TODO reinitialization after we have already shut down */ 7089 if (TCR_4(__kmp_global.g.g_done)) { 7090 KA_TRACE( 7091 10, 7092 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7093 __kmp_infinite_loop(); 7094 } 7095 7096 /* jc: The lock __kmp_initz_lock is already held, so calling 7097 __kmp_serial_initialize would cause a deadlock. So we call 7098 __kmp_do_serial_initialize directly. */ 7099 if (!__kmp_init_middle) { 7100 __kmp_do_middle_initialize(); 7101 } 7102 __kmp_resume_if_hard_paused(); 7103 7104 /* begin initialization */ 7105 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7106 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7107 7108 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7109 // Save the FP control regs. 7110 // Worker threads will set theirs to these values at thread startup. 
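// The primary thread's x87 control word and MXCSR are captured once here;
// __kmp_init_mxcsr is filtered through KMP_X86_MXCSR_MASK before workers
// copy it, so every thread starts with the same floating-point control
// settings.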
7111 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7112 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7113 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7114 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7115 7116 #if KMP_OS_UNIX 7117 #if KMP_HANDLE_SIGNALS 7118 /* must be after __kmp_serial_initialize */ 7119 __kmp_install_signals(TRUE); 7120 #endif 7121 #endif 7122 7123 __kmp_suspend_initialize(); 7124 7125 #if defined(USE_LOAD_BALANCE) 7126 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7127 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7128 } 7129 #else 7130 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7131 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7132 } 7133 #endif 7134 7135 if (__kmp_version) { 7136 __kmp_print_version_2(); 7137 } 7138 7139 /* we have finished parallel initialization */ 7140 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7141 7142 KMP_MB(); 7143 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7144 7145 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7146 } 7147 7148 void __kmp_hidden_helper_initialize() { 7149 if (TCR_4(__kmp_init_hidden_helper)) 7150 return; 7151 7152 // __kmp_parallel_initialize is required before we initialize hidden helper 7153 if (!TCR_4(__kmp_init_parallel)) 7154 __kmp_parallel_initialize(); 7155 7156 // Double check. Note that this double check should not be placed before 7157 // __kmp_parallel_initialize as it will cause dead lock. 7158 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7159 if (TCR_4(__kmp_init_hidden_helper)) { 7160 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7161 return; 7162 } 7163 7164 // Set the count of hidden helper tasks to be executed to zero 7165 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7166 7167 // Set the global variable indicating that we're initializing hidden helper 7168 // team/threads 7169 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7170 7171 // Platform independent initialization 7172 __kmp_do_initialize_hidden_helper_threads(); 7173 7174 // Wait here for the finish of initialization of hidden helper teams 7175 __kmp_hidden_helper_threads_initz_wait(); 7176 7177 // We have finished hidden helper initialization 7178 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7179 7180 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7181 } 7182 7183 /* ------------------------------------------------------------------------ */ 7184 7185 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7186 kmp_team_t *team) { 7187 kmp_disp_t *dispatch; 7188 7189 KMP_MB(); 7190 7191 /* none of the threads have encountered any constructs, yet. */ 7192 this_thr->th.th_local.this_construct = 0; 7193 #if KMP_CACHE_MANAGE 7194 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7195 #endif /* KMP_CACHE_MANAGE */ 7196 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7197 KMP_DEBUG_ASSERT(dispatch); 7198 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7199 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7200 // this_thr->th.th_info.ds.ds_tid ] ); 7201 7202 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7203 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7204 if (__kmp_env_consistency_check) 7205 __kmp_push_parallel(gtid, team->t.t_ident); 7206 7207 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7208 } 7209 7210 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7211 kmp_team_t *team) { 7212 if (__kmp_env_consistency_check) 7213 __kmp_pop_parallel(gtid, team->t.t_ident); 7214 7215 __kmp_finish_implicit_task(this_thr); 7216 } 7217 7218 int __kmp_invoke_task_func(int gtid) { 7219 int rc; 7220 int tid = __kmp_tid_from_gtid(gtid); 7221 kmp_info_t *this_thr = __kmp_threads[gtid]; 7222 kmp_team_t *team = this_thr->th.th_team; 7223 7224 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7225 #if USE_ITT_BUILD 7226 if (__itt_stack_caller_create_ptr) { 7227 // inform ittnotify about entering user's code 7228 if (team->t.t_stack_id != NULL) { 7229 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7230 } else { 7231 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7232 __kmp_itt_stack_callee_enter( 7233 (__itt_caller)team->t.t_parent->t.t_stack_id); 7234 } 7235 } 7236 #endif /* USE_ITT_BUILD */ 7237 #if INCLUDE_SSC_MARKS 7238 SSC_MARK_INVOKING(); 7239 #endif 7240 7241 #if OMPT_SUPPORT 7242 void *dummy; 7243 void **exit_frame_p; 7244 ompt_data_t *my_task_data; 7245 ompt_data_t *my_parallel_data; 7246 int ompt_team_size; 7247 7248 if (ompt_enabled.enabled) { 7249 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7250 .ompt_task_info.frame.exit_frame.ptr); 7251 } else { 7252 exit_frame_p = &dummy; 7253 } 7254 7255 my_task_data = 7256 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7257 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7258 if (ompt_enabled.ompt_callback_implicit_task) { 7259 ompt_team_size = team->t.t_nproc; 7260 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7261 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7262 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7263 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7264 } 7265 #endif 7266 7267 #if KMP_STATS_ENABLED 7268 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7269 if (previous_state == stats_state_e::TEAMS_REGION) { 7270 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7271 } else { 7272 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7273 } 7274 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7275 #endif 7276 7277 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7278 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7279 #if OMPT_SUPPORT 7280 , 7281 exit_frame_p 7282 #endif 7283 ); 7284 #if OMPT_SUPPORT 7285 *exit_frame_p = NULL; 7286 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7287 #endif 7288 7289 #if KMP_STATS_ENABLED 7290 if (previous_state == stats_state_e::TEAMS_REGION) { 7291 KMP_SET_THREAD_STATE(previous_state); 7292 } 7293 KMP_POP_PARTITIONED_TIMER(); 7294 #endif 7295 7296 #if USE_ITT_BUILD 7297 if (__itt_stack_caller_create_ptr) { 7298 // inform ittnotify about leaving user's code 7299 if (team->t.t_stack_id != NULL) { 7300 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7301 } else { 7302 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7303 __kmp_itt_stack_callee_leave( 7304 (__itt_caller)team->t.t_parent->t.t_stack_id); 7305 } 7306 } 7307 #endif /* USE_ITT_BUILD */ 7308 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7309 7310 return rc; 7311 } 7312 7313 void __kmp_teams_master(int gtid) { 7314 // This routine is called by all master threads in teams construct 7315 kmp_info_t *thr = __kmp_threads[gtid]; 7316 kmp_team_t *team = thr->th.th_team; 7317 ident_t *loc = team->t.t_ident; 7318 
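// Each team master arriving here (1) requests th_teams_size.nth threads for
// its inner parallel region, (2) registers itself as a new contention-group
// root, and (3) forks the league via __kmp_fork_call() with the teams
// microtask, joining afterwards without a join barrier since the workers are
// parked on the fork barrier.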
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7319 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7320 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7321 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7322 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7323 7324 // This thread is a new CG root. Set up the proper variables. 7325 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7326 tmp->cg_root = thr; // Make thr the CG root 7327 // Init to thread limit that was stored when league masters were forked 7328 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7329 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7330 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7331 " cg_nthreads to 1\n", 7332 thr, tmp)); 7333 tmp->up = thr->th.th_cg_roots; 7334 thr->th.th_cg_roots = tmp; 7335 7336 // Launch league of teams now, but not let workers execute 7337 // (they hang on fork barrier until next parallel) 7338 #if INCLUDE_SSC_MARKS 7339 SSC_MARK_FORKING(); 7340 #endif 7341 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7342 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7343 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7344 #if INCLUDE_SSC_MARKS 7345 SSC_MARK_JOINING(); 7346 #endif 7347 // If the team size was reduced from the limit, set it to the new size 7348 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7349 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7350 // AC: last parameter "1" eliminates join barrier which won't work because 7351 // worker threads are in a fork barrier waiting for more parallel regions 7352 __kmp_join_call(loc, gtid 7353 #if OMPT_SUPPORT 7354 , 7355 fork_context_intel 7356 #endif 7357 , 7358 1); 7359 } 7360 7361 int __kmp_invoke_teams_master(int gtid) { 7362 kmp_info_t *this_thr = __kmp_threads[gtid]; 7363 kmp_team_t *team = this_thr->th.th_team; 7364 #if KMP_DEBUG 7365 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7366 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7367 (void *)__kmp_teams_master); 7368 #endif 7369 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7370 #if OMPT_SUPPORT 7371 int tid = __kmp_tid_from_gtid(gtid); 7372 ompt_data_t *task_data = 7373 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7374 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7375 if (ompt_enabled.ompt_callback_implicit_task) { 7376 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7377 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7378 ompt_task_initial); 7379 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7380 } 7381 #endif 7382 __kmp_teams_master(gtid); 7383 #if OMPT_SUPPORT 7384 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7385 #endif 7386 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7387 return 1; 7388 } 7389 7390 /* this sets the requested number of threads for the next parallel region 7391 encountered by this team. 
since this should be enclosed in the forkjoin 7392 critical section it should avoid race conditions with asymmetrical nested 7393 parallelism */ 7394 7395 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7396 kmp_info_t *thr = __kmp_threads[gtid]; 7397 7398 if (num_threads > 0) 7399 thr->th.th_set_nproc = num_threads; 7400 } 7401 7402 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7403 int num_threads) { 7404 KMP_DEBUG_ASSERT(thr); 7405 // Remember the number of threads for inner parallel regions 7406 if (!TCR_4(__kmp_init_middle)) 7407 __kmp_middle_initialize(); // get internal globals calculated 7408 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7409 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7410 7411 if (num_threads == 0) { 7412 if (__kmp_teams_thread_limit > 0) { 7413 num_threads = __kmp_teams_thread_limit; 7414 } else { 7415 num_threads = __kmp_avail_proc / num_teams; 7416 } 7417 // adjust num_threads w/o warning as it is not user setting 7418 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7419 // no thread_limit clause specified - do not change thread-limit-var ICV 7420 if (num_threads > __kmp_dflt_team_nth) { 7421 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7422 } 7423 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7424 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7425 } // prevent team size to exceed thread-limit-var 7426 if (num_teams * num_threads > __kmp_teams_max_nth) { 7427 num_threads = __kmp_teams_max_nth / num_teams; 7428 } 7429 if (num_threads == 0) { 7430 num_threads = 1; 7431 } 7432 } else { 7433 // This thread will be the master of the league masters 7434 // Store new thread limit; old limit is saved in th_cg_roots list 7435 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7436 // num_threads = min(num_threads, nthreads-var) 7437 if (num_threads > __kmp_dflt_team_nth) { 7438 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7439 } 7440 if (num_teams * num_threads > __kmp_teams_max_nth) { 7441 int new_threads = __kmp_teams_max_nth / num_teams; 7442 if (new_threads == 0) { 7443 new_threads = 1; 7444 } 7445 if (new_threads != num_threads) { 7446 if (!__kmp_reserve_warn) { // user asked for too many threads 7447 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7448 __kmp_msg(kmp_ms_warning, 7449 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7450 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7451 } 7452 } 7453 num_threads = new_threads; 7454 } 7455 } 7456 thr->th.th_teams_size.nth = num_threads; 7457 } 7458 7459 /* this sets the requested number of teams for the teams region and/or 7460 the number of threads for the next parallel region encountered */ 7461 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7462 int num_threads) { 7463 kmp_info_t *thr = __kmp_threads[gtid]; 7464 KMP_DEBUG_ASSERT(num_teams >= 0); 7465 KMP_DEBUG_ASSERT(num_threads >= 0); 7466 7467 if (num_teams == 0) { 7468 if (__kmp_nteams > 0) { 7469 num_teams = __kmp_nteams; 7470 } else { 7471 num_teams = 1; // default number of teams is 1. 7472 } 7473 } 7474 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
7475 if (!__kmp_reserve_warn) { 7476 __kmp_reserve_warn = 1; 7477 __kmp_msg(kmp_ms_warning, 7478 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7479 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7480 } 7481 num_teams = __kmp_teams_max_nth; 7482 } 7483 // Set number of teams (number of threads in the outer "parallel" of the 7484 // teams) 7485 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7486 7487 __kmp_push_thread_limit(thr, num_teams, num_threads); 7488 } 7489 7490 /* This sets the requested number of teams for the teams region and/or 7491 the number of threads for the next parallel region encountered */ 7492 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7493 int num_teams_ub, int num_threads) { 7494 kmp_info_t *thr = __kmp_threads[gtid]; 7495 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7496 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7497 KMP_DEBUG_ASSERT(num_threads >= 0); 7498 7499 if (num_teams_lb > num_teams_ub) { 7500 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7501 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7502 } 7503 7504 int num_teams = 1; // defalt number of teams is 1. 7505 7506 if (num_teams_lb == 0 && num_teams_ub > 0) 7507 num_teams_lb = num_teams_ub; 7508 7509 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7510 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7511 if (num_teams > __kmp_teams_max_nth) { 7512 if (!__kmp_reserve_warn) { 7513 __kmp_reserve_warn = 1; 7514 __kmp_msg(kmp_ms_warning, 7515 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7516 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7517 } 7518 num_teams = __kmp_teams_max_nth; 7519 } 7520 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7521 num_teams = num_teams_ub; 7522 } else { // num_teams_lb <= num_teams <= num_teams_ub 7523 if (num_threads == 0) { 7524 if (num_teams_ub > __kmp_teams_max_nth) { 7525 num_teams = num_teams_lb; 7526 } else { 7527 num_teams = num_teams_ub; 7528 } 7529 } else { 7530 num_teams = (num_threads > __kmp_teams_max_nth) 7531 ? num_teams 7532 : __kmp_teams_max_nth / num_threads; 7533 if (num_teams < num_teams_lb) { 7534 num_teams = num_teams_lb; 7535 } else if (num_teams > num_teams_ub) { 7536 num_teams = num_teams_ub; 7537 } 7538 } 7539 } 7540 // Set number of teams (number of threads in the outer "parallel" of the 7541 // teams) 7542 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7543 7544 __kmp_push_thread_limit(thr, num_teams, num_threads); 7545 } 7546 7547 // Set the proc_bind var to use in the following parallel region. 7548 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7549 kmp_info_t *thr = __kmp_threads[gtid]; 7550 thr->th.th_set_proc_bind = proc_bind; 7551 } 7552 7553 /* Launch the worker threads into the microtask. */ 7554 7555 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7556 kmp_info_t *this_thr = __kmp_threads[gtid]; 7557 7558 #ifdef KMP_DEBUG 7559 int f; 7560 #endif /* KMP_DEBUG */ 7561 7562 KMP_DEBUG_ASSERT(team); 7563 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7564 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7565 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7566 7567 team->t.t_construct = 0; /* no single directives seen yet */ 7568 team->t.t_ordered.dt.t_value = 7569 0; /* thread 0 enters the ordered section first */ 7570 7571 /* Reset the identifiers on the dispatch buffer */ 7572 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7573 if (team->t.t_max_nproc > 1) { 7574 int i; 7575 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7576 team->t.t_disp_buffer[i].buffer_index = i; 7577 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7578 } 7579 } else { 7580 team->t.t_disp_buffer[0].buffer_index = 0; 7581 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7582 } 7583 7584 KMP_MB(); /* Flush all pending memory write invalidates. */ 7585 KMP_ASSERT(this_thr->th.th_team == team); 7586 7587 #ifdef KMP_DEBUG 7588 for (f = 0; f < team->t.t_nproc; f++) { 7589 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7590 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7591 } 7592 #endif /* KMP_DEBUG */ 7593 7594 /* release the worker threads so they may begin working */ 7595 __kmp_fork_barrier(gtid, 0); 7596 } 7597 7598 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7599 kmp_info_t *this_thr = __kmp_threads[gtid]; 7600 7601 KMP_DEBUG_ASSERT(team); 7602 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7603 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7604 KMP_MB(); /* Flush all pending memory write invalidates. */ 7605 7606 /* Join barrier after fork */ 7607 7608 #ifdef KMP_DEBUG 7609 if (__kmp_threads[gtid] && 7610 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7611 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7612 __kmp_threads[gtid]); 7613 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7614 "team->t.t_nproc=%d\n", 7615 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7616 team->t.t_nproc); 7617 __kmp_print_structure(); 7618 } 7619 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7620 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7621 #endif /* KMP_DEBUG */ 7622 7623 __kmp_join_barrier(gtid); /* wait for everyone */ 7624 #if OMPT_SUPPORT 7625 if (ompt_enabled.enabled && 7626 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7627 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7628 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7629 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7630 #if OMPT_OPTIONAL 7631 void *codeptr = NULL; 7632 if (KMP_MASTER_TID(ds_tid) && 7633 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7634 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7635 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7636 7637 if (ompt_enabled.ompt_callback_sync_region_wait) { 7638 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7639 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7640 codeptr); 7641 } 7642 if (ompt_enabled.ompt_callback_sync_region) { 7643 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7644 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7645 codeptr); 7646 } 7647 #endif 7648 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7649 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7650 ompt_scope_end, NULL, task_data, 0, ds_tid, 7651 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7652 } 7653 } 7654 #endif 7655 7656 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7657 KMP_ASSERT(this_thr->th.th_team == team); 7658 } 7659 7660 /* ------------------------------------------------------------------------ */ 7661 7662 #ifdef USE_LOAD_BALANCE 7663 7664 // Return the worker threads actively spinning in the hot team, if we 7665 // are at the outermost level of parallelism. Otherwise, return 0. 7666 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7667 int i; 7668 int retval; 7669 kmp_team_t *hot_team; 7670 7671 if (root->r.r_active) { 7672 return 0; 7673 } 7674 hot_team = root->r.r_hot_team; 7675 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7676 return hot_team->t.t_nproc - 1; // Don't count master thread 7677 } 7678 7679 // Skip the master thread - it is accounted for elsewhere. 7680 retval = 0; 7681 for (i = 1; i < hot_team->t.t_nproc; i++) { 7682 if (hot_team->t.t_threads[i]->th.th_active) { 7683 retval++; 7684 } 7685 } 7686 return retval; 7687 } 7688 7689 // Perform an automatic adjustment to the number of 7690 // threads used by the next parallel region. 7691 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7692 int retval; 7693 int pool_active; 7694 int hot_team_active; 7695 int team_curr_active; 7696 int system_active; 7697 7698 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7699 set_nproc)); 7700 KMP_DEBUG_ASSERT(root); 7701 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7702 ->th.th_current_task->td_icvs.dynamic == TRUE); 7703 KMP_DEBUG_ASSERT(set_nproc > 1); 7704 7705 if (set_nproc == 1) { 7706 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7707 return 1; 7708 } 7709 7710 // Threads that are active in the thread pool, active in the hot team for this 7711 // particular root (if we are at the outer par level), and the currently 7712 // executing thread (to become the master) are available to add to the new 7713 // team, but are currently contributing to the system load, and must be 7714 // accounted for. 7715 pool_active = __kmp_thread_pool_active_nth; 7716 hot_team_active = __kmp_active_hot_team_nproc(root); 7717 team_curr_active = pool_active + hot_team_active + 1; 7718 7719 // Check the system load. 7720 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7721 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7722 "hot team active = %d\n", 7723 system_active, pool_active, hot_team_active)); 7724 7725 if (system_active < 0) { 7726 // There was an error reading the necessary info from /proc, so use the 7727 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7728 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7729 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7730 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7731 7732 // Make this call behave like the thread limit algorithm. 7733 retval = __kmp_avail_proc - __kmp_nth + 7734 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7735 if (retval > set_nproc) { 7736 retval = set_nproc; 7737 } 7738 if (retval < KMP_MIN_NTH) { 7739 retval = KMP_MIN_NTH; 7740 } 7741 7742 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7743 retval)); 7744 return retval; 7745 } 7746 7747 // There is a slight delay in the load balance algorithm in detecting new 7748 // running procs. The real system load at this instant should be at least as 7749 // large as the #active omp thread that are available to add to the team. 
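// Illustrative example (made-up numbers): with __kmp_avail_proc == 8,
// system_active == 5 and team_curr_active == 3, the code below computes
// retval = 8 - 5 + 3 = 6, which is then clamped to at most set_nproc and at
// least KMP_MIN_NTH.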
7750 if (system_active < team_curr_active) { 7751 system_active = team_curr_active; 7752 } 7753 retval = __kmp_avail_proc - system_active + team_curr_active; 7754 if (retval > set_nproc) { 7755 retval = set_nproc; 7756 } 7757 if (retval < KMP_MIN_NTH) { 7758 retval = KMP_MIN_NTH; 7759 } 7760 7761 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7762 return retval; 7763 } // __kmp_load_balance_nproc() 7764 7765 #endif /* USE_LOAD_BALANCE */ 7766 7767 /* ------------------------------------------------------------------------ */ 7768 7769 /* NOTE: this is called with the __kmp_init_lock held */ 7770 void __kmp_cleanup(void) { 7771 int f; 7772 7773 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7774 7775 if (TCR_4(__kmp_init_parallel)) { 7776 #if KMP_HANDLE_SIGNALS 7777 __kmp_remove_signals(); 7778 #endif 7779 TCW_4(__kmp_init_parallel, FALSE); 7780 } 7781 7782 if (TCR_4(__kmp_init_middle)) { 7783 #if KMP_AFFINITY_SUPPORTED 7784 __kmp_affinity_uninitialize(); 7785 #endif /* KMP_AFFINITY_SUPPORTED */ 7786 __kmp_cleanup_hierarchy(); 7787 TCW_4(__kmp_init_middle, FALSE); 7788 } 7789 7790 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7791 7792 if (__kmp_init_serial) { 7793 __kmp_runtime_destroy(); 7794 __kmp_init_serial = FALSE; 7795 } 7796 7797 __kmp_cleanup_threadprivate_caches(); 7798 7799 for (f = 0; f < __kmp_threads_capacity; f++) { 7800 if (__kmp_root[f] != NULL) { 7801 __kmp_free(__kmp_root[f]); 7802 __kmp_root[f] = NULL; 7803 } 7804 } 7805 __kmp_free(__kmp_threads); 7806 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7807 // there is no need in freeing __kmp_root. 7808 __kmp_threads = NULL; 7809 __kmp_root = NULL; 7810 __kmp_threads_capacity = 0; 7811 7812 #if KMP_USE_DYNAMIC_LOCK 7813 __kmp_cleanup_indirect_user_locks(); 7814 #else 7815 __kmp_cleanup_user_locks(); 7816 #endif 7817 7818 #if KMP_AFFINITY_SUPPORTED 7819 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7820 __kmp_cpuinfo_file = NULL; 7821 #endif /* KMP_AFFINITY_SUPPORTED */ 7822 7823 #if KMP_USE_ADAPTIVE_LOCKS 7824 #if KMP_DEBUG_ADAPTIVE_LOCKS 7825 __kmp_print_speculative_stats(); 7826 #endif 7827 #endif 7828 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7829 __kmp_nested_nth.nth = NULL; 7830 __kmp_nested_nth.size = 0; 7831 __kmp_nested_nth.used = 0; 7832 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7833 __kmp_nested_proc_bind.bind_types = NULL; 7834 __kmp_nested_proc_bind.size = 0; 7835 __kmp_nested_proc_bind.used = 0; 7836 if (__kmp_affinity_format) { 7837 KMP_INTERNAL_FREE(__kmp_affinity_format); 7838 __kmp_affinity_format = NULL; 7839 } 7840 7841 __kmp_i18n_catclose(); 7842 7843 #if KMP_USE_HIER_SCHED 7844 __kmp_hier_scheds.deallocate(); 7845 #endif 7846 7847 #if KMP_STATS_ENABLED 7848 __kmp_stats_fini(); 7849 #endif 7850 7851 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7852 } 7853 7854 /* ------------------------------------------------------------------------ */ 7855 7856 int __kmp_ignore_mppbeg(void) { 7857 char *env; 7858 7859 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7860 if (__kmp_str_match_false(env)) 7861 return FALSE; 7862 } 7863 // By default __kmpc_begin() is no-op. 7864 return TRUE; 7865 } 7866 7867 int __kmp_ignore_mppend(void) { 7868 char *env; 7869 7870 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7871 if (__kmp_str_match_false(env)) 7872 return FALSE; 7873 } 7874 // By default __kmpc_end() is no-op. 
7875 return TRUE; 7876 } 7877 7878 void __kmp_internal_begin(void) { 7879 int gtid; 7880 kmp_root_t *root; 7881 7882 /* this is a very important step as it will register new sibling threads 7883 and assign these new uber threads a new gtid */ 7884 gtid = __kmp_entry_gtid(); 7885 root = __kmp_threads[gtid]->th.th_root; 7886 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7887 7888 if (root->r.r_begin) 7889 return; 7890 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7891 if (root->r.r_begin) { 7892 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7893 return; 7894 } 7895 7896 root->r.r_begin = TRUE; 7897 7898 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7899 } 7900 7901 /* ------------------------------------------------------------------------ */ 7902 7903 void __kmp_user_set_library(enum library_type arg) { 7904 int gtid; 7905 kmp_root_t *root; 7906 kmp_info_t *thread; 7907 7908 /* first, make sure we are initialized so we can get our gtid */ 7909 7910 gtid = __kmp_entry_gtid(); 7911 thread = __kmp_threads[gtid]; 7912 7913 root = thread->th.th_root; 7914 7915 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7916 library_serial)); 7917 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7918 thread */ 7919 KMP_WARNING(SetLibraryIncorrectCall); 7920 return; 7921 } 7922 7923 switch (arg) { 7924 case library_serial: 7925 thread->th.th_set_nproc = 0; 7926 set__nproc(thread, 1); 7927 break; 7928 case library_turnaround: 7929 thread->th.th_set_nproc = 0; 7930 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7931 : __kmp_dflt_team_nth_ub); 7932 break; 7933 case library_throughput: 7934 thread->th.th_set_nproc = 0; 7935 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7936 : __kmp_dflt_team_nth_ub); 7937 break; 7938 default: 7939 KMP_FATAL(UnknownLibraryType, arg); 7940 } 7941 7942 __kmp_aux_set_library(arg); 7943 } 7944 7945 void __kmp_aux_set_stacksize(size_t arg) { 7946 if (!__kmp_init_serial) 7947 __kmp_serial_initialize(); 7948 7949 #if KMP_OS_DARWIN 7950 if (arg & (0x1000 - 1)) { 7951 arg &= ~(0x1000 - 1); 7952 if (arg + 0x1000) /* check for overflow if we round up */ 7953 arg += 0x1000; 7954 } 7955 #endif 7956 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7957 7958 /* only change the default stacksize before the first parallel region */ 7959 if (!TCR_4(__kmp_init_parallel)) { 7960 size_t value = arg; /* argument is in bytes */ 7961 7962 if (value < __kmp_sys_min_stksize) 7963 value = __kmp_sys_min_stksize; 7964 else if (value > KMP_MAX_STKSIZE) 7965 value = KMP_MAX_STKSIZE; 7966 7967 __kmp_stksize = value; 7968 7969 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7970 } 7971 7972 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7973 } 7974 7975 /* set the behaviour of the runtime library */ 7976 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7977 void __kmp_aux_set_library(enum library_type arg) { 7978 __kmp_library = arg; 7979 7980 switch (__kmp_library) { 7981 case library_serial: { 7982 KMP_INFORM(LibraryIsSerial); 7983 } break; 7984 case library_turnaround: 7985 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 7986 __kmp_use_yield = 2; // only yield when oversubscribed 7987 break; 7988 case library_throughput: 7989 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 7990 __kmp_dflt_blocktime = 200; 7991 break; 7992 default: 7993 KMP_FATAL(UnknownLibraryType, arg); 7994 } 7995 } 7996 7997 /* Getting team information common for all team API */ 7998 // Returns NULL if not in teams construct 7999 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8000 kmp_info_t *thr = __kmp_entry_thread(); 8001 teams_serialized = 0; 8002 if (thr->th.th_teams_microtask) { 8003 kmp_team_t *team = thr->th.th_team; 8004 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8005 int ii = team->t.t_level; 8006 teams_serialized = team->t.t_serialized; 8007 int level = tlevel + 1; 8008 KMP_DEBUG_ASSERT(ii >= tlevel); 8009 while (ii > level) { 8010 for (teams_serialized = team->t.t_serialized; 8011 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8012 } 8013 if (team->t.t_serialized && (!teams_serialized)) { 8014 team = team->t.t_parent; 8015 continue; 8016 } 8017 if (ii > level) { 8018 team = team->t.t_parent; 8019 ii--; 8020 } 8021 } 8022 return team; 8023 } 8024 return NULL; 8025 } 8026 8027 int __kmp_aux_get_team_num() { 8028 int serialized; 8029 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8030 if (team) { 8031 if (serialized > 1) { 8032 return 0; // teams region is serialized ( 1 team of 1 thread ). 8033 } else { 8034 return team->t.t_master_tid; 8035 } 8036 } 8037 return 0; 8038 } 8039 8040 int __kmp_aux_get_num_teams() { 8041 int serialized; 8042 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8043 if (team) { 8044 if (serialized > 1) { 8045 return 1; 8046 } else { 8047 return team->t.t_parent->t.t_nproc; 8048 } 8049 } 8050 return 1; 8051 } 8052 8053 /* ------------------------------------------------------------------------ */ 8054 8055 /* 8056 * Affinity Format Parser 8057 * 8058 * Field is in form of: %[[[0].]size]type 8059 * % and type are required (%% means print a literal '%') 8060 * type is either single char or long name surrounded by {}, 8061 * e.g., N or {num_threads} 8062 * 0 => leading zeros 8063 * . => right justified when size is specified 8064 * by default output is left justified 8065 * size is the *minimum* field length 8066 * All other characters are printed as is 8067 * 8068 * Available field types: 8069 * L {thread_level} - omp_get_level() 8070 * n {thread_num} - omp_get_thread_num() 8071 * h {host} - name of host machine 8072 * P {process_id} - process id (integer) 8073 * T {thread_identifier} - native thread identifier (integer) 8074 * N {num_threads} - omp_get_num_threads() 8075 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8076 * a {thread_affinity} - comma separated list of integers or integer ranges 8077 * (values of affinity mask) 8078 * 8079 * Implementation-specific field types can be added 8080 * If a type is unknown, print "undefined" 8081 */ 8082 8083 // Structure holding the short name, long name, and corresponding data type 8084 // for snprintf. A table of these will represent the entire valid keyword 8085 // field types. 
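// Example (illustrative): given the table below, the format string
// "%0.4L host=%H" expands for a thread at nesting level 2 on a machine named
// "node17" to "0002 host=node17" -- 'L' selects the nesting level printed as
// a zero-padded 4-character integer, 'H' the host name printed as a string,
// and the remaining characters are copied verbatim.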
8086 typedef struct kmp_affinity_format_field_t { 8087 char short_name; // from spec e.g., L -> thread level 8088 const char *long_name; // from spec thread_level -> thread level 8089 char field_format; // data type for snprintf (typically 'd' or 's' 8090 // for integer or string) 8091 } kmp_affinity_format_field_t; 8092 8093 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8094 #if KMP_AFFINITY_SUPPORTED 8095 {'A', "thread_affinity", 's'}, 8096 #endif 8097 {'t', "team_num", 'd'}, 8098 {'T', "num_teams", 'd'}, 8099 {'L', "nesting_level", 'd'}, 8100 {'n', "thread_num", 'd'}, 8101 {'N', "num_threads", 'd'}, 8102 {'a', "ancestor_tnum", 'd'}, 8103 {'H', "host", 's'}, 8104 {'P', "process_id", 'd'}, 8105 {'i', "native_thread_id", 'd'}}; 8106 8107 // Return the number of characters it takes to hold field 8108 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8109 const char **ptr, 8110 kmp_str_buf_t *field_buffer) { 8111 int rc, format_index, field_value; 8112 const char *width_left, *width_right; 8113 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8114 static const int FORMAT_SIZE = 20; 8115 char format[FORMAT_SIZE] = {0}; 8116 char absolute_short_name = 0; 8117 8118 KMP_DEBUG_ASSERT(gtid >= 0); 8119 KMP_DEBUG_ASSERT(th); 8120 KMP_DEBUG_ASSERT(**ptr == '%'); 8121 KMP_DEBUG_ASSERT(field_buffer); 8122 8123 __kmp_str_buf_clear(field_buffer); 8124 8125 // Skip the initial % 8126 (*ptr)++; 8127 8128 // Check for %% first 8129 if (**ptr == '%') { 8130 __kmp_str_buf_cat(field_buffer, "%", 1); 8131 (*ptr)++; // skip over the second % 8132 return 1; 8133 } 8134 8135 // Parse field modifiers if they are present 8136 pad_zeros = false; 8137 if (**ptr == '0') { 8138 pad_zeros = true; 8139 (*ptr)++; // skip over 0 8140 } 8141 right_justify = false; 8142 if (**ptr == '.') { 8143 right_justify = true; 8144 (*ptr)++; // skip over . 8145 } 8146 // Parse width of field: [width_left, width_right) 8147 width_left = width_right = NULL; 8148 if (**ptr >= '0' && **ptr <= '9') { 8149 width_left = *ptr; 8150 SKIP_DIGITS(*ptr); 8151 width_right = *ptr; 8152 } 8153 8154 // Create the format for KMP_SNPRINTF based on flags parsed above 8155 format_index = 0; 8156 format[format_index++] = '%'; 8157 if (!right_justify) 8158 format[format_index++] = '-'; 8159 if (pad_zeros) 8160 format[format_index++] = '0'; 8161 if (width_left && width_right) { 8162 int i = 0; 8163 // Only allow 8 digit number widths. 
    // This also prevents overflowing the format variable.
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
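
// Worked example for the field parser above (illustrative only, not from the
// upstream source): given *ptr pointing at "%0.4{thread_num}" for a thread
// whose omp_get_thread_num() is 7, the parser consumes the modifiers '0' and
// '.', the width "4", and the long name {thread_num}; it builds the snprintf
// format "%04d", appends "0007" to field_buffer, returns 4, and leaves *ptr
// just past the closing '}'.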

/*
 * Return number of characters needed to hold the affinity string
 * (not including the null byte character).
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards.
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or a zero-length string, then we use
  // the affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Record that blocktime has been set explicitly (bt_set = TRUE) */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
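
// Usage sketch (illustrative, not part of the upstream source): a user-level
// call such as kmp_set_defaults("KMP_BLOCKTIME=200") is expected to reach
// __kmp_aux_set_defaults() above, which runs serial initialization if needed,
// feeds the string to __kmp_env_initialize(), and prints the effective
// settings when KMP_SETTINGS/OMP_DISPLAY_ENV style reporting is enabled. Note
// that the len argument is not used by this implementation. Similarly,
// blocktime set through the user API is clamped by __kmp_aux_set_blocktime()
// to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] for both the current team and the
// thread's serial team, with bt_set recording that it was set explicitly.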

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT ).
  // If ( reduce_data != NULL && reduce_func != NULL ): the tree-reduction
  // method can be selected by the RTL.
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by the RTL.
  // Finally, it's up to the OpenMP RTL to make a decision on which method to
  // select among those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

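    // Hypothetical illustration of the tuning below (not from the upstream
    // source): on these 32-bit targets a compiler-generated atomic method is
    // used only for very small reductions, e.g. "reduction(+ : a, b)" with two
    // variables selects atomic_reduce_block, while three or more variables (or
    // no atomic method) fall back to the critical-section default chosen above.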
    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}

// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}
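
// Illustrative note (the exact call chain is an assumption, not shown in this
// file): the OpenMP 5.0 pause API is expected to reach __kmp_pause_resource()
// below via __kmpc_pause_resource, e.g.
//   omp_pause_resource_all(omp_pause_soft);  // -> __kmp_soft_pause()
//   // ... a later parallel region wakes the soft-paused threads ...
//   omp_pause_resource_all(omp_pause_hard);  // -> __kmp_hard_pause(); the
//                                            // runtime re-initializes on the
//                                            // next OpenMP call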

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Globals and functions for hidden helper tasks
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization of all hidden helper threads: a
  // regular thread may push a hidden helper task to a hidden helper thread
  // that has not yet been awakened since the main thread released it after
  // creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
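
// Startup sequence sketch (illustrative summary, not part of the upstream
// source): __kmp_hidden_helper_threads_initz_routine() registers a dedicated
// root, sizes its team to __kmp_hidden_helper_threads_num, and forks
// __kmp_hidden_helper_wrapper_fn on every hidden helper thread. Each wrapper
// increments __kmp_hit_hidden_helper_threads_num and spins until all helpers
// have arrived, so no helper can be handed a task before it exists; the
// master helper then releases the initiating thread, waits for work, and
// signals the remaining helpers when tasks become available.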