/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;
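/* Editorial sketch (illustration only, not part of the runtime): the lookup in
   __kmp_get_global_thread_id() below identifies the caller by testing whether
   the address of a local variable lies inside a registered thread's stack.
   Conceptually, for a downward-growing stack with base `base` (high end) and
   size `size`:

     char probe;
     bool on_this_stack =
         ((char *)&probe <= base) && ((size_t)(base - (char *)&probe) <= size);

   The function applies this test to every slot of __kmp_threads and falls back
   to thread-local storage if no match is found. */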
/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }
  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}
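/* Editorial note (assumption, illustration only): a thread the library did not
   create, for example a plain pthread that enters an OpenMP region, reaches
   __kmp_get_global_thread_id_reg() above without a gtid. The KMP_GTID_DNE
   branch then registers it lazily, conceptually:

     if (gtid == KMP_GTID_DNE)
       gtid = __kmp_register_root(FALSE); // done under __kmp_initz_lock

   after which TLS and the stack search can resolve this thread normally. */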
"initial" : "actual"); 288 } 289 } 290 291 /* No point in checking ubermaster threads since they use refinement and 292 * cannot overlap */ 293 gtid = __kmp_gtid_from_thread(th); 294 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 295 KA_TRACE(10, 296 ("__kmp_check_stack_overlap: performing extensive checking\n")); 297 if (stack_beg == NULL) { 298 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 299 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 300 } 301 302 for (f = 0; f < __kmp_threads_capacity; f++) { 303 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 304 305 if (f_th && f_th != th) { 306 char *other_stack_end = 307 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 308 char *other_stack_beg = 309 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 310 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 311 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 312 313 /* Print the other stack values before the abort */ 314 if (__kmp_storage_map) 315 __kmp_print_storage_map_gtid( 316 -1, other_stack_beg, other_stack_end, 317 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 318 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 319 320 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 321 __kmp_msg_null); 322 } 323 } 324 } 325 } 326 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 327 } 328 329 /* ------------------------------------------------------------------------ */ 330 331 void __kmp_infinite_loop(void) { 332 static int done = FALSE; 333 334 while (!done) { 335 KMP_YIELD(TRUE); 336 } 337 } 338 339 #define MAX_MESSAGE 512 340 341 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 342 char const *format, ...) { 343 char buffer[MAX_MESSAGE]; 344 va_list ap; 345 346 va_start(ap, format); 347 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 348 p2, (unsigned long)size, format); 349 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 350 __kmp_vprintf(kmp_err, buffer, ap); 351 #if KMP_PRINT_DATA_PLACEMENT 352 int node; 353 if (gtid >= 0) { 354 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 355 if (__kmp_storage_map_verbose) { 356 node = __kmp_get_host_node(p1); 357 if (node < 0) /* doesn't work, so don't try this next time */ 358 __kmp_storage_map_verbose = FALSE; 359 else { 360 char *last; 361 int lastNode; 362 int localProc = __kmp_get_cpu_from_gtid(gtid); 363 364 const int page_size = KMP_GET_PAGE_SIZE(); 365 366 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 367 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 368 if (localProc >= 0) 369 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 370 localProc >> 1); 371 else 372 __kmp_printf_no_lock(" GTID %d\n", gtid); 373 #if KMP_USE_PRCTL 374 /* The more elaborate format is disabled for now because of the prctl 375 * hanging bug. */ 376 do { 377 last = p1; 378 lastNode = node; 379 /* This loop collates adjacent pages with the same host node. 
void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size,
                                  char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  va_start(ap, format);
  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1,
               p2, (unsigned long)size, format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
#if KMP_PRINT_DATA_PLACEMENT
  int node;
  if (gtid >= 0) {
    if (p1 <= p2 && (char *)p2 - (char *)p1 == size) {
      if (__kmp_storage_map_verbose) {
        node = __kmp_get_host_node(p1);
        if (node < 0) /* doesn't work, so don't try this next time */
          __kmp_storage_map_verbose = FALSE;
        else {
          char *last;
          int lastNode;
          int localProc = __kmp_get_cpu_from_gtid(gtid);

          const int page_size = KMP_GET_PAGE_SIZE();

          p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1));
          p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1));
          if (localProc >= 0)
            __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock(" GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = (char *)p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              p1 = (char *)p1 + page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS by default abort() causes pop-up error box, which stalls
       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for DLL, but it is a problem for
       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
       help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    __kmp_unregister_library();
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together.
*/ 465 466 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 467 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 468 gtid); 469 470 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 471 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 472 473 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 474 sizeof(kmp_local_t), "th_%d.th_local", gtid); 475 476 __kmp_print_storage_map_gtid( 477 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 478 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 479 480 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 481 &thr->th.th_bar[bs_plain_barrier + 1], 482 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 483 gtid); 484 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 486 &thr->th.th_bar[bs_forkjoin_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 488 gtid); 489 490 #if KMP_FAST_REDUCTION_BARRIER 491 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 492 &thr->th.th_bar[bs_reduction_barrier + 1], 493 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 494 gtid); 495 #endif // KMP_FAST_REDUCTION_BARRIER 496 } 497 498 /* Print out the storage map for the major kmp_team_t team data structures 499 that are allocated together. */ 500 501 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 502 int team_id, int num_thr) { 503 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 504 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 505 header, team_id); 506 507 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 508 &team->t.t_bar[bs_last_barrier], 509 sizeof(kmp_balign_team_t) * bs_last_barrier, 510 "%s_%d.t_bar", header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 513 &team->t.t_bar[bs_plain_barrier + 1], 514 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 515 header, team_id); 516 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 518 &team->t.t_bar[bs_forkjoin_barrier + 1], 519 sizeof(kmp_balign_team_t), 520 "%s_%d.t_bar[forkjoin]", header, team_id); 521 522 #if KMP_FAST_REDUCTION_BARRIER 523 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 524 &team->t.t_bar[bs_reduction_barrier + 1], 525 sizeof(kmp_balign_team_t), 526 "%s_%d.t_bar[reduction]", header, team_id); 527 #endif // KMP_FAST_REDUCTION_BARRIER 528 529 __kmp_print_storage_map_gtid( 530 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 531 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 532 533 __kmp_print_storage_map_gtid( 534 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 535 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 536 537 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 538 &team->t.t_disp_buffer[num_disp_buff], 539 sizeof(dispatch_shared_info_t) * num_disp_buff, 540 "%s_%d.t_disp_buffer", header, team_id); 541 } 542 543 static void __kmp_init_allocator() { 544 __kmp_init_memkind(); 545 __kmp_init_target_mem(); 546 } 547 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 548 549 /* ------------------------------------------------------------------------ */ 550 551 #if KMP_DYNAMIC_LIB 552 #if KMP_OS_WINDOWS 553 554 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 555 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 556 557 
switch (fdwReason) { 558 559 case DLL_PROCESS_ATTACH: 560 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 561 562 return TRUE; 563 564 case DLL_PROCESS_DETACH: 565 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 566 567 // According to Windows* documentation for DllMain entry point: 568 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 569 // lpReserved == NULL when FreeLibrary() is called, 570 // lpReserved != NULL when the process is terminated. 571 // When FreeLibrary() is called, worker threads remain alive. So the 572 // runtime's state is consistent and executing proper shutdown is OK. 573 // When the process is terminated, worker threads have exited or been 574 // forcefully terminated by the OS and only the shutdown thread remains. 575 // This can leave the runtime in an inconsistent state. 576 // Hence, only attempt proper cleanup when FreeLibrary() is called. 577 // Otherwise, rely on OS to reclaim resources. 578 if (lpReserved == NULL) 579 __kmp_internal_end_library(__kmp_gtid_get_specific()); 580 581 return TRUE; 582 583 case DLL_THREAD_ATTACH: 584 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 585 586 /* if we want to register new siblings all the time here call 587 * __kmp_get_gtid(); */ 588 return TRUE; 589 590 case DLL_THREAD_DETACH: 591 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 592 593 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 594 return TRUE; 595 } 596 597 return TRUE; 598 } 599 600 #endif /* KMP_OS_WINDOWS */ 601 #endif /* KMP_DYNAMIC_LIB */ 602 603 /* __kmp_parallel_deo -- Wait until it's our turn. */ 604 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 605 int gtid = *gtid_ref; 606 #ifdef BUILD_PARALLEL_ORDERED 607 kmp_team_t *team = __kmp_team_from_gtid(gtid); 608 #endif /* BUILD_PARALLEL_ORDERED */ 609 610 if (__kmp_env_consistency_check) { 611 if (__kmp_threads[gtid]->th.th_root->r.r_active) 612 #if KMP_USE_DYNAMIC_LOCK 613 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 614 #else 615 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 616 #endif 617 } 618 #ifdef BUILD_PARALLEL_ORDERED 619 if (!team->t.t_serialized) { 620 KMP_MB(); 621 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 622 NULL); 623 KMP_MB(); 624 } 625 #endif /* BUILD_PARALLEL_ORDERED */ 626 } 627 628 /* __kmp_parallel_dxo -- Signal the next task. */ 629 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 630 int gtid = *gtid_ref; 631 #ifdef BUILD_PARALLEL_ORDERED 632 int tid = __kmp_tid_from_gtid(gtid); 633 kmp_team_t *team = __kmp_team_from_gtid(gtid); 634 #endif /* BUILD_PARALLEL_ORDERED */ 635 636 if (__kmp_env_consistency_check) { 637 if (__kmp_threads[gtid]->th.th_root->r.r_active) 638 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 639 } 640 #ifdef BUILD_PARALLEL_ORDERED 641 if (!team->t.t_serialized) { 642 KMP_MB(); /* Flush all pending memory write invalidates. */ 643 644 /* use the tid of the next thread in this team */ 645 /* TODO replace with general release procedure */ 646 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 647 648 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 649 } 650 #endif /* BUILD_PARALLEL_ORDERED */ 651 } 652 653 /* ------------------------------------------------------------------------ */ 654 /* The BARRIER for a SINGLE process section is always explicit */ 655 656 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 657 int status; 658 kmp_info_t *th; 659 kmp_team_t *team; 660 661 if (!TCR_4(__kmp_init_parallel)) 662 __kmp_parallel_initialize(); 663 __kmp_resume_if_soft_paused(); 664 665 th = __kmp_threads[gtid]; 666 team = th->th.th_team; 667 status = 0; 668 669 th->th.th_ident = id_ref; 670 671 if (team->t.t_serialized) { 672 status = 1; 673 } else { 674 kmp_int32 old_this = th->th.th_local.this_construct; 675 676 ++th->th.th_local.this_construct; 677 /* try to set team count to thread count--success means thread got the 678 single block */ 679 /* TODO: Should this be acquire or release? */ 680 if (team->t.t_construct == old_this) { 681 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 682 th->th.th_local.this_construct); 683 } 684 #if USE_ITT_BUILD 685 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 686 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 687 team->t.t_active_level == 1) { 688 // Only report metadata by primary thread of active team at level 1 689 __kmp_itt_metadata_single(id_ref); 690 } 691 #endif /* USE_ITT_BUILD */ 692 } 693 694 if (__kmp_env_consistency_check) { 695 if (status && push_ws) { 696 __kmp_push_workshare(gtid, ct_psingle, id_ref); 697 } else { 698 __kmp_check_workshare(gtid, ct_psingle, id_ref); 699 } 700 } 701 #if USE_ITT_BUILD 702 if (status) { 703 __kmp_itt_single_start(gtid); 704 } 705 #endif /* USE_ITT_BUILD */ 706 return status; 707 } 708 709 void __kmp_exit_single(int gtid) { 710 #if USE_ITT_BUILD 711 __kmp_itt_single_end(gtid); 712 #endif /* USE_ITT_BUILD */ 713 if (__kmp_env_consistency_check) 714 __kmp_pop_workshare(gtid, ct_psingle, NULL); 715 } 716 717 /* determine if we can go parallel or must use a serialized parallel region and 718 * how many threads we can use 719 * set_nproc is the number of threads requested for the team 720 * returns 0 if we should serialize or only use one thread, 721 * otherwise the number of threads to use 722 * The forkjoin lock is held by the caller. */ 723 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 724 int master_tid, int set_nthreads, 725 int enter_teams) { 726 int capacity; 727 int new_nthreads; 728 KMP_DEBUG_ASSERT(__kmp_init_serial); 729 KMP_DEBUG_ASSERT(root && parent_team); 730 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 731 732 // If dyn-var is set, dynamically adjust the number of desired threads, 733 // according to the method specified by dynamic_mode. 734 new_nthreads = set_nthreads; 735 if (!get__dynamic_2(parent_team, master_tid)) { 736 ; 737 } 738 #ifdef USE_LOAD_BALANCE 739 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 740 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 741 if (new_nthreads == 1) { 742 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 743 "reservation to 1 thread\n", 744 master_tid)); 745 return 1; 746 } 747 if (new_nthreads < set_nthreads) { 748 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 749 "reservation to %d threads\n", 750 master_tid, new_nthreads)); 751 } 752 } 753 #endif /* USE_LOAD_BALANCE */ 754 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 755 new_nthreads = __kmp_avail_proc - __kmp_nth + 756 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 757 if (new_nthreads <= 1) { 758 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 759 "reservation to 1 thread\n", 760 master_tid)); 761 return 1; 762 } 763 if (new_nthreads < set_nthreads) { 764 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 765 "reservation to %d threads\n", 766 master_tid, new_nthreads)); 767 } else { 768 new_nthreads = set_nthreads; 769 } 770 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 771 if (set_nthreads > 2) { 772 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 773 new_nthreads = (new_nthreads % set_nthreads) + 1; 774 if (new_nthreads == 1) { 775 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 776 "reservation to 1 thread\n", 777 master_tid)); 778 return 1; 779 } 780 if (new_nthreads < set_nthreads) { 781 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 782 "reservation to %d threads\n", 783 master_tid, new_nthreads)); 784 } 785 } 786 } else { 787 KMP_ASSERT(0); 788 } 789 790 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 791 if (__kmp_nth + new_nthreads - 792 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 793 __kmp_max_nth) { 794 int tl_nthreads = __kmp_max_nth - __kmp_nth + 795 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 796 if (tl_nthreads <= 0) { 797 tl_nthreads = 1; 798 } 799 800 // If dyn-var is false, emit a 1-time warning. 801 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 802 __kmp_reserve_warn = 1; 803 __kmp_msg(kmp_ms_warning, 804 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 805 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 806 } 807 if (tl_nthreads == 1) { 808 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 809 "reduced reservation to 1 thread\n", 810 master_tid)); 811 return 1; 812 } 813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 814 "reservation to %d threads\n", 815 master_tid, tl_nthreads)); 816 new_nthreads = tl_nthreads; 817 } 818 819 // Respect OMP_THREAD_LIMIT 820 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 821 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 822 if (cg_nthreads + new_nthreads - 823 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 824 max_cg_threads) { 825 int tl_nthreads = max_cg_threads - cg_nthreads + 826 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 827 if (tl_nthreads <= 0) { 828 tl_nthreads = 1; 829 } 830 831 // If dyn-var is false, emit a 1-time warning. 832 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 833 __kmp_reserve_warn = 1; 834 __kmp_msg(kmp_ms_warning, 835 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 836 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 837 } 838 if (tl_nthreads == 1) { 839 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 840 "reduced reservation to 1 thread\n", 841 master_tid)); 842 return 1; 843 } 844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 845 "reservation to %d threads\n", 846 master_tid, tl_nthreads)); 847 new_nthreads = tl_nthreads; 848 } 849 850 // Check if the threads array is large enough, or needs expanding. 851 // See comment in __kmp_register_root() about the adjustment if 852 // __kmp_threads[0] == NULL. 853 capacity = __kmp_threads_capacity; 854 if (TCR_PTR(__kmp_threads[0]) == NULL) { 855 --capacity; 856 } 857 if (__kmp_nth + new_nthreads - 858 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc) > 859 capacity) { 860 // Expand the threads array. 861 int slotsRequired = __kmp_nth + new_nthreads - 862 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 863 capacity; 864 int slotsAdded = __kmp_expand_threads(slotsRequired); 865 if (slotsAdded < slotsRequired) { 866 // The threads array was not expanded enough. 867 new_nthreads -= (slotsRequired - slotsAdded); 868 KMP_ASSERT(new_nthreads >= 1); 869 870 // If dyn-var is false, emit a 1-time warning. 871 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 872 __kmp_reserve_warn = 1; 873 if (__kmp_tp_cached) { 874 __kmp_msg(kmp_ms_warning, 875 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 876 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 877 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 878 } else { 879 __kmp_msg(kmp_ms_warning, 880 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 881 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 882 } 883 } 884 } 885 } 886 887 #ifdef KMP_DEBUG 888 if (new_nthreads == 1) { 889 KC_TRACE(10, 890 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 891 "dead roots and rechecking; requested %d threads\n", 892 __kmp_get_gtid(), set_nthreads)); 893 } else { 894 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 895 " %d threads\n", 896 __kmp_get_gtid(), new_nthreads, set_nthreads)); 897 } 898 #endif // KMP_DEBUG 899 return new_nthreads; 900 } 901 902 /* Allocate threads from the thread pool and assign them to the new team. We are 903 assured that there are enough threads available, because we checked on that 904 earlier within critical section forkjoin */ 905 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 906 kmp_info_t *master_th, int master_gtid) { 907 int i; 908 int use_hot_team; 909 910 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 911 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 912 KMP_MB(); 913 914 /* first, let's setup the primary thread */ 915 master_th->th.th_info.ds.ds_tid = 0; 916 master_th->th.th_team = team; 917 master_th->th.th_team_nproc = team->t.t_nproc; 918 master_th->th.th_team_master = master_th; 919 master_th->th.th_team_serialized = FALSE; 920 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 921 922 /* make sure we are not the optimized hot team */ 923 #if KMP_NESTED_HOT_TEAMS 924 use_hot_team = 0; 925 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 926 if (hot_teams) { // hot teams array is not allocated if 927 // KMP_HOT_TEAMS_MAX_LEVEL=0 928 int level = team->t.t_active_level - 1; // index in array of hot teams 929 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
930 if (master_th->th.th_teams_size.nteams > 1) { 931 ++level; // level was not increased in teams construct for 932 // team_of_masters 933 } 934 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 935 master_th->th.th_teams_level == team->t.t_level) { 936 ++level; // level was not increased in teams construct for 937 // team_of_workers before the parallel 938 } // team->t.t_level will be increased inside parallel 939 } 940 if (level < __kmp_hot_teams_max_level) { 941 if (hot_teams[level].hot_team) { 942 // hot team has already been allocated for given level 943 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 944 use_hot_team = 1; // the team is ready to use 945 } else { 946 use_hot_team = 0; // AC: threads are not allocated yet 947 hot_teams[level].hot_team = team; // remember new hot team 948 hot_teams[level].hot_team_nth = team->t.t_nproc; 949 } 950 } else { 951 use_hot_team = 0; 952 } 953 } 954 #else 955 use_hot_team = team == root->r.r_hot_team; 956 #endif 957 if (!use_hot_team) { 958 959 /* install the primary thread */ 960 team->t.t_threads[0] = master_th; 961 __kmp_initialize_info(master_th, team, 0, master_gtid); 962 963 /* now, install the worker threads */ 964 for (i = 1; i < team->t.t_nproc; i++) { 965 966 /* fork or reallocate a new thread and install it in team */ 967 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 968 team->t.t_threads[i] = thr; 969 KMP_DEBUG_ASSERT(thr); 970 KMP_DEBUG_ASSERT(thr->th.th_team == team); 971 /* align team and thread arrived states */ 972 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 973 "T#%d(%d:%d) join =%llu, plain=%llu\n", 974 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 975 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 976 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 977 team->t.t_bar[bs_plain_barrier].b_arrived)); 978 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 979 thr->th.th_teams_level = master_th->th.th_teams_level; 980 thr->th.th_teams_size = master_th->th.th_teams_size; 981 { // Initialize threads' barrier data. 982 int b; 983 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 984 for (b = 0; b < bs_last_barrier; ++b) { 985 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 986 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 987 #if USE_DEBUGGER 988 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 989 #endif 990 } 991 } 992 } 993 994 #if KMP_AFFINITY_SUPPORTED 995 __kmp_partition_places(team); 996 #endif 997 } 998 999 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1000 for (i = 0; i < team->t.t_nproc; i++) { 1001 kmp_info_t *thr = team->t.t_threads[i]; 1002 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1003 thr->th.th_prev_level != team->t.t_level) { 1004 team->t.t_display_affinity = 1; 1005 break; 1006 } 1007 } 1008 } 1009 1010 KMP_MB(); 1011 } 1012 1013 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1014 // Propagate any changes to the floating point control registers out to the team 1015 // We try to avoid unnecessary writes to the relevant cache line in the team 1016 // structure, so we don't make changes unless they are needed. 
1017 inline static void propagateFPControl(kmp_team_t *team) { 1018 if (__kmp_inherit_fp_control) { 1019 kmp_int16 x87_fpu_control_word; 1020 kmp_uint32 mxcsr; 1021 1022 // Get primary thread's values of FPU control flags (both X87 and vector) 1023 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1024 __kmp_store_mxcsr(&mxcsr); 1025 mxcsr &= KMP_X86_MXCSR_MASK; 1026 1027 // There is no point looking at t_fp_control_saved here. 1028 // If it is TRUE, we still have to update the values if they are different 1029 // from those we now have. If it is FALSE we didn't save anything yet, but 1030 // our objective is the same. We have to ensure that the values in the team 1031 // are the same as those we have. 1032 // So, this code achieves what we need whether or not t_fp_control_saved is 1033 // true. By checking whether the value needs updating we avoid unnecessary 1034 // writes that would put the cache-line into a written state, causing all 1035 // threads in the team to have to read it again. 1036 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1037 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1038 // Although we don't use this value, other code in the runtime wants to know 1039 // whether it should restore them. So we must ensure it is correct. 1040 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1041 } else { 1042 // Similarly here. Don't write to this cache-line in the team structure 1043 // unless we have to. 1044 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1045 } 1046 } 1047 1048 // Do the opposite, setting the hardware registers to the updated values from 1049 // the team. 1050 inline static void updateHWFPControl(kmp_team_t *team) { 1051 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1052 // Only reset the fp control regs if they have been changed in the team. 1053 // the parallel region that we are exiting. 1054 kmp_int16 x87_fpu_control_word; 1055 kmp_uint32 mxcsr; 1056 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1057 __kmp_store_mxcsr(&mxcsr); 1058 mxcsr &= KMP_X86_MXCSR_MASK; 1059 1060 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1061 __kmp_clear_x87_fpu_status_word(); 1062 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1063 } 1064 1065 if (team->t.t_mxcsr != mxcsr) { 1066 __kmp_load_mxcsr(&team->t.t_mxcsr); 1067 } 1068 } 1069 } 1070 #else 1071 #define propagateFPControl(x) ((void)0) 1072 #define updateHWFPControl(x) ((void)0) 1073 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1074 1075 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1076 int realloc); // forward declaration 1077 1078 /* Run a parallel region that has been serialized, so runs only in a team of the 1079 single primary thread. 
*/
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  ompt_data_t *implicit_task_data;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1177 global_tid, serial_team)); 1178 1179 /* TODO the above breaks the requirement that if we run out of resources, 1180 then we can still guarantee that serialized teams are ok, since we may 1181 need to allocate a new one */ 1182 } else { 1183 KF_TRACE( 1184 10, 1185 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1186 global_tid, serial_team)); 1187 } 1188 1189 /* we have to initialize this serial team */ 1190 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1191 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1192 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1193 serial_team->t.t_ident = loc; 1194 serial_team->t.t_serialized = 1; 1195 serial_team->t.t_nproc = 1; 1196 serial_team->t.t_parent = this_thr->th.th_team; 1197 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1198 this_thr->th.th_team = serial_team; 1199 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1200 1201 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1202 this_thr->th.th_current_task)); 1203 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1204 this_thr->th.th_current_task->td_flags.executing = 0; 1205 1206 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1207 1208 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1209 implicit task for each serialized task represented by 1210 team->t.t_serialized? */ 1211 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1212 &this_thr->th.th_current_task->td_parent->td_icvs); 1213 1214 // Thread value exists in the nested nthreads array for the next nested 1215 // level 1216 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1217 this_thr->th.th_current_task->td_icvs.nproc = 1218 __kmp_nested_nth.nth[level + 1]; 1219 } 1220 1221 if (__kmp_nested_proc_bind.used && 1222 (level + 1 < __kmp_nested_proc_bind.used)) { 1223 this_thr->th.th_current_task->td_icvs.proc_bind = 1224 __kmp_nested_proc_bind.bind_types[level + 1]; 1225 } 1226 1227 #if USE_DEBUGGER 1228 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped

    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing.
*/ 1369 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1370 /* These 2 lines below are so this does not get optimized out */ 1371 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1372 __kmp_stkpadding += (short)((kmp_int64)dummy); 1373 } 1374 1375 /* initialize if needed */ 1376 KMP_DEBUG_ASSERT( 1377 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1378 if (!TCR_4(__kmp_init_parallel)) 1379 __kmp_parallel_initialize(); 1380 __kmp_resume_if_soft_paused(); 1381 1382 /* setup current data */ 1383 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1384 // shutdown 1385 parent_team = master_th->th.th_team; 1386 master_tid = master_th->th.th_info.ds.ds_tid; 1387 master_this_cons = master_th->th.th_local.this_construct; 1388 root = master_th->th.th_root; 1389 master_active = root->r.r_active; 1390 master_set_numthreads = master_th->th.th_set_nproc; 1391 1392 #if OMPT_SUPPORT 1393 ompt_data_t ompt_parallel_data = ompt_data_none; 1394 ompt_data_t *parent_task_data; 1395 ompt_frame_t *ompt_frame; 1396 ompt_data_t *implicit_task_data; 1397 void *return_address = NULL; 1398 1399 if (ompt_enabled.enabled) { 1400 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1401 NULL, NULL); 1402 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1403 } 1404 #endif 1405 1406 // Nested level will be an index in the nested nthreads array 1407 level = parent_team->t.t_level; 1408 // used to launch non-serial teams even if nested is not allowed 1409 active_level = parent_team->t.t_active_level; 1410 // needed to check nesting inside the teams 1411 teams_level = master_th->th.th_teams_level; 1412 #if KMP_NESTED_HOT_TEAMS 1413 p_hot_teams = &master_th->th.th_hot_teams; 1414 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1415 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1416 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1417 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1418 // it is either actual or not needed (when active_level > 0) 1419 (*p_hot_teams)[0].hot_team_nth = 1; 1420 } 1421 #endif 1422 1423 #if OMPT_SUPPORT 1424 if (ompt_enabled.enabled) { 1425 if (ompt_enabled.ompt_callback_parallel_begin) { 1426 int team_size = master_set_numthreads 1427 ? master_set_numthreads 1428 : get__nproc_2(parent_team, master_tid); 1429 int flags = OMPT_INVOKER(call_context) | 1430 ((microtask == (microtask_t)__kmp_teams_master) 1431 ? ompt_parallel_league 1432 : ompt_parallel_team); 1433 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1434 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1435 return_address); 1436 } 1437 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1438 } 1439 #endif 1440 1441 master_th->th.th_ident = loc; 1442 1443 if (master_th->th.th_teams_microtask && ap && 1444 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1445 // AC: This is start of parallel that is nested inside teams construct. 1446 // The team is actual (hot), all workers are ready at the fork barrier. 1447 // No lock needed to initialize the team a bit, then free workers. 
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but not increase the serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

        if (call_context == fork_context_gnu) {
          // AC: need to decrement t_serialized for enquiry functions to work
          // correctly, will restore at join time
          parent_team->t.t_serialized--;
          return TRUE;
        }

#if OMPT_SUPPORT
        void *dummy;
        void **exit_frame_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_frame_p = &dummy;
        }
#endif
        // AC: need to decrement t_serialized for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_frame_p
#endif
          );
        }

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          *exit_frame_p = NULL;
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }
          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
          __ompt_lw_taskteam_unlink(master_th);
          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context) | ompt_parallel_team,
                return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_lw_taskteam_t lw_taskteam;
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
&ompt_parallel_data, return_address); 1548 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1549 } 1550 #endif 1551 1552 /* Change number of threads in the team if requested */ 1553 if (master_set_numthreads) { // The parallel has num_threads clause 1554 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1555 // AC: only can reduce number of threads dynamically, can't increase 1556 kmp_info_t **other_threads = parent_team->t.t_threads; 1557 parent_team->t.t_nproc = master_set_numthreads; 1558 for (i = 0; i < master_set_numthreads; ++i) { 1559 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1560 } 1561 // Keep extra threads hot in the team for possible next parallels 1562 } 1563 master_th->th.th_set_nproc = 0; 1564 } 1565 1566 #if USE_DEBUGGER 1567 if (__kmp_debugging) { // Let debugger override number of threads. 1568 int nth = __kmp_omp_num_threads(loc); 1569 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1570 master_set_numthreads = nth; 1571 } 1572 } 1573 #endif 1574 1575 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1576 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1577 KMP_ITT_DEBUG) && 1578 __kmp_forkjoin_frames_mode == 3 && 1579 parent_team->t.t_active_level == 1 // only report frames at level 1 1580 && master_th->th.th_teams_size.nteams == 1) { 1581 kmp_uint64 tmp_time = __itt_get_timestamp(); 1582 master_th->th.th_frame_time = tmp_time; 1583 parent_team->t.t_region_time = tmp_time; 1584 } 1585 if (__itt_stack_caller_create_ptr) { 1586 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1587 // create new stack stitching id before entering fork barrier 1588 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1589 } 1590 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1591 1592 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1593 "master_th=%p, gtid=%d\n", 1594 root, parent_team, master_th, gtid)); 1595 __kmp_internal_fork(loc, gtid, parent_team); 1596 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1597 "master_th=%p, gtid=%d\n", 1598 root, parent_team, master_th, gtid)); 1599 1600 if (call_context == fork_context_gnu) 1601 return TRUE; 1602 1603 /* Invoke microtask for PRIMARY thread */ 1604 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1605 parent_team->t.t_id, parent_team->t.t_pkfn)); 1606 1607 if (!parent_team->t.t_invoke(gtid)) { 1608 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1609 } 1610 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1611 parent_team->t.t_id, parent_team->t.t_pkfn)); 1612 KMP_MB(); /* Flush all pending memory write invalidates. */ 1613 1614 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1615 1616 return TRUE; 1617 } // Parallel closely nested in teams construct 1618 1619 #if KMP_DEBUG 1620 if (__kmp_tasking_mode != tskm_immediate_exec) { 1621 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1622 parent_team->t.t_task_team[master_th->th.th_task_state]); 1623 } 1624 #endif 1625 1626 int enter_teams = 0; 1627 if (parent_team->t.t_active_level >= 1628 master_th->th.th_current_task->td_icvs.max_active_levels) { 1629 nthreads = 1; 1630 } else { 1631 enter_teams = ((ap == NULL && active_level == 0) || 1632 (ap && teams_level > 0 && teams_level == level)); 1633 nthreads = 1634 master_set_numthreads 1635 ? 
master_set_numthreads 1636 : get__nproc_2( 1637 parent_team, 1638 master_tid); // TODO: get nproc directly from current task 1639 1640 // Check if we need to take forkjoin lock? (no need for serialized 1641 // parallel out of teams construct). This code moved here from 1642 // __kmp_reserve_threads() to speedup nested serialized parallels. 1643 if (nthreads > 1) { 1644 if ((get__max_active_levels(master_th) == 1 && 1645 (root->r.r_in_parallel && !enter_teams)) || 1646 (__kmp_library == library_serial)) { 1647 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1648 " threads\n", 1649 gtid, nthreads)); 1650 nthreads = 1; 1651 } 1652 } 1653 if (nthreads > 1) { 1654 /* determine how many new threads we can use */ 1655 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1656 /* AC: If we execute teams from parallel region (on host), then teams 1657 should be created but each can only have 1 thread if nesting is 1658 disabled. If teams called from serial region, then teams and their 1659 threads should be created regardless of the nesting setting. */ 1660 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1661 nthreads, enter_teams); 1662 if (nthreads == 1) { 1663 // Free lock for single thread execution here; for multi-thread 1664 // execution it will be freed later after team of threads created 1665 // and initialized 1666 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1667 } 1668 } 1669 } 1670 KMP_DEBUG_ASSERT(nthreads > 0); 1671 1672 // If we temporarily changed the set number of threads then restore it now 1673 master_th->th.th_set_nproc = 0; 1674 1675 /* create a serialized parallel region? */ 1676 if (nthreads == 1) { 1677 /* josh todo: hypothetical question: what do we do for OS X*? */ 1678 #if KMP_OS_LINUX && \ 1679 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1680 void *args[argc]; 1681 #else 1682 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1683 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1684 KMP_ARCH_AARCH64) */ 1685 1686 KA_TRACE(20, 1687 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1688 1689 __kmpc_serialized_parallel(loc, gtid); 1690 1691 if (call_context == fork_context_intel) { 1692 /* TODO this sucks, use the compiler itself to pass args! :) */ 1693 master_th->th.th_serial_team->t.t_ident = loc; 1694 if (!ap) { 1695 // revert change made in __kmpc_serialized_parallel() 1696 master_th->th.th_serial_team->t.t_level--; 1697 // Get args from parent team for teams construct 1698 1699 #if OMPT_SUPPORT 1700 void *dummy; 1701 void **exit_frame_p; 1702 ompt_task_info_t *task_info; 1703 1704 ompt_lw_taskteam_t lw_taskteam; 1705 1706 if (ompt_enabled.enabled) { 1707 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1708 &ompt_parallel_data, return_address); 1709 1710 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1711 // don't use lw_taskteam after linking. 
content was swaped 1712 1713 task_info = OMPT_CUR_TASK_INFO(master_th); 1714 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1715 if (ompt_enabled.ompt_callback_implicit_task) { 1716 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1717 __kmp_tid_from_gtid(gtid); 1718 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1719 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1720 &(task_info->task_data), 1, 1721 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1722 ompt_task_implicit); 1723 } 1724 1725 /* OMPT state */ 1726 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1727 } else { 1728 exit_frame_p = &dummy; 1729 } 1730 #endif 1731 1732 { 1733 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1734 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1735 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1736 parent_team->t.t_argv 1737 #if OMPT_SUPPORT 1738 , 1739 exit_frame_p 1740 #endif 1741 ); 1742 } 1743 1744 #if OMPT_SUPPORT 1745 if (ompt_enabled.enabled) { 1746 *exit_frame_p = NULL; 1747 if (ompt_enabled.ompt_callback_implicit_task) { 1748 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1749 ompt_scope_end, NULL, &(task_info->task_data), 1, 1750 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1751 ompt_task_implicit); 1752 } 1753 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1754 __ompt_lw_taskteam_unlink(master_th); 1755 if (ompt_enabled.ompt_callback_parallel_end) { 1756 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1757 &ompt_parallel_data, parent_task_data, 1758 OMPT_INVOKER(call_context) | ompt_parallel_team, 1759 return_address); 1760 } 1761 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1762 } 1763 #endif 1764 } else if (microtask == (microtask_t)__kmp_teams_master) { 1765 KMP_DEBUG_ASSERT(master_th->th.th_team == 1766 master_th->th.th_serial_team); 1767 team = master_th->th.th_team; 1768 // team->t.t_pkfn = microtask; 1769 team->t.t_invoke = invoker; 1770 __kmp_alloc_argv_entries(argc, team, TRUE); 1771 team->t.t_argc = argc; 1772 argv = (void **)team->t.t_argv; 1773 if (ap) { 1774 for (i = argc - 1; i >= 0; --i) 1775 *argv++ = va_arg(kmp_va_deref(ap), void *); 1776 } else { 1777 for (i = 0; i < argc; ++i) 1778 // Get args from parent team for teams construct 1779 argv[i] = parent_team->t.t_argv[i]; 1780 } 1781 // AC: revert change made in __kmpc_serialized_parallel() 1782 // because initial code in teams should have level=0 1783 team->t.t_level--; 1784 // AC: call special invoker for outer "parallel" of teams construct 1785 invoker(gtid); 1786 #if OMPT_SUPPORT 1787 if (ompt_enabled.enabled) { 1788 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1789 if (ompt_enabled.ompt_callback_implicit_task) { 1790 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1791 ompt_scope_end, NULL, &(task_info->task_data), 0, 1792 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1793 } 1794 if (ompt_enabled.ompt_callback_parallel_end) { 1795 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1796 &ompt_parallel_data, parent_task_data, 1797 OMPT_INVOKER(call_context) | ompt_parallel_league, 1798 return_address); 1799 } 1800 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1801 } 1802 #endif 1803 } else { 1804 argv = args; 1805 for (i = argc - 1; i >= 0; --i) 1806 *argv++ = va_arg(kmp_va_deref(ap), void *); 1807 KMP_MB(); 1808 1809 #if OMPT_SUPPORT 1810 void *dummy; 1811 void **exit_frame_p; 1812 ompt_task_info_t *task_info; 1813 1814 ompt_lw_taskteam_t lw_taskteam; 1815 1816 if (ompt_enabled.enabled) { 1817 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
1818 &ompt_parallel_data, return_address);
1819 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
1820 // don't use lw_taskteam after linking. content was swapped
1821 task_info = OMPT_CUR_TASK_INFO(master_th);
1822 exit_frame_p = &(task_info->frame.exit_frame.ptr);
1823
1824 /* OMPT implicit task begin */
1825 implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
1826 if (ompt_enabled.ompt_callback_implicit_task) {
1827 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1828 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
1829 implicit_task_data, 1, __kmp_tid_from_gtid(gtid),
1830 ompt_task_implicit);
1831 OMPT_CUR_TASK_INFO(master_th)->thread_num =
1832 __kmp_tid_from_gtid(gtid);
1833 }
1834
1835 /* OMPT state */
1836 master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
1837 } else {
1838 exit_frame_p = &dummy;
1839 }
1840 #endif
1841
1842 {
1843 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
1844 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
1845 __kmp_invoke_microtask(microtask, gtid, 0, argc, args
1846 #if OMPT_SUPPORT
1847 ,
1848 exit_frame_p
1849 #endif
1850 );
1851 }
1852
1853 #if OMPT_SUPPORT
1854 if (ompt_enabled.enabled) {
1855 *exit_frame_p = NULL;
1856 if (ompt_enabled.ompt_callback_implicit_task) {
1857 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
1858 ompt_scope_end, NULL, &(task_info->task_data), 1,
1859 OMPT_CUR_TASK_INFO(master_th)->thread_num,
1860 ompt_task_implicit);
1861 }
1862
1863 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
1864 __ompt_lw_taskteam_unlink(master_th);
1865 if (ompt_enabled.ompt_callback_parallel_end) {
1866 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
1867 &ompt_parallel_data, parent_task_data,
1868 OMPT_INVOKER(call_context) | ompt_parallel_team,
1869 return_address);
1870 }
1871 master_th->th.ompt_thread_info.state = ompt_state_overhead;
1872 }
1873 #endif
1874 }
1875 } else if (call_context == fork_context_gnu) {
1876 #if OMPT_SUPPORT
1877 ompt_lw_taskteam_t lwt;
1878 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data,
1879 return_address);
1880
1881 lwt.ompt_task_info.frame.exit_frame = ompt_data_none;
1882 __ompt_lw_taskteam_link(&lwt, master_th, 1);
1883 // don't use lw_taskteam after linking.
content was swapped
1884 #endif
1885
1886 // we were called from GNU native code
1887 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1888 return FALSE;
1889 } else {
1890 KMP_ASSERT2(call_context < fork_context_last,
1891 "__kmp_fork_call: unknown fork_context parameter");
1892 }
1893
1894 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
1895 KMP_MB();
1896 return FALSE;
1897 } // if (nthreads == 1)
1898
1899 // GEH: only modify the executing flag in the case when not serialized;
1900 // the serialized case is handled in kmpc_serialized_parallel
1901 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
1902 "curtask=%p, curtask_max_aclevel=%d\n",
1903 parent_team->t.t_active_level, master_th,
1904 master_th->th.th_current_task,
1905 master_th->th.th_current_task->td_icvs.max_active_levels));
1906 // TODO: GEH - cannot do this assertion because root thread not set up as
1907 // executing
1908 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
1909 master_th->th.th_current_task->td_flags.executing = 0;
1910
1911 if (!master_th->th.th_teams_microtask || level > teams_level) {
1912 /* Increment our nested depth level */
1913 KMP_ATOMIC_INC(&root->r.r_in_parallel);
1914 }
1915
1916 // See if we need to make a copy of the ICVs.
1917 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
1918 if ((level + 1 < __kmp_nested_nth.used) &&
1919 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
1920 nthreads_icv = __kmp_nested_nth.nth[level + 1];
1921 } else {
1922 nthreads_icv = 0; // don't update
1923 }
1924
1925 // Figure out the proc_bind_policy for the new team.
1926 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
1927 kmp_proc_bind_t proc_bind_icv =
1928 proc_bind_default; // proc_bind_default means don't update
1929 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
1930 proc_bind = proc_bind_false;
1931 } else {
1932 if (proc_bind == proc_bind_default) {
1933 // No proc_bind clause specified; use current proc-bind-var for this
1934 // parallel region
1935 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
1936 }
1937 /* else: The proc_bind policy was specified explicitly on the parallel
1938 clause. This overrides proc-bind-var for this parallel region, but does
1939 not change proc-bind-var. */
1940 // Figure the value of proc-bind-var for the child threads.
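// (If the nested proc-bind list (__kmp_nested_proc_bind) has an entry for
// the next nesting level that differs from the current proc-bind-var, that
// entry becomes the proc-bind ICV handed to the child threads; otherwise
// proc_bind_icv stays proc_bind_default and the children simply inherit the
// parent's setting.)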
1941 if ((level + 1 < __kmp_nested_proc_bind.used) && 1942 (__kmp_nested_proc_bind.bind_types[level + 1] != 1943 master_th->th.th_current_task->td_icvs.proc_bind)) { 1944 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1945 } 1946 } 1947 1948 // Reset for next parallel region 1949 master_th->th.th_set_proc_bind = proc_bind_default; 1950 1951 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1952 kmp_internal_control_t new_icvs; 1953 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1954 new_icvs.next = NULL; 1955 if (nthreads_icv > 0) { 1956 new_icvs.nproc = nthreads_icv; 1957 } 1958 if (proc_bind_icv != proc_bind_default) { 1959 new_icvs.proc_bind = proc_bind_icv; 1960 } 1961 1962 /* allocate a new parallel team */ 1963 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1964 team = __kmp_allocate_team(root, nthreads, nthreads, 1965 #if OMPT_SUPPORT 1966 ompt_parallel_data, 1967 #endif 1968 proc_bind, &new_icvs, 1969 argc USE_NESTED_HOT_ARG(master_th)); 1970 } else { 1971 /* allocate a new parallel team */ 1972 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1973 team = __kmp_allocate_team(root, nthreads, nthreads, 1974 #if OMPT_SUPPORT 1975 ompt_parallel_data, 1976 #endif 1977 proc_bind, 1978 &master_th->th.th_current_task->td_icvs, 1979 argc USE_NESTED_HOT_ARG(master_th)); 1980 } 1981 KF_TRACE( 1982 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 1983 1984 /* setup the new team */ 1985 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 1986 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 1987 KMP_CHECK_UPDATE(team->t.t_ident, loc); 1988 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 1989 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 1990 #if OMPT_SUPPORT 1991 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 1992 return_address); 1993 #endif 1994 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 1995 // TODO: parent_team->t.t_level == INT_MAX ??? 1996 if (!master_th->th.th_teams_microtask || level > teams_level) { 1997 int new_level = parent_team->t.t_level + 1; 1998 KMP_CHECK_UPDATE(team->t.t_level, new_level); 1999 new_level = parent_team->t.t_active_level + 1; 2000 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2001 } else { 2002 // AC: Do not increase parallel level at start of the teams construct 2003 int new_level = parent_team->t.t_level; 2004 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2005 new_level = parent_team->t.t_active_level; 2006 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2007 } 2008 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2009 // set primary thread's schedule as new run-time schedule 2010 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2011 2012 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2013 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2014 2015 // Update the floating point rounding in the team if required. 2016 propagateFPControl(team); 2017 2018 if (__kmp_tasking_mode != tskm_immediate_exec) { 2019 // Set primary thread's task team to team's task team. Unless this is hot 2020 // team, it should be NULL. 
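// When the primary thread is already inside an active region or already has
// a task team, the code below pushes its th_task_state onto the memo stack
// (doubling the stack when full) so it can be restored at join time; for a
// nested hot team the previously saved nested state is reused, otherwise
// th_task_state is reset to 0.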
2021 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2022 parent_team->t.t_task_team[master_th->th.th_task_state]); 2023 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2024 "%p, new task_team %p / team %p\n", 2025 __kmp_gtid_from_thread(master_th), 2026 master_th->th.th_task_team, parent_team, 2027 team->t.t_task_team[master_th->th.th_task_state], team)); 2028 2029 if (active_level || master_th->th.th_task_team) { 2030 // Take a memo of primary thread's task_state 2031 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2032 if (master_th->th.th_task_state_top >= 2033 master_th->th.th_task_state_stack_sz) { // increase size 2034 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2035 kmp_uint8 *old_stack, *new_stack; 2036 kmp_uint32 i; 2037 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2038 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2039 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2040 } 2041 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2042 ++i) { // zero-init rest of stack 2043 new_stack[i] = 0; 2044 } 2045 old_stack = master_th->th.th_task_state_memo_stack; 2046 master_th->th.th_task_state_memo_stack = new_stack; 2047 master_th->th.th_task_state_stack_sz = new_size; 2048 __kmp_free(old_stack); 2049 } 2050 // Store primary thread's task_state on stack 2051 master_th->th 2052 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2053 master_th->th.th_task_state; 2054 master_th->th.th_task_state_top++; 2055 #if KMP_NESTED_HOT_TEAMS 2056 if (master_th->th.th_hot_teams && 2057 active_level < __kmp_hot_teams_max_level && 2058 team == master_th->th.th_hot_teams[active_level].hot_team) { 2059 // Restore primary thread's nested state if nested hot team 2060 master_th->th.th_task_state = 2061 master_th->th 2062 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2063 } else { 2064 #endif 2065 master_th->th.th_task_state = 0; 2066 #if KMP_NESTED_HOT_TEAMS 2067 } 2068 #endif 2069 } 2070 #if !KMP_NESTED_HOT_TEAMS 2071 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2072 (team == root->r.r_hot_team)); 2073 #endif 2074 } 2075 2076 KA_TRACE( 2077 20, 2078 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2079 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2080 team->t.t_nproc)); 2081 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2082 (team->t.t_master_tid == 0 && 2083 (team->t.t_parent == root->r.r_root_team || 2084 team->t.t_parent->t.t_serialized))); 2085 KMP_MB(); 2086 2087 /* now, setup the arguments */ 2088 argv = (void **)team->t.t_argv; 2089 if (ap) { 2090 for (i = argc - 1; i >= 0; --i) { 2091 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2092 KMP_CHECK_UPDATE(*argv, new_argv); 2093 argv++; 2094 } 2095 } else { 2096 for (i = 0; i < argc; ++i) { 2097 // Get args from parent team for teams construct 2098 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2099 } 2100 } 2101 2102 /* now actually fork the threads */ 2103 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2104 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2105 root->r.r_active = TRUE; 2106 2107 __kmp_fork_team_threads(root, team, master_th, gtid); 2108 __kmp_setup_icv_copy(team, nthreads, 2109 &master_th->th.th_current_task->td_icvs, loc); 2110 2111 #if OMPT_SUPPORT 2112 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2113 #endif 2114 2115 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2116 2117 #if 
USE_ITT_BUILD 2118 if (team->t.t_active_level == 1 // only report frames at level 1 2119 && !master_th->th.th_teams_microtask) { // not in teams construct 2120 #if USE_ITT_NOTIFY 2121 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2122 (__kmp_forkjoin_frames_mode == 3 || 2123 __kmp_forkjoin_frames_mode == 1)) { 2124 kmp_uint64 tmp_time = 0; 2125 if (__itt_get_timestamp_ptr) 2126 tmp_time = __itt_get_timestamp(); 2127 // Internal fork - report frame begin 2128 master_th->th.th_frame_time = tmp_time; 2129 if (__kmp_forkjoin_frames_mode == 3) 2130 team->t.t_region_time = tmp_time; 2131 } else 2132 // only one notification scheme (either "submit" or "forking/joined", not both) 2133 #endif /* USE_ITT_NOTIFY */ 2134 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2135 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2136 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2137 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2138 } 2139 } 2140 #endif /* USE_ITT_BUILD */ 2141 2142 /* now go on and do the work */ 2143 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2144 KMP_MB(); 2145 KF_TRACE(10, 2146 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2147 root, team, master_th, gtid)); 2148 2149 #if USE_ITT_BUILD 2150 if (__itt_stack_caller_create_ptr) { 2151 // create new stack stitching id before entering fork barrier 2152 if (!enter_teams) { 2153 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2154 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2155 } else if (parent_team->t.t_serialized) { 2156 // keep stack stitching id in the serialized parent_team; 2157 // current team will be used for parallel inside the teams; 2158 // if parent_team is active, then it already keeps stack stitching id 2159 // for the league of teams 2160 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2161 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2162 } 2163 } 2164 #endif /* USE_ITT_BUILD */ 2165 2166 // AC: skip __kmp_internal_fork at teams construct, let only primary 2167 // threads execute 2168 if (ap) { 2169 __kmp_internal_fork(loc, gtid, team); 2170 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2171 "master_th=%p, gtid=%d\n", 2172 root, team, master_th, gtid)); 2173 } 2174 2175 if (call_context == fork_context_gnu) { 2176 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2177 return TRUE; 2178 } 2179 2180 /* Invoke microtask for PRIMARY thread */ 2181 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2182 team->t.t_id, team->t.t_pkfn)); 2183 } // END of timer KMP_fork_call block 2184 2185 #if KMP_STATS_ENABLED 2186 // If beginning a teams construct, then change thread state 2187 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2188 if (!ap) { 2189 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2190 } 2191 #endif 2192 2193 if (!team->t.t_invoke(gtid)) { 2194 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2195 } 2196 2197 #if KMP_STATS_ENABLED 2198 // If was beginning of a teams construct, then reset thread state 2199 if (!ap) { 2200 KMP_SET_THREAD_STATE(previous_state); 2201 } 2202 #endif 2203 2204 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2205 team->t.t_id, team->t.t_pkfn)); 2206 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2207 2208 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2209 2210 #if OMPT_SUPPORT 2211 if (ompt_enabled.enabled) { 2212 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2213 } 2214 #endif 2215 2216 return TRUE; 2217 } 2218 2219 #if OMPT_SUPPORT 2220 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2221 kmp_team_t *team) { 2222 // restore state outside the region 2223 thread->th.ompt_thread_info.state = 2224 ((team->t.t_serialized) ? ompt_state_work_serial 2225 : ompt_state_work_parallel); 2226 } 2227 2228 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2229 kmp_team_t *team, ompt_data_t *parallel_data, 2230 int flags, void *codeptr) { 2231 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2232 if (ompt_enabled.ompt_callback_parallel_end) { 2233 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2234 parallel_data, &(task_info->task_data), flags, codeptr); 2235 } 2236 2237 task_info->frame.enter_frame = ompt_data_none; 2238 __kmp_join_restore_state(thread, team); 2239 } 2240 #endif 2241 2242 void __kmp_join_call(ident_t *loc, int gtid 2243 #if OMPT_SUPPORT 2244 , 2245 enum fork_context_e fork_context 2246 #endif 2247 , 2248 int exit_teams) { 2249 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2250 kmp_team_t *team; 2251 kmp_team_t *parent_team; 2252 kmp_info_t *master_th; 2253 kmp_root_t *root; 2254 int master_active; 2255 2256 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2257 2258 /* setup current data */ 2259 master_th = __kmp_threads[gtid]; 2260 root = master_th->th.th_root; 2261 team = master_th->th.th_team; 2262 parent_team = team->t.t_parent; 2263 2264 master_th->th.th_ident = loc; 2265 2266 #if OMPT_SUPPORT 2267 void *team_microtask = (void *)team->t.t_pkfn; 2268 // For GOMP interface with serialized parallel, need the 2269 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2270 // and end-parallel events. 
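// (Hence the guard below: when a serialized parallel is ended through the
// GOMP entry points, the thread state is left untouched here and the OMPT
// events are emitted from __kmpc_end_serialized_parallel instead.)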
2271 if (ompt_enabled.enabled && 2272 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2273 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2274 } 2275 #endif 2276 2277 #if KMP_DEBUG 2278 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2279 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2280 "th_task_team = %p\n", 2281 __kmp_gtid_from_thread(master_th), team, 2282 team->t.t_task_team[master_th->th.th_task_state], 2283 master_th->th.th_task_team)); 2284 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2285 team->t.t_task_team[master_th->th.th_task_state]); 2286 } 2287 #endif 2288 2289 if (team->t.t_serialized) { 2290 if (master_th->th.th_teams_microtask) { 2291 // We are in teams construct 2292 int level = team->t.t_level; 2293 int tlevel = master_th->th.th_teams_level; 2294 if (level == tlevel) { 2295 // AC: we haven't incremented it earlier at start of teams construct, 2296 // so do it here - at the end of teams construct 2297 team->t.t_level++; 2298 } else if (level == tlevel + 1) { 2299 // AC: we are exiting parallel inside teams, need to increment 2300 // serialization in order to restore it in the next call to 2301 // __kmpc_end_serialized_parallel 2302 team->t.t_serialized++; 2303 } 2304 } 2305 __kmpc_end_serialized_parallel(loc, gtid); 2306 2307 #if OMPT_SUPPORT 2308 if (ompt_enabled.enabled) { 2309 __kmp_join_restore_state(master_th, parent_team); 2310 } 2311 #endif 2312 2313 return; 2314 } 2315 2316 master_active = team->t.t_master_active; 2317 2318 if (!exit_teams) { 2319 // AC: No barrier for internal teams at exit from teams construct. 2320 // But there is barrier for external team (league). 2321 __kmp_internal_join(loc, gtid, team); 2322 #if USE_ITT_BUILD 2323 if (__itt_stack_caller_create_ptr) { 2324 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2325 // destroy the stack stitching id after join barrier 2326 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2327 team->t.t_stack_id = NULL; 2328 } 2329 #endif 2330 } else { 2331 master_th->th.th_task_state = 2332 0; // AC: no tasking in teams (out of any parallel) 2333 #if USE_ITT_BUILD 2334 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2335 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2336 // destroy the stack stitching id on exit from the teams construct 2337 // if parent_team is active, then the id will be destroyed later on 2338 // by master of the league of teams 2339 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2340 parent_team->t.t_stack_id = NULL; 2341 } 2342 #endif 2343 } 2344 2345 KMP_MB(); 2346 2347 #if OMPT_SUPPORT 2348 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2349 void *codeptr = team->t.ompt_team_info.master_return_address; 2350 #endif 2351 2352 #if USE_ITT_BUILD 2353 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
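// Join frames are reported only for the outermost active region
// (t_active_level == 1) and only under one of the two notification schemes
// selected by __kmp_forkjoin_frames_mode, matching the fork-side reporting.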
2354 if (team->t.t_active_level == 1 && 2355 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2356 master_th->th.th_teams_size.nteams == 1)) { 2357 master_th->th.th_ident = loc; 2358 // only one notification scheme (either "submit" or "forking/joined", not 2359 // both) 2360 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2361 __kmp_forkjoin_frames_mode == 3) 2362 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2363 master_th->th.th_frame_time, 0, loc, 2364 master_th->th.th_team_nproc, 1); 2365 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2366 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2367 __kmp_itt_region_joined(gtid); 2368 } // active_level == 1 2369 #endif /* USE_ITT_BUILD */ 2370 2371 if (master_th->th.th_teams_microtask && !exit_teams && 2372 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2373 team->t.t_level == master_th->th.th_teams_level + 1) { 2374 // AC: We need to leave the team structure intact at the end of parallel 2375 // inside the teams construct, so that at the next parallel same (hot) team 2376 // works, only adjust nesting levels 2377 #if OMPT_SUPPORT 2378 ompt_data_t ompt_parallel_data = ompt_data_none; 2379 if (ompt_enabled.enabled) { 2380 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2381 if (ompt_enabled.ompt_callback_implicit_task) { 2382 int ompt_team_size = team->t.t_nproc; 2383 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2384 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2385 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2386 } 2387 task_info->frame.exit_frame = ompt_data_none; 2388 task_info->task_data = ompt_data_none; 2389 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2390 __ompt_lw_taskteam_unlink(master_th); 2391 } 2392 #endif 2393 /* Decrement our nested depth level */ 2394 team->t.t_level--; 2395 team->t.t_active_level--; 2396 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2397 2398 // Restore number of threads in the team if needed. This code relies on 2399 // the proper adjustment of th_teams_size.nth after the fork in 2400 // __kmp_teams_master on each teams primary thread in the case that 2401 // __kmp_reserve_threads reduced it. 2402 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2403 int old_num = master_th->th.th_team_nproc; 2404 int new_num = master_th->th.th_teams_size.nth; 2405 kmp_info_t **other_threads = team->t.t_threads; 2406 team->t.t_nproc = new_num; 2407 for (int i = 0; i < old_num; ++i) { 2408 other_threads[i]->th.th_team_nproc = new_num; 2409 } 2410 // Adjust states of non-used threads of the team 2411 for (int i = old_num; i < new_num; ++i) { 2412 // Re-initialize thread's barrier data. 
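// The re-exposed threads copy the team's current b_arrived counters so their
// barrier state matches the team's, and (when tasking is enabled) they adopt
// the primary thread's task state.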
2413 KMP_DEBUG_ASSERT(other_threads[i]); 2414 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2415 for (int b = 0; b < bs_last_barrier; ++b) { 2416 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2417 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2418 #if USE_DEBUGGER 2419 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2420 #endif 2421 } 2422 if (__kmp_tasking_mode != tskm_immediate_exec) { 2423 // Synchronize thread's task state 2424 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2425 } 2426 } 2427 } 2428 2429 #if OMPT_SUPPORT 2430 if (ompt_enabled.enabled) { 2431 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2432 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2433 } 2434 #endif 2435 2436 return; 2437 } 2438 2439 /* do cleanup and restore the parent team */ 2440 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2441 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2442 2443 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2444 2445 /* jc: The following lock has instructions with REL and ACQ semantics, 2446 separating the parallel user code called in this parallel region 2447 from the serial user code called after this function returns. */ 2448 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2449 2450 if (!master_th->th.th_teams_microtask || 2451 team->t.t_level > master_th->th.th_teams_level) { 2452 /* Decrement our nested depth level */ 2453 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2454 } 2455 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2456 2457 #if OMPT_SUPPORT 2458 if (ompt_enabled.enabled) { 2459 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2460 if (ompt_enabled.ompt_callback_implicit_task) { 2461 int flags = (team_microtask == (void *)__kmp_teams_master) 2462 ? ompt_task_initial 2463 : ompt_task_implicit; 2464 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2465 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2466 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2467 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2468 } 2469 task_info->frame.exit_frame = ompt_data_none; 2470 task_info->task_data = ompt_data_none; 2471 } 2472 #endif 2473 2474 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2475 master_th, team)); 2476 __kmp_pop_current_task_from_thread(master_th); 2477 2478 #if KMP_AFFINITY_SUPPORTED 2479 // Restore master thread's partition. 2480 master_th->th.th_first_place = team->t.t_first_place; 2481 master_th->th.th_last_place = team->t.t_last_place; 2482 #endif // KMP_AFFINITY_SUPPORTED 2483 master_th->th.th_def_allocator = team->t.t_def_allocator; 2484 2485 updateHWFPControl(team); 2486 2487 if (root->r.r_active != master_active) 2488 root->r.r_active = master_active; 2489 2490 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2491 master_th)); // this will free worker threads 2492 2493 /* this race was fun to find. make sure the following is in the critical 2494 region otherwise assertions may fail occasionally since the old team may be 2495 reallocated and the hierarchy appears inconsistent. it is actually safe to 2496 run and won't cause any bugs, but will cause those assertion failures. 
it's 2497 only one deref&assign so might as well put this in the critical region */ 2498 master_th->th.th_team = parent_team; 2499 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2500 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2501 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2502 2503 /* restore serialized team, if need be */ 2504 if (parent_team->t.t_serialized && 2505 parent_team != master_th->th.th_serial_team && 2506 parent_team != root->r.r_root_team) { 2507 __kmp_free_team(root, 2508 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2509 master_th->th.th_serial_team = parent_team; 2510 } 2511 2512 if (__kmp_tasking_mode != tskm_immediate_exec) { 2513 if (master_th->th.th_task_state_top > 2514 0) { // Restore task state from memo stack 2515 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2516 // Remember primary thread's state if we re-use this nested hot team 2517 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2518 master_th->th.th_task_state; 2519 --master_th->th.th_task_state_top; // pop 2520 // Now restore state at this level 2521 master_th->th.th_task_state = 2522 master_th->th 2523 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2524 } 2525 // Copy the task team from the parent team to the primary thread 2526 master_th->th.th_task_team = 2527 parent_team->t.t_task_team[master_th->th.th_task_state]; 2528 KA_TRACE(20, 2529 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2530 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2531 parent_team)); 2532 } 2533 2534 // TODO: GEH - cannot do this assertion because root thread not set up as 2535 // executing 2536 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2537 master_th->th.th_current_task->td_flags.executing = 1; 2538 2539 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2540 2541 #if OMPT_SUPPORT 2542 int flags = 2543 OMPT_INVOKER(fork_context) | 2544 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2545 : ompt_parallel_team); 2546 if (ompt_enabled.enabled) { 2547 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2548 codeptr); 2549 } 2550 #endif 2551 2552 KMP_MB(); 2553 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2554 } 2555 2556 /* Check whether we should push an internal control record onto the 2557 serial team stack. If so, do it. 
*/ 2558 void __kmp_save_internal_controls(kmp_info_t *thread) { 2559 2560 if (thread->th.th_team != thread->th.th_serial_team) { 2561 return; 2562 } 2563 if (thread->th.th_team->t.t_serialized > 1) { 2564 int push = 0; 2565 2566 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2567 push = 1; 2568 } else { 2569 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2570 thread->th.th_team->t.t_serialized) { 2571 push = 1; 2572 } 2573 } 2574 if (push) { /* push a record on the serial team's stack */ 2575 kmp_internal_control_t *control = 2576 (kmp_internal_control_t *)__kmp_allocate( 2577 sizeof(kmp_internal_control_t)); 2578 2579 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2580 2581 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2582 2583 control->next = thread->th.th_team->t.t_control_stack_top; 2584 thread->th.th_team->t.t_control_stack_top = control; 2585 } 2586 } 2587 } 2588 2589 /* Changes set_nproc */ 2590 void __kmp_set_num_threads(int new_nth, int gtid) { 2591 kmp_info_t *thread; 2592 kmp_root_t *root; 2593 2594 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2595 KMP_DEBUG_ASSERT(__kmp_init_serial); 2596 2597 if (new_nth < 1) 2598 new_nth = 1; 2599 else if (new_nth > __kmp_max_nth) 2600 new_nth = __kmp_max_nth; 2601 2602 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2603 thread = __kmp_threads[gtid]; 2604 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2605 return; // nothing to do 2606 2607 __kmp_save_internal_controls(thread); 2608 2609 set__nproc(thread, new_nth); 2610 2611 // If this omp_set_num_threads() call will cause the hot team size to be 2612 // reduced (in the absence of a num_threads clause), then reduce it now, 2613 // rather than waiting for the next parallel region. 2614 root = thread->th.th_root; 2615 if (__kmp_init_parallel && (!root->r.r_active) && 2616 (root->r.r_hot_team->t.t_nproc > new_nth) 2617 #if KMP_NESTED_HOT_TEAMS 2618 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2619 #endif 2620 ) { 2621 kmp_team_t *hot_team = root->r.r_hot_team; 2622 int f; 2623 2624 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2625 2626 // Release the extra threads we don't need any more. 2627 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2628 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2629 if (__kmp_tasking_mode != tskm_immediate_exec) { 2630 // When decreasing team size, threads no longer in the team should unref 2631 // task team. 2632 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2633 } 2634 __kmp_free_thread(hot_team->t.t_threads[f]); 2635 hot_team->t.t_threads[f] = NULL; 2636 } 2637 hot_team->t.t_nproc = new_nth; 2638 #if KMP_NESTED_HOT_TEAMS 2639 if (thread->th.th_hot_teams) { 2640 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2641 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2642 } 2643 #endif 2644 2645 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2646 2647 // Update the t_nproc field in the threads that are still active. 
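// (The hot team was shrunk above while __kmp_forkjoin_lock was held; the
// surviving threads only need their cached th_team_nproc refreshed here.)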
2648 for (f = 0; f < new_nth; f++) { 2649 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2650 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2651 } 2652 // Special flag in case omp_set_num_threads() call 2653 hot_team->t.t_size_changed = -1; 2654 } 2655 } 2656 2657 /* Changes max_active_levels */ 2658 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2659 kmp_info_t *thread; 2660 2661 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2662 "%d = (%d)\n", 2663 gtid, max_active_levels)); 2664 KMP_DEBUG_ASSERT(__kmp_init_serial); 2665 2666 // validate max_active_levels 2667 if (max_active_levels < 0) { 2668 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2669 // We ignore this call if the user has specified a negative value. 2670 // The current setting won't be changed. The last valid setting will be 2671 // used. A warning will be issued (if warnings are allowed as controlled by 2672 // the KMP_WARNINGS env var). 2673 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2674 "max_active_levels for thread %d = (%d)\n", 2675 gtid, max_active_levels)); 2676 return; 2677 } 2678 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2679 // it's OK, the max_active_levels is within the valid range: [ 0; 2680 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2681 // We allow a zero value. (implementation defined behavior) 2682 } else { 2683 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2684 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2685 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2686 // Current upper limit is MAX_INT. (implementation defined behavior) 2687 // If the input exceeds the upper limit, we correct the input to be the 2688 // upper limit. (implementation defined behavior) 2689 // Actually, the flow should never get here until we use MAX_INT limit. 
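// (Net effect of the validation: negative values are ignored, values above
// KMP_MAX_ACTIVE_LEVELS_LIMIT are clamped to the limit, and zero is
// accepted.)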
2690 } 2691 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2692 "max_active_levels for thread %d = (%d)\n", 2693 gtid, max_active_levels)); 2694 2695 thread = __kmp_threads[gtid]; 2696 2697 __kmp_save_internal_controls(thread); 2698 2699 set__max_active_levels(thread, max_active_levels); 2700 } 2701 2702 /* Gets max_active_levels */ 2703 int __kmp_get_max_active_levels(int gtid) { 2704 kmp_info_t *thread; 2705 2706 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2707 KMP_DEBUG_ASSERT(__kmp_init_serial); 2708 2709 thread = __kmp_threads[gtid]; 2710 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2711 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2712 "curtask_maxaclevel=%d\n", 2713 gtid, thread->th.th_current_task, 2714 thread->th.th_current_task->td_icvs.max_active_levels)); 2715 return thread->th.th_current_task->td_icvs.max_active_levels; 2716 } 2717 2718 // nteams-var per-device ICV 2719 void __kmp_set_num_teams(int num_teams) { 2720 if (num_teams > 0) 2721 __kmp_nteams = num_teams; 2722 } 2723 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2724 // teams-thread-limit-var per-device ICV 2725 void __kmp_set_teams_thread_limit(int limit) { 2726 if (limit > 0) 2727 __kmp_teams_thread_limit = limit; 2728 } 2729 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2730 2731 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2732 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2733 2734 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2735 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2736 kmp_info_t *thread; 2737 kmp_sched_t orig_kind; 2738 // kmp_team_t *team; 2739 2740 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2741 gtid, (int)kind, chunk)); 2742 KMP_DEBUG_ASSERT(__kmp_init_serial); 2743 2744 // Check if the kind parameter is valid, correct if needed. 2745 // Valid parameters should fit in one of two intervals - standard or extended: 2746 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2747 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2748 orig_kind = kind; 2749 kind = __kmp_sched_without_mods(kind); 2750 2751 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2752 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2753 // TODO: Hint needs attention in case we change the default schedule. 2754 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2755 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2756 __kmp_msg_null); 2757 kind = kmp_sched_default; 2758 chunk = 0; // ignore chunk value in case of bad kind 2759 } 2760 2761 thread = __kmp_threads[gtid]; 2762 2763 __kmp_save_internal_controls(thread); 2764 2765 if (kind < kmp_sched_upper_std) { 2766 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2767 // differ static chunked vs. 
unchunked: chunk should be invalid to 2768 // indicate unchunked schedule (which is the default) 2769 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2770 } else { 2771 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2772 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2773 } 2774 } else { 2775 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2776 // kmp_sched_lower - 2 ]; 2777 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2778 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2779 kmp_sched_lower - 2]; 2780 } 2781 __kmp_sched_apply_mods_intkind( 2782 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2783 if (kind == kmp_sched_auto || chunk < 1) { 2784 // ignore parameter chunk for schedule auto 2785 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2786 } else { 2787 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2788 } 2789 } 2790 2791 /* Gets def_sched_var ICV values */ 2792 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2793 kmp_info_t *thread; 2794 enum sched_type th_type; 2795 2796 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2797 KMP_DEBUG_ASSERT(__kmp_init_serial); 2798 2799 thread = __kmp_threads[gtid]; 2800 2801 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2802 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2803 case kmp_sch_static: 2804 case kmp_sch_static_greedy: 2805 case kmp_sch_static_balanced: 2806 *kind = kmp_sched_static; 2807 __kmp_sched_apply_mods_stdkind(kind, th_type); 2808 *chunk = 0; // chunk was not set, try to show this fact via zero value 2809 return; 2810 case kmp_sch_static_chunked: 2811 *kind = kmp_sched_static; 2812 break; 2813 case kmp_sch_dynamic_chunked: 2814 *kind = kmp_sched_dynamic; 2815 break; 2816 case kmp_sch_guided_chunked: 2817 case kmp_sch_guided_iterative_chunked: 2818 case kmp_sch_guided_analytical_chunked: 2819 *kind = kmp_sched_guided; 2820 break; 2821 case kmp_sch_auto: 2822 *kind = kmp_sched_auto; 2823 break; 2824 case kmp_sch_trapezoidal: 2825 *kind = kmp_sched_trapezoidal; 2826 break; 2827 #if KMP_STATIC_STEAL_ENABLED 2828 case kmp_sch_static_steal: 2829 *kind = kmp_sched_static_steal; 2830 break; 2831 #endif 2832 default: 2833 KMP_FATAL(UnknownSchedulingType, th_type); 2834 } 2835 2836 __kmp_sched_apply_mods_stdkind(kind, th_type); 2837 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2838 } 2839 2840 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2841 2842 int ii, dd; 2843 kmp_team_t *team; 2844 kmp_info_t *thr; 2845 2846 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2847 KMP_DEBUG_ASSERT(__kmp_init_serial); 2848 2849 // validate level 2850 if (level == 0) 2851 return 0; 2852 if (level < 0) 2853 return -1; 2854 thr = __kmp_threads[gtid]; 2855 team = thr->th.th_team; 2856 ii = team->t.t_level; 2857 if (level > ii) 2858 return -1; 2859 2860 if (thr->th.th_teams_microtask) { 2861 // AC: we are in teams region where multiple nested teams have same level 2862 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2863 if (level <= 2864 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2865 KMP_DEBUG_ASSERT(ii >= tlevel); 2866 // AC: As we need to pass by the teams league, we need to artificially 2867 // increase ii 2868 if (ii == tlevel) { 2869 ii += 2; // three teams have same level 2870 } else { 2871 ii++; // two teams have same level 2872 } 2873 } 2874 } 2875 2876 if (ii == 
level) 2877 return __kmp_tid_from_gtid(gtid); 2878 2879 dd = team->t.t_serialized; 2880 level++; 2881 while (ii > level) { 2882 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2883 } 2884 if ((team->t.t_serialized) && (!dd)) { 2885 team = team->t.t_parent; 2886 continue; 2887 } 2888 if (ii > level) { 2889 team = team->t.t_parent; 2890 dd = team->t.t_serialized; 2891 ii--; 2892 } 2893 } 2894 2895 return (dd > 1) ? (0) : (team->t.t_master_tid); 2896 } 2897 2898 int __kmp_get_team_size(int gtid, int level) { 2899 2900 int ii, dd; 2901 kmp_team_t *team; 2902 kmp_info_t *thr; 2903 2904 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2905 KMP_DEBUG_ASSERT(__kmp_init_serial); 2906 2907 // validate level 2908 if (level == 0) 2909 return 1; 2910 if (level < 0) 2911 return -1; 2912 thr = __kmp_threads[gtid]; 2913 team = thr->th.th_team; 2914 ii = team->t.t_level; 2915 if (level > ii) 2916 return -1; 2917 2918 if (thr->th.th_teams_microtask) { 2919 // AC: we are in teams region where multiple nested teams have same level 2920 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2921 if (level <= 2922 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2923 KMP_DEBUG_ASSERT(ii >= tlevel); 2924 // AC: As we need to pass by the teams league, we need to artificially 2925 // increase ii 2926 if (ii == tlevel) { 2927 ii += 2; // three teams have same level 2928 } else { 2929 ii++; // two teams have same level 2930 } 2931 } 2932 } 2933 2934 while (ii > level) { 2935 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2936 } 2937 if (team->t.t_serialized && (!dd)) { 2938 team = team->t.t_parent; 2939 continue; 2940 } 2941 if (ii > level) { 2942 team = team->t.t_parent; 2943 ii--; 2944 } 2945 } 2946 2947 return team->t.t_nproc; 2948 } 2949 2950 kmp_r_sched_t __kmp_get_schedule_global() { 2951 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2952 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2953 // independently. So one can get the updated schedule here. 2954 2955 kmp_r_sched_t r_sched; 2956 2957 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2958 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2959 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2960 // different roots (even in OMP 2.5) 2961 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2962 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2963 if (s == kmp_sch_static) { 2964 // replace STATIC with more detailed schedule (balanced or greedy) 2965 r_sched.r_sched_type = __kmp_static; 2966 } else if (s == kmp_sch_guided_chunked) { 2967 // replace GUIDED with more detailed schedule (iterative or analytical) 2968 r_sched.r_sched_type = __kmp_guided; 2969 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2970 r_sched.r_sched_type = __kmp_sched; 2971 } 2972 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2973 2974 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2975 // __kmp_chunk may be wrong here (if it was not ever set) 2976 r_sched.chunk = KMP_DEFAULT_CHUNK; 2977 } else { 2978 r_sched.chunk = __kmp_chunk; 2979 } 2980 2981 return r_sched; 2982 } 2983 2984 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2985 at least argc number of *t_argv entries for the requested team. 
*/ 2986 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2987 2988 KMP_DEBUG_ASSERT(team); 2989 if (!realloc || argc > team->t.t_max_argc) { 2990 2991 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 2992 "current entries=%d\n", 2993 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 2994 /* if previously allocated heap space for args, free them */ 2995 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 2996 __kmp_free((void *)team->t.t_argv); 2997 2998 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 2999 /* use unused space in the cache line for arguments */ 3000 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3001 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3002 "argv entries\n", 3003 team->t.t_id, team->t.t_max_argc)); 3004 team->t.t_argv = &team->t.t_inline_argv[0]; 3005 if (__kmp_storage_map) { 3006 __kmp_print_storage_map_gtid( 3007 -1, &team->t.t_inline_argv[0], 3008 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3009 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3010 team->t.t_id); 3011 } 3012 } else { 3013 /* allocate space for arguments in the heap */ 3014 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3015 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3016 : 2 * argc; 3017 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3018 "argv entries\n", 3019 team->t.t_id, team->t.t_max_argc)); 3020 team->t.t_argv = 3021 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3022 if (__kmp_storage_map) { 3023 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3024 &team->t.t_argv[team->t.t_max_argc], 3025 sizeof(void *) * team->t.t_max_argc, 3026 "team_%d.t_argv", team->t.t_id); 3027 } 3028 } 3029 } 3030 } 3031 3032 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3033 int i; 3034 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3035 team->t.t_threads = 3036 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3037 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3038 sizeof(dispatch_shared_info_t) * num_disp_buff); 3039 team->t.t_dispatch = 3040 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3041 team->t.t_implicit_task_taskdata = 3042 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3043 team->t.t_max_nproc = max_nth; 3044 3045 /* setup dispatch buffers */ 3046 for (i = 0; i < num_disp_buff; ++i) { 3047 team->t.t_disp_buffer[i].buffer_index = i; 3048 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3049 } 3050 } 3051 3052 static void __kmp_free_team_arrays(kmp_team_t *team) { 3053 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3054 int i; 3055 for (i = 0; i < team->t.t_max_nproc; ++i) { 3056 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3057 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3058 team->t.t_dispatch[i].th_disp_buffer = NULL; 3059 } 3060 } 3061 #if KMP_USE_HIER_SCHED 3062 __kmp_dispatch_free_hierarchies(team); 3063 #endif 3064 __kmp_free(team->t.t_threads); 3065 __kmp_free(team->t.t_disp_buffer); 3066 __kmp_free(team->t.t_dispatch); 3067 __kmp_free(team->t.t_implicit_task_taskdata); 3068 team->t.t_threads = NULL; 3069 team->t.t_disp_buffer = NULL; 3070 team->t.t_dispatch = NULL; 3071 team->t.t_implicit_task_taskdata = 0; 3072 } 3073 3074 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3075 kmp_info_t **oldThreads = team->t.t_threads; 3076 3077 __kmp_free(team->t.t_disp_buffer); 3078 __kmp_free(team->t.t_dispatch); 3079 __kmp_free(team->t.t_implicit_task_taskdata); 3080 __kmp_allocate_team_arrays(team, max_nth); 3081 3082 KMP_MEMCPY(team->t.t_threads, oldThreads, 3083 team->t.t_nproc * sizeof(kmp_info_t *)); 3084 3085 __kmp_free(oldThreads); 3086 } 3087 3088 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3089 3090 kmp_r_sched_t r_sched = 3091 __kmp_get_schedule_global(); // get current state of scheduling globals 3092 3093 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3094 3095 kmp_internal_control_t g_icvs = { 3096 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3097 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3098 // adjustment of threads (per thread) 3099 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3100 // whether blocktime is explicitly set 3101 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3102 #if KMP_USE_MONITOR 3103 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3104 // intervals 3105 #endif 3106 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3107 // next parallel region (per thread) 3108 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3109 __kmp_cg_max_nth, // int thread_limit; 3110 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3111 // for max_active_levels 3112 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3113 // {sched,chunk} pair 3114 __kmp_nested_proc_bind.bind_types[0], 3115 __kmp_default_device, 3116 NULL // struct kmp_internal_control *next; 3117 }; 3118 3119 return g_icvs; 3120 } 3121 3122 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3123 3124 kmp_internal_control_t gx_icvs; 3125 gx_icvs.serial_nesting_level = 3126 0; // probably =team->t.t_serial 
like in save_inter_controls 3127 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3128 gx_icvs.next = NULL; 3129 3130 return gx_icvs; 3131 } 3132 3133 static void __kmp_initialize_root(kmp_root_t *root) { 3134 int f; 3135 kmp_team_t *root_team; 3136 kmp_team_t *hot_team; 3137 int hot_team_max_nth; 3138 kmp_r_sched_t r_sched = 3139 __kmp_get_schedule_global(); // get current state of scheduling globals 3140 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3141 KMP_DEBUG_ASSERT(root); 3142 KMP_ASSERT(!root->r.r_begin); 3143 3144 /* setup the root state structure */ 3145 __kmp_init_lock(&root->r.r_begin_lock); 3146 root->r.r_begin = FALSE; 3147 root->r.r_active = FALSE; 3148 root->r.r_in_parallel = 0; 3149 root->r.r_blocktime = __kmp_dflt_blocktime; 3150 3151 /* setup the root team for this task */ 3152 /* allocate the root team structure */ 3153 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3154 3155 root_team = 3156 __kmp_allocate_team(root, 3157 1, // new_nproc 3158 1, // max_nproc 3159 #if OMPT_SUPPORT 3160 ompt_data_none, // root parallel id 3161 #endif 3162 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3163 0 // argc 3164 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3165 ); 3166 #if USE_DEBUGGER 3167 // Non-NULL value should be assigned to make the debugger display the root 3168 // team. 3169 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3170 #endif 3171 3172 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3173 3174 root->r.r_root_team = root_team; 3175 root_team->t.t_control_stack_top = NULL; 3176 3177 /* initialize root team */ 3178 root_team->t.t_threads[0] = NULL; 3179 root_team->t.t_nproc = 1; 3180 root_team->t.t_serialized = 1; 3181 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3182 root_team->t.t_sched.sched = r_sched.sched; 3183 KA_TRACE( 3184 20, 3185 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3186 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3187 3188 /* setup the hot team for this task */ 3189 /* allocate the hot team structure */ 3190 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3191 3192 hot_team = 3193 __kmp_allocate_team(root, 3194 1, // new_nproc 3195 __kmp_dflt_team_nth_ub * 2, // max_nproc 3196 #if OMPT_SUPPORT 3197 ompt_data_none, // root parallel id 3198 #endif 3199 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3200 0 // argc 3201 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3202 ); 3203 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3204 3205 root->r.r_hot_team = hot_team; 3206 root_team->t.t_control_stack_top = NULL; 3207 3208 /* first-time initialization */ 3209 hot_team->t.t_parent = root_team; 3210 3211 /* initialize hot team */ 3212 hot_team_max_nth = hot_team->t.t_max_nproc; 3213 for (f = 0; f < hot_team_max_nth; ++f) { 3214 hot_team->t.t_threads[f] = NULL; 3215 } 3216 hot_team->t.t_nproc = 1; 3217 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3218 hot_team->t.t_sched.sched = r_sched.sched; 3219 hot_team->t.t_size_changed = 0; 3220 } 3221 3222 #ifdef KMP_DEBUG 3223 3224 typedef struct kmp_team_list_item { 3225 kmp_team_p const *entry; 3226 struct kmp_team_list_item *next; 3227 } kmp_team_list_item_t; 3228 typedef kmp_team_list_item_t *kmp_team_list_t; 3229 3230 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3231 kmp_team_list_t list, // List of teams. 
3232 kmp_team_p const *team // Team to add. 3233 ) { 3234 3235 // List must terminate with item where both entry and next are NULL. 3236 // Team is added to the list only once. 3237 // List is sorted in ascending order by team id. 3238 // Team id is *not* a key. 3239 3240 kmp_team_list_t l; 3241 3242 KMP_DEBUG_ASSERT(list != NULL); 3243 if (team == NULL) { 3244 return; 3245 } 3246 3247 __kmp_print_structure_team_accum(list, team->t.t_parent); 3248 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3249 3250 // Search list for the team. 3251 l = list; 3252 while (l->next != NULL && l->entry != team) { 3253 l = l->next; 3254 } 3255 if (l->next != NULL) { 3256 return; // Team has been added before, exit. 3257 } 3258 3259 // Team is not found. Search list again for insertion point. 3260 l = list; 3261 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3262 l = l->next; 3263 } 3264 3265 // Insert team. 3266 { 3267 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3268 sizeof(kmp_team_list_item_t)); 3269 *item = *l; 3270 l->entry = team; 3271 l->next = item; 3272 } 3273 } 3274 3275 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3276 3277 ) { 3278 __kmp_printf("%s", title); 3279 if (team != NULL) { 3280 __kmp_printf("%2x %p\n", team->t.t_id, team); 3281 } else { 3282 __kmp_printf(" - (nil)\n"); 3283 } 3284 } 3285 3286 static void __kmp_print_structure_thread(char const *title, 3287 kmp_info_p const *thread) { 3288 __kmp_printf("%s", title); 3289 if (thread != NULL) { 3290 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3291 } else { 3292 __kmp_printf(" - (nil)\n"); 3293 } 3294 } 3295 3296 void __kmp_print_structure(void) { 3297 3298 kmp_team_list_t list; 3299 3300 // Initialize list of teams. 3301 list = 3302 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3303 list->entry = NULL; 3304 list->next = NULL; 3305 3306 __kmp_printf("\n------------------------------\nGlobal Thread " 3307 "Table\n------------------------------\n"); 3308 { 3309 int gtid; 3310 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3311 __kmp_printf("%2d", gtid); 3312 if (__kmp_threads != NULL) { 3313 __kmp_printf(" %p", __kmp_threads[gtid]); 3314 } 3315 if (__kmp_root != NULL) { 3316 __kmp_printf(" %p", __kmp_root[gtid]); 3317 } 3318 __kmp_printf("\n"); 3319 } 3320 } 3321 3322 // Print out __kmp_threads array. 
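// The team-accumulation helper above keeps its list sorted and unique by
// inserting in place: the current node is copied into a freshly allocated
// node, and the current node's slot is then overwritten with the new entry,
// so no back-pointer is needed and the NULL/NULL sentinel always stays last.
// A minimal standalone sketch of that scheme, kept out of the build under
// #if 0; the type and function names below are illustrative only.
#if 0
#include <stdio.h>
#include <stdlib.h>

typedef struct node {
  int id; // 0 marks the terminating sentinel; real ids are assumed positive
  struct node *next; // NULL only in the terminating sentinel
} node_t;

static void sorted_insert(node_t *list, int id) {
  node_t *l = list;
  while (l->next != NULL && l->id <= id) {
    if (l->id == id)
      return; // entry already present; keep the list unique
    l = l->next;
  }
  node_t *item = (node_t *)malloc(sizeof(node_t));
  *item = *l; // push the current node (possibly the sentinel) one slot down
  l->id = id; // ... and reuse its slot for the new entry
  l->next = item;
}

int main(void) {
  node_t *list = (node_t *)calloc(1, sizeof(node_t)); // sentinel-only list
  int ids[] = {5, 2, 9, 2};
  for (int i = 0; i < 4; ++i)
    sorted_insert(list, ids[i]);
  for (node_t *l = list; l->next != NULL; l = l->next)
    printf("%d\n", l->id); // prints 2, 5, 9
  return 0; // cleanup omitted for brevity
}
#endif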
3323 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3324 "----------\n"); 3325 if (__kmp_threads != NULL) { 3326 int gtid; 3327 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3328 kmp_info_t const *thread = __kmp_threads[gtid]; 3329 if (thread != NULL) { 3330 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3331 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3332 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3333 __kmp_print_structure_team(" Serial Team: ", 3334 thread->th.th_serial_team); 3335 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3336 __kmp_print_structure_thread(" Primary: ", 3337 thread->th.th_team_master); 3338 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3339 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3340 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3341 __kmp_print_structure_thread(" Next in pool: ", 3342 thread->th.th_next_pool); 3343 __kmp_printf("\n"); 3344 __kmp_print_structure_team_accum(list, thread->th.th_team); 3345 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3346 } 3347 } 3348 } else { 3349 __kmp_printf("Threads array is not allocated.\n"); 3350 } 3351 3352 // Print out __kmp_root array. 3353 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3354 "--------\n"); 3355 if (__kmp_root != NULL) { 3356 int gtid; 3357 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3358 kmp_root_t const *root = __kmp_root[gtid]; 3359 if (root != NULL) { 3360 __kmp_printf("GTID %2d %p:\n", gtid, root); 3361 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3362 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3363 __kmp_print_structure_thread(" Uber Thread: ", 3364 root->r.r_uber_thread); 3365 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3366 __kmp_printf(" In Parallel: %2d\n", 3367 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3368 __kmp_printf("\n"); 3369 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3370 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3371 } 3372 } 3373 } else { 3374 __kmp_printf("Ubers array is not allocated.\n"); 3375 } 3376 3377 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3378 "--------\n"); 3379 while (list->next != NULL) { 3380 kmp_team_p const *team = list->entry; 3381 int i; 3382 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3383 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3384 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3385 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3386 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3387 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3388 for (i = 0; i < team->t.t_nproc; ++i) { 3389 __kmp_printf(" Thread %2d: ", i); 3390 __kmp_print_structure_thread("", team->t.t_threads[i]); 3391 } 3392 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3393 __kmp_printf("\n"); 3394 list = list->next; 3395 } 3396 3397 // Print out __kmp_thread_pool and __kmp_team_pool. 3398 __kmp_printf("\n------------------------------\nPools\n----------------------" 3399 "--------\n"); 3400 __kmp_print_structure_thread("Thread pool: ", 3401 CCAST(kmp_info_t *, __kmp_thread_pool)); 3402 __kmp_print_structure_team("Team pool: ", 3403 CCAST(kmp_team_t *, __kmp_team_pool)); 3404 __kmp_printf("\n"); 3405 3406 // Free team list. 
3407 while (list != NULL) { 3408 kmp_team_list_item_t *item = list; 3409 list = list->next; 3410 KMP_INTERNAL_FREE(item); 3411 } 3412 } 3413 3414 #endif 3415 3416 //--------------------------------------------------------------------------- 3417 // Stuff for per-thread fast random number generator 3418 // Table of primes 3419 static const unsigned __kmp_primes[] = { 3420 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3421 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3422 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3423 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3424 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3425 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3426 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3427 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3428 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3429 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3430 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3431 3432 //--------------------------------------------------------------------------- 3433 // __kmp_get_random: Get a random number using a linear congruential method. 3434 unsigned short __kmp_get_random(kmp_info_t *thread) { 3435 unsigned x = thread->th.th_x; 3436 unsigned short r = (unsigned short)(x >> 16); 3437 3438 thread->th.th_x = x * thread->th.th_a + 1; 3439 3440 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3441 thread->th.th_info.ds.ds_tid, r)); 3442 3443 return r; 3444 } 3445 //-------------------------------------------------------- 3446 // __kmp_init_random: Initialize a random number generator 3447 void __kmp_init_random(kmp_info_t *thread) { 3448 unsigned seed = thread->th.th_info.ds.ds_tid; 3449 3450 thread->th.th_a = 3451 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3452 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3453 KA_TRACE(30, 3454 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3455 } 3456 3457 #if KMP_OS_WINDOWS 3458 /* reclaim array entries for root threads that are already dead, returns number 3459 * reclaimed */ 3460 static int __kmp_reclaim_dead_roots(void) { 3461 int i, r = 0; 3462 3463 for (i = 0; i < __kmp_threads_capacity; ++i) { 3464 if (KMP_UBER_GTID(i) && 3465 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3466 !__kmp_root[i] 3467 ->r.r_active) { // AC: reclaim only roots died in non-active state 3468 r += __kmp_unregister_root_other_thread(i); 3469 } 3470 } 3471 return r; 3472 } 3473 #endif 3474 3475 /* This function attempts to create free entries in __kmp_threads and 3476 __kmp_root, and returns the number of free entries generated. 3477 3478 For Windows* OS static library, the first mechanism used is to reclaim array 3479 entries for root threads that are already dead. 3480 3481 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3482 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3483 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3484 threadprivate cache array has been created. Synchronization with 3485 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3486 3487 After any dead root reclamation, if the clipping value allows array expansion 3488 to result in the generation of a total of nNeed free slots, the function does 3489 that expansion. If not, nothing is done beyond the possible initial root 3490 thread reclamation. 3491 3492 If any argument is negative, the behavior is undefined. */ 3493 static int __kmp_expand_threads(int nNeed) { 3494 int added = 0; 3495 int minimumRequiredCapacity; 3496 int newCapacity; 3497 kmp_info_t **newThreads; 3498 kmp_root_t **newRoot; 3499 3500 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3501 // resizing __kmp_threads does not need additional protection if foreign 3502 // threads are present 3503 3504 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3505 /* only for Windows static library */ 3506 /* reclaim array entries for root threads that are already dead */ 3507 added = __kmp_reclaim_dead_roots(); 3508 3509 if (nNeed) { 3510 nNeed -= added; 3511 if (nNeed < 0) 3512 nNeed = 0; 3513 } 3514 #endif 3515 if (nNeed <= 0) 3516 return added; 3517 3518 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3519 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3520 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3521 // > __kmp_max_nth in one of two ways: 3522 // 3523 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3524 // may not be reused by another thread, so we may need to increase 3525 // __kmp_threads_capacity to __kmp_max_nth + 1. 3526 // 3527 // 2) New foreign root(s) are encountered. We always register new foreign 3528 // roots. This may cause a smaller # of threads to be allocated at 3529 // subsequent parallel regions, but the worker threads hang around (and 3530 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3531 // 3532 // Anyway, that is the reason for moving the check to see if 3533 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3534 // instead of having it performed here. -BB 3535 3536 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3537 3538 /* compute expansion headroom to check if we can expand */ 3539 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3540 /* possible expansion too small -- give up */ 3541 return added; 3542 } 3543 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3544 3545 newCapacity = __kmp_threads_capacity; 3546 do { 3547 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
                   (newCapacity << 1) : __kmp_sys_max_nth;
  } while (newCapacity < minimumRequiredCapacity);
  newThreads = (kmp_info_t **)__kmp_allocate(
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
  newRoot =
      (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
  KMP_MEMCPY(newThreads, __kmp_threads,
             __kmp_threads_capacity * sizeof(kmp_info_t *));
  KMP_MEMCPY(newRoot, __kmp_root,
             __kmp_threads_capacity * sizeof(kmp_root_t *));

  kmp_info_t **temp_threads = __kmp_threads;
  *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
  *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
  __kmp_free(temp_threads);
  added += newCapacity - __kmp_threads_capacity;
  *(volatile int *)&__kmp_threads_capacity = newCapacity;

  if (newCapacity > __kmp_tp_capacity) {
    __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
    if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
      __kmp_threadprivate_resize_cache(newCapacity);
    } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
      *(volatile int *)&__kmp_tp_capacity = newCapacity;
    }
    __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
  }

  return added;
}

/* Register the current thread as a root thread and obtain our gtid. We must
   have the __kmp_initz_lock held at this point. Argument TRUE is passed only
   if we are the thread that calls from __kmp_do_serial_initialize() */
int __kmp_register_root(int initial_thread) {
  kmp_info_t *root_thread;
  kmp_root_t *root;
  int gtid;
  int capacity;
  __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
  KA_TRACE(20, ("__kmp_register_root: entered\n"));
  KMP_MB();

  /* 2007-03-02:
     If the initial thread did not invoke the OpenMP RTL yet, and this thread
     is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
     condition does not work as expected -- it may return false (meaning there
     is at least one empty slot in the __kmp_threads array), but it is possible
     that the only free slot is #0, which is reserved for the initial thread
     and so cannot be used for this one. The following code works around this
     bug.

     However, the right solution seems to be not to reserve slot #0 for the
     initial thread, because:
     (1) there is no magic in slot #0,
     (2) we cannot detect the initial thread reliably (the first thread that
         does serial initialization may not be the real initial thread).
  */
  capacity = __kmp_threads_capacity;
  if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }

  /* see if there are too many threads */
  if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) {
    if (__kmp_tp_cached) {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread),
                  KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                  KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
    } else {
      __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads),
                  __kmp_msg_null);
    }
  }

  // When hidden helper tasks are enabled, __kmp_threads is organized as
  // follows:
  // 0: initial thread, also a regular OpenMP thread.
  // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads.
  // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for
  // regular OpenMP threads.
  if (TCR_4(__kmp_init_hidden_helper_threads)) {
    // Find an available thread slot for hidden helper thread.
Slots for hidden 3629 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3630 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3631 gtid <= __kmp_hidden_helper_threads_num; 3632 gtid++) 3633 ; 3634 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3635 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3636 "hidden helper thread: T#%d\n", 3637 gtid)); 3638 } else { 3639 /* find an available thread slot */ 3640 // Don't reassign the zero slot since we need that to only be used by 3641 // initial thread. Slots for hidden helper threads should also be skipped. 3642 if (initial_thread && __kmp_threads[0] == NULL) { 3643 gtid = 0; 3644 } else { 3645 for (gtid = __kmp_hidden_helper_threads_num + 1; 3646 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3647 ; 3648 } 3649 KA_TRACE( 3650 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3651 KMP_ASSERT(gtid < __kmp_threads_capacity); 3652 } 3653 3654 /* update global accounting */ 3655 __kmp_all_nth++; 3656 TCW_4(__kmp_nth, __kmp_nth + 1); 3657 3658 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3659 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3660 if (__kmp_adjust_gtid_mode) { 3661 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3662 if (TCR_4(__kmp_gtid_mode) != 2) { 3663 TCW_4(__kmp_gtid_mode, 2); 3664 } 3665 } else { 3666 if (TCR_4(__kmp_gtid_mode) != 1) { 3667 TCW_4(__kmp_gtid_mode, 1); 3668 } 3669 } 3670 } 3671 3672 #ifdef KMP_ADJUST_BLOCKTIME 3673 /* Adjust blocktime to zero if necessary */ 3674 /* Middle initialization might not have occurred yet */ 3675 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3676 if (__kmp_nth > __kmp_avail_proc) { 3677 __kmp_zero_bt = TRUE; 3678 } 3679 } 3680 #endif /* KMP_ADJUST_BLOCKTIME */ 3681 3682 /* setup this new hierarchy */ 3683 if (!(root = __kmp_root[gtid])) { 3684 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3685 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3686 } 3687 3688 #if KMP_STATS_ENABLED 3689 // Initialize stats as soon as possible (right after gtid assignment). 
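// The slot search above follows the __kmp_threads layout described in the
// comment before it: slot 0 is reserved for the initial thread, slots
// 1..__kmp_hidden_helper_threads_num for hidden helper threads, and regular
// roots take the first free slot after the hidden-helper range. A minimal
// standalone sketch of that selection, kept out of the build under #if 0;
// the function and parameter names below are illustrative only.
#if 0
#include <stddef.h>

static int pick_root_slot(void *slots[], int capacity, int num_hidden,
                          int is_initial_thread, int registering_hidden) {
  if (is_initial_thread && slots[0] == NULL)
    return 0; // the initial thread always takes slot 0
  int gtid = registering_hidden ? 1 : num_hidden + 1;
  int limit = registering_hidden ? num_hidden + 1 : capacity;
  while (gtid < limit && slots[gtid] != NULL)
    ++gtid; // first free slot in the appropriate range
  return (gtid < limit) ? gtid : -1; // -1: no free slot in that range
}
#endif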
3690 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3691 __kmp_stats_thread_ptr->startLife(); 3692 KMP_SET_THREAD_STATE(SERIAL_REGION); 3693 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3694 #endif 3695 __kmp_initialize_root(root); 3696 3697 /* setup new root thread structure */ 3698 if (root->r.r_uber_thread) { 3699 root_thread = root->r.r_uber_thread; 3700 } else { 3701 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3702 if (__kmp_storage_map) { 3703 __kmp_print_thread_storage_map(root_thread, gtid); 3704 } 3705 root_thread->th.th_info.ds.ds_gtid = gtid; 3706 #if OMPT_SUPPORT 3707 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3708 #endif 3709 root_thread->th.th_root = root; 3710 if (__kmp_env_consistency_check) { 3711 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3712 } 3713 #if USE_FAST_MEMORY 3714 __kmp_initialize_fast_memory(root_thread); 3715 #endif /* USE_FAST_MEMORY */ 3716 3717 #if KMP_USE_BGET 3718 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3719 __kmp_initialize_bget(root_thread); 3720 #endif 3721 __kmp_init_random(root_thread); // Initialize random number generator 3722 } 3723 3724 /* setup the serial team held in reserve by the root thread */ 3725 if (!root_thread->th.th_serial_team) { 3726 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3727 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3728 root_thread->th.th_serial_team = __kmp_allocate_team( 3729 root, 1, 1, 3730 #if OMPT_SUPPORT 3731 ompt_data_none, // root parallel id 3732 #endif 3733 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3734 } 3735 KMP_ASSERT(root_thread->th.th_serial_team); 3736 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3737 root_thread->th.th_serial_team)); 3738 3739 /* drop root_thread into place */ 3740 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3741 3742 root->r.r_root_team->t.t_threads[0] = root_thread; 3743 root->r.r_hot_team->t.t_threads[0] = root_thread; 3744 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3745 // AC: the team created in reserve, not for execution (it is unused for now). 3746 root_thread->th.th_serial_team->t.t_serialized = 0; 3747 root->r.r_uber_thread = root_thread; 3748 3749 /* initialize the thread, get it ready to go */ 3750 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3751 TCW_4(__kmp_init_gtid, TRUE); 3752 3753 /* prepare the primary thread for get_gtid() */ 3754 __kmp_gtid_set_specific(gtid); 3755 3756 #if USE_ITT_BUILD 3757 __kmp_itt_thread_name(gtid); 3758 #endif /* USE_ITT_BUILD */ 3759 3760 #ifdef KMP_TDATA_GTID 3761 __kmp_gtid = gtid; 3762 #endif 3763 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3764 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3765 3766 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3767 "plain=%u\n", 3768 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3769 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3770 KMP_INIT_BARRIER_STATE)); 3771 { // Initialize barrier data. 
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
  if (TCR_4(__kmp_init_middle)) {
    __kmp_affinity_set_init_mask(gtid, TRUE);
  }
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (!initial_thread && ompt_enabled.enabled) {

    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(root_thread, ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(root_thread, ompt_state_work_serial);
  }
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}

#if KMP_NESTED_HOT_TEAMS
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team = hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // primary thread is not freed
  if (level < max_level - 1) {
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
#endif

// Resets a root thread and clears its root and hot teams.
// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPT_SUPPORT
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
  i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
  KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
                 " to %d\n",
                 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
                 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
  if (i == 1) {
    // need to free contention group structure
    KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
                     root->r.r_uber_thread->th.th_cg_roots->cg_root);
    KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
    __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
    root->r.r_uber_thread->th.th_cg_roots = NULL;
  }
  __kmp_reap_thread(root->r.r_uber_thread, 1);

  // We cannot put the root thread into __kmp_thread_pool, so we have to reap
  // it instead of freeing.
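// The block above drops this root's reference on its contention-group node:
// cg_nthreads is decremented, and whichever thread observes the old value 1
// frees the node. A minimal standalone sketch of that reference-counting
// pattern, kept out of the build under #if 0; the names below are
// illustrative only, and unlike the real code (which runs under the
// fork/join lock) no locking is shown.
#if 0
#include <stdlib.h>

typedef struct cg_node {
  int nthreads;       // number of threads attached to this contention group
  struct cg_node *up; // parent group, NULL for a root-level group
} cg_node_t;

static void cg_detach(cg_node_t **slot) {
  cg_node_t *node = *slot;
  int old = node->nthreads--; // value before the decrement
  if (old == 1)               // we were the last thread in the group
    free(node);
  *slot = NULL; // the detaching thread no longer points at the group
}
#endif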
3948 root->r.r_uber_thread = NULL; 3949 /* mark root as no longer in use */ 3950 root->r.r_begin = FALSE; 3951 3952 return n; 3953 } 3954 3955 void __kmp_unregister_root_current_thread(int gtid) { 3956 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3957 /* this lock should be ok, since unregister_root_current_thread is never 3958 called during an abort, only during a normal close. furthermore, if you 3959 have the forkjoin lock, you should never try to get the initz lock */ 3960 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3961 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3962 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3963 "exiting T#%d\n", 3964 gtid)); 3965 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3966 return; 3967 } 3968 kmp_root_t *root = __kmp_root[gtid]; 3969 3970 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3971 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3972 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3973 KMP_ASSERT(root->r.r_active == FALSE); 3974 3975 KMP_MB(); 3976 3977 kmp_info_t *thread = __kmp_threads[gtid]; 3978 kmp_team_t *team = thread->th.th_team; 3979 kmp_task_team_t *task_team = thread->th.th_task_team; 3980 3981 // we need to wait for the proxy tasks before finishing the thread 3982 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3983 #if OMPT_SUPPORT 3984 // the runtime is shutting down so we won't report any events 3985 thread->th.ompt_thread_info.state = ompt_state_undefined; 3986 #endif 3987 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3988 } 3989 3990 __kmp_reset_root(gtid, root); 3991 3992 KMP_MB(); 3993 KC_TRACE(10, 3994 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3995 3996 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3997 } 3998 3999 #if KMP_OS_WINDOWS 4000 /* __kmp_forkjoin_lock must be already held 4001 Unregisters a root thread that is not the current thread. Returns the number 4002 of __kmp_threads entries freed as a result. 
*/ 4003 static int __kmp_unregister_root_other_thread(int gtid) { 4004 kmp_root_t *root = __kmp_root[gtid]; 4005 int r; 4006 4007 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4008 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4009 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4010 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4011 KMP_ASSERT(root->r.r_active == FALSE); 4012 4013 r = __kmp_reset_root(gtid, root); 4014 KC_TRACE(10, 4015 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4016 return r; 4017 } 4018 #endif 4019 4020 #if KMP_DEBUG 4021 void __kmp_task_info() { 4022 4023 kmp_int32 gtid = __kmp_entry_gtid(); 4024 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4025 kmp_info_t *this_thr = __kmp_threads[gtid]; 4026 kmp_team_t *steam = this_thr->th.th_serial_team; 4027 kmp_team_t *team = this_thr->th.th_team; 4028 4029 __kmp_printf( 4030 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4031 "ptask=%p\n", 4032 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4033 team->t.t_implicit_task_taskdata[tid].td_parent); 4034 } 4035 #endif // KMP_DEBUG 4036 4037 /* TODO optimize with one big memclr, take out what isn't needed, split 4038 responsibility to workers as much as possible, and delay initialization of 4039 features as much as possible */ 4040 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4041 int tid, int gtid) { 4042 /* this_thr->th.th_info.ds.ds_gtid is setup in 4043 kmp_allocate_thread/create_worker. 4044 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4045 kmp_info_t *master = team->t.t_threads[0]; 4046 KMP_DEBUG_ASSERT(this_thr != NULL); 4047 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4048 KMP_DEBUG_ASSERT(team); 4049 KMP_DEBUG_ASSERT(team->t.t_threads); 4050 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4051 KMP_DEBUG_ASSERT(master); 4052 KMP_DEBUG_ASSERT(master->th.th_root); 4053 4054 KMP_MB(); 4055 4056 TCW_SYNC_PTR(this_thr->th.th_team, team); 4057 4058 this_thr->th.th_info.ds.ds_tid = tid; 4059 this_thr->th.th_set_nproc = 0; 4060 if (__kmp_tasking_mode != tskm_immediate_exec) 4061 // When tasking is possible, threads are not safe to reap until they are 4062 // done tasking; this will be set when tasking code is exited in wait 4063 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4064 else // no tasking --> always safe to reap 4065 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4066 this_thr->th.th_set_proc_bind = proc_bind_default; 4067 #if KMP_AFFINITY_SUPPORTED 4068 this_thr->th.th_new_place = this_thr->th.th_current_place; 4069 #endif 4070 this_thr->th.th_root = master->th.th_root; 4071 4072 /* setup the thread's cache of the team structure */ 4073 this_thr->th.th_team_nproc = team->t.t_nproc; 4074 this_thr->th.th_team_master = master; 4075 this_thr->th.th_team_serialized = team->t.t_serialized; 4076 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4077 4078 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4079 4080 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4081 tid, gtid, this_thr, this_thr->th.th_current_task)); 4082 4083 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4084 team, tid, TRUE); 4085 4086 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4087 tid, gtid, this_thr, this_thr->th.th_current_task)); 4088 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4089 // __kmp_initialize_team()? 
4090 4091 /* TODO no worksharing in speculative threads */ 4092 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4093 4094 this_thr->th.th_local.this_construct = 0; 4095 4096 if (!this_thr->th.th_pri_common) { 4097 this_thr->th.th_pri_common = 4098 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4099 if (__kmp_storage_map) { 4100 __kmp_print_storage_map_gtid( 4101 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4102 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4103 } 4104 this_thr->th.th_pri_head = NULL; 4105 } 4106 4107 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4108 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4109 // Make new thread's CG root same as primary thread's 4110 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4111 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4112 if (tmp) { 4113 // worker changes CG, need to check if old CG should be freed 4114 int i = tmp->cg_nthreads--; 4115 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4116 " on node %p of thread %p to %d\n", 4117 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4118 if (i == 1) { 4119 __kmp_free(tmp); // last thread left CG --> free it 4120 } 4121 } 4122 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4123 // Increment new thread's CG root's counter to add the new thread 4124 this_thr->th.th_cg_roots->cg_nthreads++; 4125 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4126 " node %p of thread %p to %d\n", 4127 this_thr, this_thr->th.th_cg_roots, 4128 this_thr->th.th_cg_roots->cg_root, 4129 this_thr->th.th_cg_roots->cg_nthreads)); 4130 this_thr->th.th_current_task->td_icvs.thread_limit = 4131 this_thr->th.th_cg_roots->cg_thread_limit; 4132 } 4133 4134 /* Initialize dynamic dispatch */ 4135 { 4136 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4137 // Use team max_nproc since this will never change for the team. 4138 size_t disp_size = 4139 sizeof(dispatch_private_info_t) * 4140 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4141 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4142 team->t.t_max_nproc)); 4143 KMP_ASSERT(dispatch); 4144 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4145 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4146 4147 dispatch->th_disp_index = 0; 4148 dispatch->th_doacross_buf_idx = 0; 4149 if (!dispatch->th_disp_buffer) { 4150 dispatch->th_disp_buffer = 4151 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4152 4153 if (__kmp_storage_map) { 4154 __kmp_print_storage_map_gtid( 4155 gtid, &dispatch->th_disp_buffer[0], 4156 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4157 ? 
1 4158 : __kmp_dispatch_num_buffers], 4159 disp_size, 4160 "th_%d.th_dispatch.th_disp_buffer " 4161 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4162 gtid, team->t.t_id, gtid); 4163 } 4164 } else { 4165 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4166 } 4167 4168 dispatch->th_dispatch_pr_current = 0; 4169 dispatch->th_dispatch_sh_current = 0; 4170 4171 dispatch->th_deo_fcn = 0; /* ORDERED */ 4172 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4173 } 4174 4175 this_thr->th.th_next_pool = NULL; 4176 4177 if (!this_thr->th.th_task_state_memo_stack) { 4178 size_t i; 4179 this_thr->th.th_task_state_memo_stack = 4180 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4181 this_thr->th.th_task_state_top = 0; 4182 this_thr->th.th_task_state_stack_sz = 4; 4183 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4184 ++i) // zero init the stack 4185 this_thr->th.th_task_state_memo_stack[i] = 0; 4186 } 4187 4188 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4189 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4190 4191 KMP_MB(); 4192 } 4193 4194 /* allocate a new thread for the requesting team. this is only called from 4195 within a forkjoin critical section. we will first try to get an available 4196 thread from the thread pool. if none is available, we will fork a new one 4197 assuming we are able to create a new one. this should be assured, as the 4198 caller should check on this first. */ 4199 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4200 int new_tid) { 4201 kmp_team_t *serial_team; 4202 kmp_info_t *new_thr; 4203 int new_gtid; 4204 4205 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4206 KMP_DEBUG_ASSERT(root && team); 4207 #if !KMP_NESTED_HOT_TEAMS 4208 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4209 #endif 4210 KMP_MB(); 4211 4212 /* first, try to get one from the thread pool */ 4213 if (__kmp_thread_pool) { 4214 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4215 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4216 if (new_thr == __kmp_thread_pool_insert_pt) { 4217 __kmp_thread_pool_insert_pt = NULL; 4218 } 4219 TCW_4(new_thr->th.th_in_pool, FALSE); 4220 __kmp_suspend_initialize_thread(new_thr); 4221 __kmp_lock_suspend_mx(new_thr); 4222 if (new_thr->th.th_active_in_pool == TRUE) { 4223 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4224 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4225 new_thr->th.th_active_in_pool = FALSE; 4226 } 4227 __kmp_unlock_suspend_mx(new_thr); 4228 4229 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4230 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4231 KMP_ASSERT(!new_thr->th.th_team); 4232 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4233 4234 /* setup the thread structure */ 4235 __kmp_initialize_info(new_thr, team, new_tid, 4236 new_thr->th.th_info.ds.ds_gtid); 4237 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4238 4239 TCW_4(__kmp_nth, __kmp_nth + 1); 4240 4241 new_thr->th.th_task_state = 0; 4242 new_thr->th.th_task_state_top = 0; 4243 new_thr->th.th_task_state_stack_sz = 4; 4244 4245 #ifdef KMP_ADJUST_BLOCKTIME 4246 /* Adjust blocktime back to zero if necessary */ 4247 /* Middle initialization might not have occurred yet */ 4248 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4249 if (__kmp_nth > __kmp_avail_proc) { 4250 __kmp_zero_bt = TRUE; 4251 } 4252 } 4253 #endif /* KMP_ADJUST_BLOCKTIME */ 4254 4255 #if KMP_DEBUG 4256 // If thread entered pool via __kmp_free_thread, wait_flag should != 4257 // KMP_BARRIER_PARENT_FLAG. 
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, we'll fork a new one */
  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread. We try to do this as early as possible.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until the monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is a high probability
      // that the monitor thread starts after the library shutdown. At shutdown
      // it is too late to cope with the problem, because when the primary
      // thread is in DllMain (process detach) the monitor has no chance to
      // start (it is blocked), and the primary thread has no means to inform
      // the monitor that the library has gone, because all the memory which
      // the monitor can access is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();

  {
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;

    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }

    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }

  /* allocate space for it.
*/ 4323 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4324 4325 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4326 4327 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4328 // suppress race conditions detection on synchronization flags in debug mode 4329 // this helps to analyze library internals eliminating false positives 4330 __itt_suppress_mark_range( 4331 __itt_suppress_range, __itt_suppress_threading_errors, 4332 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4333 __itt_suppress_mark_range( 4334 __itt_suppress_range, __itt_suppress_threading_errors, 4335 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4336 #if KMP_OS_WINDOWS 4337 __itt_suppress_mark_range( 4338 __itt_suppress_range, __itt_suppress_threading_errors, 4339 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4340 #else 4341 __itt_suppress_mark_range(__itt_suppress_range, 4342 __itt_suppress_threading_errors, 4343 &new_thr->th.th_suspend_init_count, 4344 sizeof(new_thr->th.th_suspend_init_count)); 4345 #endif 4346 // TODO: check if we need to also suppress b_arrived flags 4347 __itt_suppress_mark_range(__itt_suppress_range, 4348 __itt_suppress_threading_errors, 4349 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4350 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4351 __itt_suppress_mark_range(__itt_suppress_range, 4352 __itt_suppress_threading_errors, 4353 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4354 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4355 __itt_suppress_mark_range(__itt_suppress_range, 4356 __itt_suppress_threading_errors, 4357 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4358 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4359 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4360 if (__kmp_storage_map) { 4361 __kmp_print_thread_storage_map(new_thr, new_gtid); 4362 } 4363 4364 // add the reserve serialized team, initialized from the team's primary thread 4365 { 4366 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4367 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4368 new_thr->th.th_serial_team = serial_team = 4369 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4370 #if OMPT_SUPPORT 4371 ompt_data_none, // root parallel id 4372 #endif 4373 proc_bind_default, &r_icvs, 4374 0 USE_NESTED_HOT_ARG(NULL)); 4375 } 4376 KMP_ASSERT(serial_team); 4377 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4378 // execution (it is unused for now). 
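// The worker set up below seeds its private random-number state via
// __kmp_init_random(); together with __kmp_get_random() (both defined earlier
// in this file) this is a per-thread linear congruential generator: the
// multiplier is a prime picked by thread id, the increment is 1, and the high
// 16 bits of the state are returned. A minimal standalone sketch, kept out of
// the build under #if 0; the short prime table and the names below are
// illustrative only.
#if 0
typedef struct lcg {
  unsigned a; // per-thread multiplier
  unsigned x; // current state
} lcg_t;

static void lcg_init(lcg_t *g, unsigned tid) {
  // First few entries of the __kmp_primes table above; the real table is
  // much longer.
  static const unsigned primes[] = {0x9e3779b1, 0xffe6cc59, 0x2109f6dd,
                                    0x43977ab5};
  g->a = primes[tid % (sizeof(primes) / sizeof(primes[0]))];
  g->x = (tid + 1) * g->a + 1;
}

static unsigned short lcg_next(lcg_t *g) {
  unsigned short r = (unsigned short)(g->x >> 16); // top 16 bits of the state
  g->x = g->x * g->a + 1; // advance: x' = x * a + 1 (mod 2^32)
  return r;
}
#endif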
4379 serial_team->t.t_threads[0] = new_thr; 4380 KF_TRACE(10, 4381 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4382 new_thr)); 4383 4384 /* setup the thread structures */ 4385 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4386 4387 #if USE_FAST_MEMORY 4388 __kmp_initialize_fast_memory(new_thr); 4389 #endif /* USE_FAST_MEMORY */ 4390 4391 #if KMP_USE_BGET 4392 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4393 __kmp_initialize_bget(new_thr); 4394 #endif 4395 4396 __kmp_init_random(new_thr); // Initialize random number generator 4397 4398 /* Initialize these only once when thread is grabbed for a team allocation */ 4399 KA_TRACE(20, 4400 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4401 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4402 4403 int b; 4404 kmp_balign_t *balign = new_thr->th.th_bar; 4405 for (b = 0; b < bs_last_barrier; ++b) { 4406 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4407 balign[b].bb.team = NULL; 4408 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4409 balign[b].bb.use_oncore_barrier = 0; 4410 } 4411 4412 new_thr->th.th_spin_here = FALSE; 4413 new_thr->th.th_next_waiting = 0; 4414 #if KMP_OS_UNIX 4415 new_thr->th.th_blocking = false; 4416 #endif 4417 4418 #if KMP_AFFINITY_SUPPORTED 4419 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4420 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4421 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4422 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4423 #endif 4424 new_thr->th.th_def_allocator = __kmp_def_allocator; 4425 new_thr->th.th_prev_level = 0; 4426 new_thr->th.th_prev_num_threads = 1; 4427 4428 TCW_4(new_thr->th.th_in_pool, FALSE); 4429 new_thr->th.th_active_in_pool = FALSE; 4430 TCW_4(new_thr->th.th_active, TRUE); 4431 4432 /* adjust the global counters */ 4433 __kmp_all_nth++; 4434 __kmp_nth++; 4435 4436 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4437 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4438 if (__kmp_adjust_gtid_mode) { 4439 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4440 if (TCR_4(__kmp_gtid_mode) != 2) { 4441 TCW_4(__kmp_gtid_mode, 2); 4442 } 4443 } else { 4444 if (TCR_4(__kmp_gtid_mode) != 1) { 4445 TCW_4(__kmp_gtid_mode, 1); 4446 } 4447 } 4448 } 4449 4450 #ifdef KMP_ADJUST_BLOCKTIME 4451 /* Adjust blocktime back to zero if necessary */ 4452 /* Middle initialization might not have occurred yet */ 4453 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4454 if (__kmp_nth > __kmp_avail_proc) { 4455 __kmp_zero_bt = TRUE; 4456 } 4457 } 4458 #endif /* KMP_ADJUST_BLOCKTIME */ 4459 4460 /* actually fork it and create the new worker thread */ 4461 KF_TRACE( 4462 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4463 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4464 KF_TRACE(10, 4465 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4466 4467 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4468 new_gtid)); 4469 KMP_MB(); 4470 return new_thr; 4471 } 4472 4473 /* Reinitialize team for reuse. 4474 The hot team code calls this case at every fork barrier, so EPCC barrier 4475 test are extremely sensitive to changes in it, esp. writes to the team 4476 struct, which cause a cache invalidation in all threads. 4477 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ 4478 static void __kmp_reinitialize_team(kmp_team_t *team, 4479 kmp_internal_control_t *new_icvs, 4480 ident_t *loc) { 4481 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4482 team->t.t_threads[0], team)); 4483 KMP_DEBUG_ASSERT(team && new_icvs); 4484 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4485 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4486 4487 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4488 // Copy ICVs to the primary thread's implicit taskdata 4489 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4490 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4491 4492 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4493 team->t.t_threads[0], team)); 4494 } 4495 4496 /* Initialize the team data structure. 4497 This assumes the t_threads and t_max_nproc are already set. 4498 Also, we don't touch the arguments */ 4499 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4500 kmp_internal_control_t *new_icvs, 4501 ident_t *loc) { 4502 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4503 4504 /* verify */ 4505 KMP_DEBUG_ASSERT(team); 4506 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4507 KMP_DEBUG_ASSERT(team->t.t_threads); 4508 KMP_MB(); 4509 4510 team->t.t_master_tid = 0; /* not needed */ 4511 /* team->t.t_master_bar; not needed */ 4512 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4513 team->t.t_nproc = new_nproc; 4514 4515 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4516 team->t.t_next_pool = NULL; 4517 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4518 * up hot team */ 4519 4520 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4521 team->t.t_invoke = NULL; /* not needed */ 4522 4523 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4524 team->t.t_sched.sched = new_icvs->sched.sched; 4525 4526 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4527 team->t.t_fp_control_saved = FALSE; /* not needed */ 4528 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4529 team->t.t_mxcsr = 0; /* not needed */ 4530 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4531 4532 team->t.t_construct = 0; 4533 4534 team->t.t_ordered.dt.t_value = 0; 4535 team->t.t_master_active = FALSE; 4536 4537 #ifdef KMP_DEBUG 4538 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4539 #endif 4540 #if KMP_OS_WINDOWS 4541 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4542 #endif 4543 4544 team->t.t_control_stack_top = NULL; 4545 4546 __kmp_reinitialize_team(team, new_icvs, loc); 4547 4548 KMP_MB(); 4549 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4550 } 4551 4552 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4553 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4554 static void 4555 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4556 if (KMP_AFFINITY_CAPABLE()) { 4557 int status; 4558 if (old_mask != NULL) { 4559 status = __kmp_get_system_affinity(old_mask, TRUE); 4560 int error = errno; 4561 if (status != 0) { 4562 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4563 __kmp_msg_null); 4564 } 4565 } 4566 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4567 } 4568 } 4569 #endif 4570 4571 #if KMP_AFFINITY_SUPPORTED 4572 4573 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4574 // It calculates the worker + primary thread's partition based upon the parent 4575 // thread's partition, and binds each worker to a thread in their partition. 4576 // The primary thread's partition should already include its current binding. 4577 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4578 // Copy the primary thread's place partition to the team struct 4579 kmp_info_t *master_th = team->t.t_threads[0]; 4580 KMP_DEBUG_ASSERT(master_th != NULL); 4581 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4582 int first_place = master_th->th.th_first_place; 4583 int last_place = master_th->th.th_last_place; 4584 int masters_place = master_th->th.th_current_place; 4585 team->t.t_first_place = first_place; 4586 team->t.t_last_place = last_place; 4587 4588 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4589 "bound to place %d partition = [%d,%d]\n", 4590 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4591 team->t.t_id, masters_place, first_place, last_place)); 4592 4593 switch (proc_bind) { 4594 4595 case proc_bind_default: 4596 // Serial teams might have the proc_bind policy set to proc_bind_default. 4597 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4598 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4599 break; 4600 4601 case proc_bind_primary: { 4602 int f; 4603 int n_th = team->t.t_nproc; 4604 for (f = 1; f < n_th; f++) { 4605 kmp_info_t *th = team->t.t_threads[f]; 4606 KMP_DEBUG_ASSERT(th != NULL); 4607 th->th.th_first_place = first_place; 4608 th->th.th_last_place = last_place; 4609 th->th.th_new_place = masters_place; 4610 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4611 team->t.t_display_affinity != 1) { 4612 team->t.t_display_affinity = 1; 4613 } 4614 4615 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4616 "partition = [%d,%d]\n", 4617 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4618 f, masters_place, first_place, last_place)); 4619 } 4620 } break; 4621 4622 case proc_bind_close: { 4623 int f; 4624 int n_th = team->t.t_nproc; 4625 int n_places; 4626 if (first_place <= last_place) { 4627 n_places = last_place - first_place + 1; 4628 } else { 4629 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4630 } 4631 if (n_th <= n_places) { 4632 int place = masters_place; 4633 for (f = 1; f < n_th; f++) { 4634 kmp_info_t *th = team->t.t_threads[f]; 4635 KMP_DEBUG_ASSERT(th != NULL); 4636 4637 if (place == last_place) { 4638 place = first_place; 4639 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4640 place = 0; 4641 } else { 4642 place++; 4643 } 4644 th->th.th_first_place = first_place; 4645 th->th.th_last_place = last_place; 4646 th->th.th_new_place = place; 4647 if (__kmp_display_affinity && place != th->th.th_current_place && 4648 team->t.t_display_affinity != 1) { 4649 team->t.t_display_affinity = 1; 4650 } 4651 4652 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4653 "partition = [%d,%d]\n", 4654 __kmp_gtid_from_thread(team->t.t_threads[f]), 4655 team->t.t_id, f, place, first_place, last_place)); 4656 } 4657 } else { 4658 int S, rem, gap, s_count; 4659 S = n_th / n_places; 4660 s_count = 0; 4661 rem = n_th - (S * n_places); 4662 gap = rem > 0 ? 
n_places / rem : n_places; 4663 int place = masters_place; 4664 int gap_ct = gap; 4665 for (f = 0; f < n_th; f++) { 4666 kmp_info_t *th = team->t.t_threads[f]; 4667 KMP_DEBUG_ASSERT(th != NULL); 4668 4669 th->th.th_first_place = first_place; 4670 th->th.th_last_place = last_place; 4671 th->th.th_new_place = place; 4672 if (__kmp_display_affinity && place != th->th.th_current_place && 4673 team->t.t_display_affinity != 1) { 4674 team->t.t_display_affinity = 1; 4675 } 4676 s_count++; 4677 4678 if ((s_count == S) && rem && (gap_ct == gap)) { 4679 // do nothing, add an extra thread to place on next iteration 4680 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4681 // we added an extra thread to this place; move to next place 4682 if (place == last_place) { 4683 place = first_place; 4684 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4685 place = 0; 4686 } else { 4687 place++; 4688 } 4689 s_count = 0; 4690 gap_ct = 1; 4691 rem--; 4692 } else if (s_count == S) { // place full; don't add extra 4693 if (place == last_place) { 4694 place = first_place; 4695 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4696 place = 0; 4697 } else { 4698 place++; 4699 } 4700 gap_ct++; 4701 s_count = 0; 4702 } 4703 4704 KA_TRACE(100, 4705 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4706 "partition = [%d,%d]\n", 4707 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4708 th->th.th_new_place, first_place, last_place)); 4709 } 4710 KMP_DEBUG_ASSERT(place == masters_place); 4711 } 4712 } break; 4713 4714 case proc_bind_spread: { 4715 int f; 4716 int n_th = team->t.t_nproc; 4717 int n_places; 4718 int thidx; 4719 if (first_place <= last_place) { 4720 n_places = last_place - first_place + 1; 4721 } else { 4722 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4723 } 4724 if (n_th <= n_places) { 4725 int place = -1; 4726 4727 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4728 int S = n_places / n_th; 4729 int s_count, rem, gap, gap_ct; 4730 4731 place = masters_place; 4732 rem = n_places - n_th * S; 4733 gap = rem ? 
n_th / rem : 1; 4734 gap_ct = gap; 4735 thidx = n_th; 4736 if (update_master_only == 1) 4737 thidx = 1; 4738 for (f = 0; f < thidx; f++) { 4739 kmp_info_t *th = team->t.t_threads[f]; 4740 KMP_DEBUG_ASSERT(th != NULL); 4741 4742 th->th.th_first_place = place; 4743 th->th.th_new_place = place; 4744 if (__kmp_display_affinity && place != th->th.th_current_place && 4745 team->t.t_display_affinity != 1) { 4746 team->t.t_display_affinity = 1; 4747 } 4748 s_count = 1; 4749 while (s_count < S) { 4750 if (place == last_place) { 4751 place = first_place; 4752 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4753 place = 0; 4754 } else { 4755 place++; 4756 } 4757 s_count++; 4758 } 4759 if (rem && (gap_ct == gap)) { 4760 if (place == last_place) { 4761 place = first_place; 4762 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4763 place = 0; 4764 } else { 4765 place++; 4766 } 4767 rem--; 4768 gap_ct = 0; 4769 } 4770 th->th.th_last_place = place; 4771 gap_ct++; 4772 4773 if (place == last_place) { 4774 place = first_place; 4775 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4776 place = 0; 4777 } else { 4778 place++; 4779 } 4780 4781 KA_TRACE(100, 4782 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4783 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4784 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4785 f, th->th.th_new_place, th->th.th_first_place, 4786 th->th.th_last_place, __kmp_affinity_num_masks)); 4787 } 4788 } else { 4789 /* Having uniform space of available computation places I can create 4790 T partitions of round(P/T) size and put threads into the first 4791 place of each partition. */ 4792 double current = static_cast<double>(masters_place); 4793 double spacing = 4794 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4795 int first, last; 4796 kmp_info_t *th; 4797 4798 thidx = n_th + 1; 4799 if (update_master_only == 1) 4800 thidx = 1; 4801 for (f = 0; f < thidx; f++) { 4802 first = static_cast<int>(current); 4803 last = static_cast<int>(current + spacing) - 1; 4804 KMP_DEBUG_ASSERT(last >= first); 4805 if (first >= n_places) { 4806 if (masters_place) { 4807 first -= n_places; 4808 last -= n_places; 4809 if (first == (masters_place + 1)) { 4810 KMP_DEBUG_ASSERT(f == n_th); 4811 first--; 4812 } 4813 if (last == masters_place) { 4814 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4815 last--; 4816 } 4817 } else { 4818 KMP_DEBUG_ASSERT(f == n_th); 4819 first = 0; 4820 last = 0; 4821 } 4822 } 4823 if (last >= n_places) { 4824 last = (n_places - 1); 4825 } 4826 place = first; 4827 current += spacing; 4828 if (f < n_th) { 4829 KMP_DEBUG_ASSERT(0 <= first); 4830 KMP_DEBUG_ASSERT(n_places > first); 4831 KMP_DEBUG_ASSERT(0 <= last); 4832 KMP_DEBUG_ASSERT(n_places > last); 4833 KMP_DEBUG_ASSERT(last_place >= first_place); 4834 th = team->t.t_threads[f]; 4835 KMP_DEBUG_ASSERT(th); 4836 th->th.th_first_place = first; 4837 th->th.th_new_place = place; 4838 th->th.th_last_place = last; 4839 if (__kmp_display_affinity && place != th->th.th_current_place && 4840 team->t.t_display_affinity != 1) { 4841 team->t.t_display_affinity = 1; 4842 } 4843 KA_TRACE(100, 4844 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4845 "partition = [%d,%d], spacing = %.4f\n", 4846 __kmp_gtid_from_thread(team->t.t_threads[f]), 4847 team->t.t_id, f, th->th.th_new_place, 4848 th->th.th_first_place, th->th.th_last_place, spacing)); 4849 } 4850 } 4851 } 4852 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4853 } else { 4854 int S, rem, gap, 
s_count; 4855 S = n_th / n_places; 4856 s_count = 0; 4857 rem = n_th - (S * n_places); 4858 gap = rem > 0 ? n_places / rem : n_places; 4859 int place = masters_place; 4860 int gap_ct = gap; 4861 thidx = n_th; 4862 if (update_master_only == 1) 4863 thidx = 1; 4864 for (f = 0; f < thidx; f++) { 4865 kmp_info_t *th = team->t.t_threads[f]; 4866 KMP_DEBUG_ASSERT(th != NULL); 4867 4868 th->th.th_first_place = place; 4869 th->th.th_last_place = place; 4870 th->th.th_new_place = place; 4871 if (__kmp_display_affinity && place != th->th.th_current_place && 4872 team->t.t_display_affinity != 1) { 4873 team->t.t_display_affinity = 1; 4874 } 4875 s_count++; 4876 4877 if ((s_count == S) && rem && (gap_ct == gap)) { 4878 // do nothing, add an extra thread to place on next iteration 4879 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4880 // we added an extra thread to this place; move on to next place 4881 if (place == last_place) { 4882 place = first_place; 4883 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4884 place = 0; 4885 } else { 4886 place++; 4887 } 4888 s_count = 0; 4889 gap_ct = 1; 4890 rem--; 4891 } else if (s_count == S) { // place is full; don't add extra thread 4892 if (place == last_place) { 4893 place = first_place; 4894 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4895 place = 0; 4896 } else { 4897 place++; 4898 } 4899 gap_ct++; 4900 s_count = 0; 4901 } 4902 4903 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4904 "partition = [%d,%d]\n", 4905 __kmp_gtid_from_thread(team->t.t_threads[f]), 4906 team->t.t_id, f, th->th.th_new_place, 4907 th->th.th_first_place, th->th.th_last_place)); 4908 } 4909 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4910 } 4911 } break; 4912 4913 default: 4914 break; 4915 } 4916 4917 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4918 } 4919 4920 #endif // KMP_AFFINITY_SUPPORTED 4921 4922 /* allocate a new team data structure to use. take one off of the free pool if 4923 available */ 4924 kmp_team_t * 4925 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4926 #if OMPT_SUPPORT 4927 ompt_data_t ompt_parallel_data, 4928 #endif 4929 kmp_proc_bind_t new_proc_bind, 4930 kmp_internal_control_t *new_icvs, 4931 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4932 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4933 int f; 4934 kmp_team_t *team; 4935 int use_hot_team = !root->r.r_active; 4936 int level = 0; 4937 4938 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4939 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4940 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4941 KMP_MB(); 4942 4943 #if KMP_NESTED_HOT_TEAMS 4944 kmp_hot_team_ptr_t *hot_teams; 4945 if (master) { 4946 team = master->th.th_team; 4947 level = team->t.t_active_level; 4948 if (master->th.th_teams_microtask) { // in teams construct? 
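// Clarifying note (informal reading of the condition that follows): with
// KMP_NESTED_HOT_TEAMS the hot-team cache is indexed by nesting level, so
// `level` has to be adjusted for the teams construct before the
// hot_teams[level] lookup further down:
//   - nteams == 1                       -> level unchanged;
//   - outer fork of the teams construct -> level unchanged (it creates the
//     league itself);
//   - inner fork of the teams primary thread, or a parallel region nested
//     inside teams                      -> ++level, so the cached hot team
//     matches the region actually being forked here.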
4949 if (master->th.th_teams_size.nteams > 1 && 4950 ( // #teams > 1 4951 team->t.t_pkfn == 4952 (microtask_t)__kmp_teams_master || // inner fork of the teams 4953 master->th.th_teams_level < 4954 team->t.t_level)) { // or nested parallel inside the teams 4955 ++level; // not increment if #teams==1, or for outer fork of the teams; 4956 // increment otherwise 4957 } 4958 } 4959 hot_teams = master->th.th_hot_teams; 4960 if (level < __kmp_hot_teams_max_level && hot_teams && 4961 hot_teams[level].hot_team) { 4962 // hot team has already been allocated for given level 4963 use_hot_team = 1; 4964 } else { 4965 use_hot_team = 0; 4966 } 4967 } else { 4968 // check we won't access uninitialized hot_teams, just in case 4969 KMP_DEBUG_ASSERT(new_nproc == 1); 4970 } 4971 #endif 4972 // Optimization to use a "hot" team 4973 if (use_hot_team && new_nproc > 1) { 4974 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4975 #if KMP_NESTED_HOT_TEAMS 4976 team = hot_teams[level].hot_team; 4977 #else 4978 team = root->r.r_hot_team; 4979 #endif 4980 #if KMP_DEBUG 4981 if (__kmp_tasking_mode != tskm_immediate_exec) { 4982 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4983 "task_team[1] = %p before reinit\n", 4984 team->t.t_task_team[0], team->t.t_task_team[1])); 4985 } 4986 #endif 4987 4988 // Has the number of threads changed? 4989 /* Let's assume the most common case is that the number of threads is 4990 unchanged, and put that case first. */ 4991 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4992 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4993 // This case can mean that omp_set_num_threads() was called and the hot 4994 // team size was already reduced, so we check the special flag 4995 if (team->t.t_size_changed == -1) { 4996 team->t.t_size_changed = 1; 4997 } else { 4998 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4999 } 5000 5001 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5002 kmp_r_sched_t new_sched = new_icvs->sched; 5003 // set primary thread's schedule as new run-time schedule 5004 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5005 5006 __kmp_reinitialize_team(team, new_icvs, 5007 root->r.r_uber_thread->th.th_ident); 5008 5009 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5010 team->t.t_threads[0], team)); 5011 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5012 5013 #if KMP_AFFINITY_SUPPORTED 5014 if ((team->t.t_size_changed == 0) && 5015 (team->t.t_proc_bind == new_proc_bind)) { 5016 if (new_proc_bind == proc_bind_spread) { 5017 __kmp_partition_places( 5018 team, 1); // add flag to update only master for spread 5019 } 5020 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5021 "proc_bind = %d, partition = [%d,%d]\n", 5022 team->t.t_id, new_proc_bind, team->t.t_first_place, 5023 team->t.t_last_place)); 5024 } else { 5025 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5026 __kmp_partition_places(team); 5027 } 5028 #else 5029 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5030 #endif /* KMP_AFFINITY_SUPPORTED */ 5031 } else if (team->t.t_nproc > new_nproc) { 5032 KA_TRACE(20, 5033 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5034 new_nproc)); 5035 5036 team->t.t_size_changed = 1; 5037 #if KMP_NESTED_HOT_TEAMS 5038 if (__kmp_hot_teams_mode == 0) { 5039 // AC: saved number of threads should correspond to team's value in this 5040 // mode, can be bigger in mode 1, when hot team has threads in reserve 5041 
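// Informal summary of the two nested-hot-team modes referenced above:
//   mode 0: threads that are no longer needed are freed back to the pool when
//           the hot team shrinks, so hot_team_nth tracks t_nproc (asserted
//           just below);
//   mode 1: surplus threads stay parked in the team waiting on their own b_go
//           flag, so hot_team_nth can exceed the team's current t_nproc.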
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5042 hot_teams[level].hot_team_nth = new_nproc; 5043 #endif // KMP_NESTED_HOT_TEAMS 5044 /* release the extra threads we don't need any more */ 5045 for (f = new_nproc; f < team->t.t_nproc; f++) { 5046 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5047 if (__kmp_tasking_mode != tskm_immediate_exec) { 5048 // When decreasing team size, threads no longer in the team should 5049 // unref task team. 5050 team->t.t_threads[f]->th.th_task_team = NULL; 5051 } 5052 __kmp_free_thread(team->t.t_threads[f]); 5053 team->t.t_threads[f] = NULL; 5054 } 5055 #if KMP_NESTED_HOT_TEAMS 5056 } // (__kmp_hot_teams_mode == 0) 5057 else { 5058 // When keeping extra threads in team, switch threads to wait on own 5059 // b_go flag 5060 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5061 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5062 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5063 for (int b = 0; b < bs_last_barrier; ++b) { 5064 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5065 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5066 } 5067 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5068 } 5069 } 5070 } 5071 #endif // KMP_NESTED_HOT_TEAMS 5072 team->t.t_nproc = new_nproc; 5073 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5074 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5075 __kmp_reinitialize_team(team, new_icvs, 5076 root->r.r_uber_thread->th.th_ident); 5077 5078 // Update remaining threads 5079 for (f = 0; f < new_nproc; ++f) { 5080 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5081 } 5082 5083 // restore the current task state of the primary thread: should be the 5084 // implicit task 5085 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5086 team->t.t_threads[0], team)); 5087 5088 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5089 5090 #ifdef KMP_DEBUG 5091 for (f = 0; f < team->t.t_nproc; f++) { 5092 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5093 team->t.t_threads[f]->th.th_team_nproc == 5094 team->t.t_nproc); 5095 } 5096 #endif 5097 5098 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5099 #if KMP_AFFINITY_SUPPORTED 5100 __kmp_partition_places(team); 5101 #endif 5102 } else { // team->t.t_nproc < new_nproc 5103 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5104 kmp_affin_mask_t *old_mask; 5105 if (KMP_AFFINITY_CAPABLE()) { 5106 KMP_CPU_ALLOC(old_mask); 5107 } 5108 #endif 5109 5110 KA_TRACE(20, 5111 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5112 new_nproc)); 5113 5114 team->t.t_size_changed = 1; 5115 5116 #if KMP_NESTED_HOT_TEAMS 5117 int avail_threads = hot_teams[level].hot_team_nth; 5118 if (new_nproc < avail_threads) 5119 avail_threads = new_nproc; 5120 kmp_info_t **other_threads = team->t.t_threads; 5121 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5122 // Adjust barrier data of reserved threads (if any) of the team 5123 // Other data will be set in __kmp_initialize_info() below. 
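// Note (best-effort reading): reserved threads have been sitting out while the
// hot team ran with fewer workers, so their per-barrier b_arrived counters lag
// behind the team's t_bar[].b_arrived. The loop below copies the team's
// current values into each rejoining thread so that the next barrier's arrival
// accounting stays consistent.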
5124 int b; 5125 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5126 for (b = 0; b < bs_last_barrier; ++b) { 5127 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5128 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5129 #if USE_DEBUGGER 5130 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5131 #endif 5132 } 5133 } 5134 if (hot_teams[level].hot_team_nth >= new_nproc) { 5135 // we have all needed threads in reserve, no need to allocate any 5136 // this only possible in mode 1, cannot have reserved threads in mode 0 5137 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5138 team->t.t_nproc = new_nproc; // just get reserved threads involved 5139 } else { 5140 // we may have some threads in reserve, but not enough 5141 team->t.t_nproc = 5142 hot_teams[level] 5143 .hot_team_nth; // get reserved threads involved if any 5144 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5145 #endif // KMP_NESTED_HOT_TEAMS 5146 if (team->t.t_max_nproc < new_nproc) { 5147 /* reallocate larger arrays */ 5148 __kmp_reallocate_team_arrays(team, new_nproc); 5149 __kmp_reinitialize_team(team, new_icvs, NULL); 5150 } 5151 5152 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5153 /* Temporarily set full mask for primary thread before creation of 5154 workers. The reason is that workers inherit the affinity from the 5155 primary thread, so if a lot of workers are created on the single 5156 core quickly, they don't get a chance to set their own affinity for 5157 a long time. */ 5158 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5159 #endif 5160 5161 /* allocate new threads for the hot team */ 5162 for (f = team->t.t_nproc; f < new_nproc; f++) { 5163 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5164 KMP_DEBUG_ASSERT(new_worker); 5165 team->t.t_threads[f] = new_worker; 5166 5167 KA_TRACE(20, 5168 ("__kmp_allocate_team: team %d init T#%d arrived: " 5169 "join=%llu, plain=%llu\n", 5170 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5171 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5172 team->t.t_bar[bs_plain_barrier].b_arrived)); 5173 5174 { // Initialize barrier data for new threads. 5175 int b; 5176 kmp_balign_t *balign = new_worker->th.th_bar; 5177 for (b = 0; b < bs_last_barrier; ++b) { 5178 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5179 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5180 KMP_BARRIER_PARENT_FLAG); 5181 #if USE_DEBUGGER 5182 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5183 #endif 5184 } 5185 } 5186 } 5187 5188 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5189 if (KMP_AFFINITY_CAPABLE()) { 5190 /* Restore initial primary thread's affinity mask */ 5191 __kmp_set_system_affinity(old_mask, TRUE); 5192 KMP_CPU_FREE(old_mask); 5193 } 5194 #endif 5195 #if KMP_NESTED_HOT_TEAMS 5196 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5197 #endif // KMP_NESTED_HOT_TEAMS 5198 /* make sure everyone is syncronized */ 5199 int old_nproc = team->t.t_nproc; // save old value and use to update only 5200 // new threads below 5201 __kmp_initialize_team(team, new_nproc, new_icvs, 5202 root->r.r_uber_thread->th.th_ident); 5203 5204 /* reinitialize the threads */ 5205 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5206 for (f = 0; f < team->t.t_nproc; ++f) 5207 __kmp_initialize_info(team->t.t_threads[f], team, f, 5208 __kmp_gtid_from_tid(f, team)); 5209 5210 if (level) { // set th_task_state for new threads in nested hot team 5211 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5212 // only need to set the th_task_state for the new threads. th_task_state 5213 // for primary thread will not be accurate until after this in 5214 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5215 // get the correct value. 5216 for (f = old_nproc; f < team->t.t_nproc; ++f) 5217 team->t.t_threads[f]->th.th_task_state = 5218 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5219 } else { // set th_task_state for new threads in non-nested hot team 5220 // copy primary thread's state 5221 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5222 for (f = old_nproc; f < team->t.t_nproc; ++f) 5223 team->t.t_threads[f]->th.th_task_state = old_state; 5224 } 5225 5226 #ifdef KMP_DEBUG 5227 for (f = 0; f < team->t.t_nproc; ++f) { 5228 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5229 team->t.t_threads[f]->th.th_team_nproc == 5230 team->t.t_nproc); 5231 } 5232 #endif 5233 5234 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5235 #if KMP_AFFINITY_SUPPORTED 5236 __kmp_partition_places(team); 5237 #endif 5238 } // Check changes in number of threads 5239 5240 kmp_info_t *master = team->t.t_threads[0]; 5241 if (master->th.th_teams_microtask) { 5242 for (f = 1; f < new_nproc; ++f) { 5243 // propagate teams construct specific info to workers 5244 kmp_info_t *thr = team->t.t_threads[f]; 5245 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5246 thr->th.th_teams_level = master->th.th_teams_level; 5247 thr->th.th_teams_size = master->th.th_teams_size; 5248 } 5249 } 5250 #if KMP_NESTED_HOT_TEAMS 5251 if (level) { 5252 // Sync barrier state for nested hot teams, not needed for outermost hot 5253 // team. 5254 for (f = 1; f < new_nproc; ++f) { 5255 kmp_info_t *thr = team->t.t_threads[f]; 5256 int b; 5257 kmp_balign_t *balign = thr->th.th_bar; 5258 for (b = 0; b < bs_last_barrier; ++b) { 5259 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5260 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5261 #if USE_DEBUGGER 5262 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5263 #endif 5264 } 5265 } 5266 } 5267 #endif // KMP_NESTED_HOT_TEAMS 5268 5269 /* reallocate space for arguments if necessary */ 5270 __kmp_alloc_argv_entries(argc, team, TRUE); 5271 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5272 // The hot team re-uses the previous task team, 5273 // if untouched during the previous release->gather phase. 
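// Note: unlike the team-pool and fresh-allocation paths later in this
// function, which reset t_task_team[0] and t_task_team[1] to NULL, the
// hot-team path deliberately leaves them in place so they can be reused if
// they were untouched during the previous release->gather phase (compare the
// "before reinit" and "after reinit" debug traces).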
5274 5275 KF_TRACE(10, (" hot_team = %p\n", team)); 5276 5277 #if KMP_DEBUG 5278 if (__kmp_tasking_mode != tskm_immediate_exec) { 5279 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5280 "task_team[1] = %p after reinit\n", 5281 team->t.t_task_team[0], team->t.t_task_team[1])); 5282 } 5283 #endif 5284 5285 #if OMPT_SUPPORT 5286 __ompt_team_assign_id(team, ompt_parallel_data); 5287 #endif 5288 5289 KMP_MB(); 5290 5291 return team; 5292 } 5293 5294 /* next, let's try to take one from the team pool */ 5295 KMP_MB(); 5296 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5297 /* TODO: consider resizing undersized teams instead of reaping them, now 5298 that we have a resizing mechanism */ 5299 if (team->t.t_max_nproc >= max_nproc) { 5300 /* take this team from the team pool */ 5301 __kmp_team_pool = team->t.t_next_pool; 5302 5303 /* setup the team for fresh use */ 5304 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5305 5306 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5307 "task_team[1] %p to NULL\n", 5308 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5309 team->t.t_task_team[0] = NULL; 5310 team->t.t_task_team[1] = NULL; 5311 5312 /* reallocate space for arguments if necessary */ 5313 __kmp_alloc_argv_entries(argc, team, TRUE); 5314 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5315 5316 KA_TRACE( 5317 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5318 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5319 { // Initialize barrier data. 5320 int b; 5321 for (b = 0; b < bs_last_barrier; ++b) { 5322 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5323 #if USE_DEBUGGER 5324 team->t.t_bar[b].b_master_arrived = 0; 5325 team->t.t_bar[b].b_team_arrived = 0; 5326 #endif 5327 } 5328 } 5329 5330 team->t.t_proc_bind = new_proc_bind; 5331 5332 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5333 team->t.t_id)); 5334 5335 #if OMPT_SUPPORT 5336 __ompt_team_assign_id(team, ompt_parallel_data); 5337 #endif 5338 5339 KMP_MB(); 5340 5341 return team; 5342 } 5343 5344 /* reap team if it is too small, then loop back and check the next one */ 5345 // not sure if this is wise, but, will be redone during the hot-teams 5346 // rewrite. 5347 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5348 team = __kmp_reap_team(team); 5349 __kmp_team_pool = team; 5350 } 5351 5352 /* nothing available in the pool, no matter, make a new team! 
*/ 5353 KMP_MB(); 5354 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5355 5356 /* and set it up */ 5357 team->t.t_max_nproc = max_nproc; 5358 /* NOTE well, for some reason allocating one big buffer and dividing it up 5359 seems to really hurt performance a lot on the P4, so, let's not use this */ 5360 __kmp_allocate_team_arrays(team, max_nproc); 5361 5362 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5363 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5364 5365 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5366 "%p to NULL\n", 5367 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5368 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5369 // memory, no need to duplicate 5370 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5371 // memory, no need to duplicate 5372 5373 if (__kmp_storage_map) { 5374 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5375 } 5376 5377 /* allocate space for arguments */ 5378 __kmp_alloc_argv_entries(argc, team, FALSE); 5379 team->t.t_argc = argc; 5380 5381 KA_TRACE(20, 5382 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5383 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5384 { // Initialize barrier data. 5385 int b; 5386 for (b = 0; b < bs_last_barrier; ++b) { 5387 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5388 #if USE_DEBUGGER 5389 team->t.t_bar[b].b_master_arrived = 0; 5390 team->t.t_bar[b].b_team_arrived = 0; 5391 #endif 5392 } 5393 } 5394 5395 team->t.t_proc_bind = new_proc_bind; 5396 5397 #if OMPT_SUPPORT 5398 __ompt_team_assign_id(team, ompt_parallel_data); 5399 team->t.ompt_serialized_team_info = NULL; 5400 #endif 5401 5402 KMP_MB(); 5403 5404 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5405 team->t.t_id)); 5406 5407 return team; 5408 } 5409 5410 /* TODO implement hot-teams at all levels */ 5411 /* TODO implement lazy thread release on demand (disband request) */ 5412 5413 /* free the team. return it to the team pool. release all the threads 5414 * associated with it */ 5415 void __kmp_free_team(kmp_root_t *root, 5416 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5417 int f; 5418 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5419 team->t.t_id)); 5420 5421 /* verify state */ 5422 KMP_DEBUG_ASSERT(root); 5423 KMP_DEBUG_ASSERT(team); 5424 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5425 KMP_DEBUG_ASSERT(team->t.t_threads); 5426 5427 int use_hot_team = team == root->r.r_hot_team; 5428 #if KMP_NESTED_HOT_TEAMS 5429 int level; 5430 kmp_hot_team_ptr_t *hot_teams; 5431 if (master) { 5432 level = team->t.t_active_level - 1; 5433 if (master->th.th_teams_microtask) { // in teams construct? 
5434 if (master->th.th_teams_size.nteams > 1) { 5435 ++level; // level was not increased in teams construct for 5436 // team_of_masters 5437 } 5438 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5439 master->th.th_teams_level == team->t.t_level) { 5440 ++level; // level was not increased in teams construct for 5441 // team_of_workers before the parallel 5442 } // team->t.t_level will be increased inside parallel 5443 } 5444 hot_teams = master->th.th_hot_teams; 5445 if (level < __kmp_hot_teams_max_level) { 5446 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5447 use_hot_team = 1; 5448 } 5449 } 5450 #endif // KMP_NESTED_HOT_TEAMS 5451 5452 /* team is done working */ 5453 TCW_SYNC_PTR(team->t.t_pkfn, 5454 NULL); // Important for Debugging Support Library. 5455 #if KMP_OS_WINDOWS 5456 team->t.t_copyin_counter = 0; // init counter for possible reuse 5457 #endif 5458 // Do not reset pointer to parent team to NULL for hot teams. 5459 5460 /* if we are non-hot team, release our threads */ 5461 if (!use_hot_team) { 5462 if (__kmp_tasking_mode != tskm_immediate_exec) { 5463 // Wait for threads to reach reapable state 5464 for (f = 1; f < team->t.t_nproc; ++f) { 5465 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5466 kmp_info_t *th = team->t.t_threads[f]; 5467 volatile kmp_uint32 *state = &th->th.th_reap_state; 5468 while (*state != KMP_SAFE_TO_REAP) { 5469 #if KMP_OS_WINDOWS 5470 // On Windows a thread can be killed at any time, check this 5471 DWORD ecode; 5472 if (!__kmp_is_thread_alive(th, &ecode)) { 5473 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5474 break; 5475 } 5476 #endif 5477 // first check if thread is sleeping 5478 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5479 if (fl.is_sleeping()) 5480 fl.resume(__kmp_gtid_from_thread(th)); 5481 KMP_CPU_PAUSE(); 5482 } 5483 } 5484 5485 // Delete task teams 5486 int tt_idx; 5487 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5488 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5489 if (task_team != NULL) { 5490 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5491 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5492 team->t.t_threads[f]->th.th_task_team = NULL; 5493 } 5494 KA_TRACE( 5495 20, 5496 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5497 __kmp_get_gtid(), task_team, team->t.t_id)); 5498 #if KMP_NESTED_HOT_TEAMS 5499 __kmp_free_task_team(master, task_team); 5500 #endif 5501 team->t.t_task_team[tt_idx] = NULL; 5502 } 5503 } 5504 } 5505 5506 // Reset pointer to parent team only for non-hot teams. 
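// Note: hot teams keep their t_parent pointer (see the remark above, next to
// the TCW_SYNC_PTR(t_pkfn, NULL) reset) because they stay cached and are
// re-entered on the next parallel region; only a team that really goes back
// to the team pool is fully unlinked here.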
5507 team->t.t_parent = NULL; 5508 team->t.t_level = 0; 5509 team->t.t_active_level = 0; 5510 5511 /* free the worker threads */ 5512 for (f = 1; f < team->t.t_nproc; ++f) { 5513 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5514 __kmp_free_thread(team->t.t_threads[f]); 5515 team->t.t_threads[f] = NULL; 5516 } 5517 5518 /* put the team back in the team pool */ 5519 /* TODO limit size of team pool, call reap_team if pool too large */ 5520 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5521 __kmp_team_pool = (volatile kmp_team_t *)team; 5522 } else { // Check if team was created for primary threads in teams construct 5523 // See if first worker is a CG root 5524 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5525 team->t.t_threads[1]->th.th_cg_roots); 5526 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5527 // Clean up the CG root nodes on workers so that this team can be re-used 5528 for (f = 1; f < team->t.t_nproc; ++f) { 5529 kmp_info_t *thr = team->t.t_threads[f]; 5530 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5531 thr->th.th_cg_roots->cg_root == thr); 5532 // Pop current CG root off list 5533 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5534 thr->th.th_cg_roots = tmp->up; 5535 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5536 " up to node %p. cg_nthreads was %d\n", 5537 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5538 int i = tmp->cg_nthreads--; 5539 if (i == 1) { 5540 __kmp_free(tmp); // free CG if we are the last thread in it 5541 } 5542 // Restore current task's thread_limit from CG root 5543 if (thr->th.th_cg_roots) 5544 thr->th.th_current_task->td_icvs.thread_limit = 5545 thr->th.th_cg_roots->cg_thread_limit; 5546 } 5547 } 5548 } 5549 5550 KMP_MB(); 5551 } 5552 5553 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5554 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5555 kmp_team_t *next_pool = team->t.t_next_pool; 5556 5557 KMP_DEBUG_ASSERT(team); 5558 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5559 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5560 KMP_DEBUG_ASSERT(team->t.t_threads); 5561 KMP_DEBUG_ASSERT(team->t.t_argv); 5562 5563 /* TODO clean the threads that are a part of this? */ 5564 5565 /* free stuff */ 5566 __kmp_free_team_arrays(team); 5567 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5568 __kmp_free((void *)team->t.t_argv); 5569 __kmp_free(team); 5570 5571 KMP_MB(); 5572 return next_pool; 5573 } 5574 5575 // Free the thread. Don't reap it, just place it on the pool of available 5576 // threads. 5577 // 5578 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5579 // binding for the affinity mechanism to be useful. 5580 // 5581 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5582 // However, we want to avoid a potential performance problem by always 5583 // scanning through the list to find the correct point at which to insert 5584 // the thread (potential N**2 behavior). To do this we keep track of the 5585 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5586 // With single-level parallelism, threads will always be added to the tail 5587 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5588 // parallelism, all bets are off and we may need to scan through the entire 5589 // free list. 5590 // 5591 // This change also has a potentially large performance benefit, for some 5592 // applications. 
Previously, as threads were freed from the hot team, they 5593 // would be placed back on the free list in inverse order. If the hot team 5594 // grew back to it's original size, then the freed thread would be placed 5595 // back on the hot team in reverse order. This could cause bad cache 5596 // locality problems on programs where the size of the hot team regularly 5597 // grew and shrunk. 5598 // 5599 // Now, for single-level parallelism, the OMP tid is always == gtid. 5600 void __kmp_free_thread(kmp_info_t *this_th) { 5601 int gtid; 5602 kmp_info_t **scan; 5603 5604 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5605 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5606 5607 KMP_DEBUG_ASSERT(this_th); 5608 5609 // When moving thread to pool, switch thread to wait on own b_go flag, and 5610 // uninitialized (NULL team). 5611 int b; 5612 kmp_balign_t *balign = this_th->th.th_bar; 5613 for (b = 0; b < bs_last_barrier; ++b) { 5614 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5615 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5616 balign[b].bb.team = NULL; 5617 balign[b].bb.leaf_kids = 0; 5618 } 5619 this_th->th.th_task_state = 0; 5620 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5621 5622 /* put thread back on the free pool */ 5623 TCW_PTR(this_th->th.th_team, NULL); 5624 TCW_PTR(this_th->th.th_root, NULL); 5625 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5626 5627 while (this_th->th.th_cg_roots) { 5628 this_th->th.th_cg_roots->cg_nthreads--; 5629 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5630 " %p of thread %p to %d\n", 5631 this_th, this_th->th.th_cg_roots, 5632 this_th->th.th_cg_roots->cg_root, 5633 this_th->th.th_cg_roots->cg_nthreads)); 5634 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5635 if (tmp->cg_root == this_th) { // Thread is a cg_root 5636 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5637 KA_TRACE( 5638 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5639 this_th->th.th_cg_roots = tmp->up; 5640 __kmp_free(tmp); 5641 } else { // Worker thread 5642 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5643 __kmp_free(tmp); 5644 } 5645 this_th->th.th_cg_roots = NULL; 5646 break; 5647 } 5648 } 5649 5650 /* If the implicit task assigned to this thread can be used by other threads 5651 * -> multiple threads can share the data and try to free the task at 5652 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5653 * with higher probability when hot team is disabled but can occurs even when 5654 * the hot team is enabled */ 5655 __kmp_free_implicit_task(this_th); 5656 this_th->th.th_current_task = NULL; 5657 5658 // If the __kmp_thread_pool_insert_pt is already past the new insert 5659 // point, then we need to re-scan the entire list. 5660 gtid = this_th->th.th_info.ds.ds_gtid; 5661 if (__kmp_thread_pool_insert_pt != NULL) { 5662 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5663 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5664 __kmp_thread_pool_insert_pt = NULL; 5665 } 5666 } 5667 5668 // Scan down the list to find the place to insert the thread. 5669 // scan is the address of a link in the list, possibly the address of 5670 // __kmp_thread_pool itself. 5671 // 5672 // In the absence of nested parallelism, the for loop will have 0 iterations. 
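// A minimal standalone sketch (hypothetical helper names, simplified types,
// not built) of the sorted insertion with a cached insertion point that the
// code below performs on __kmp_thread_pool / __kmp_thread_pool_insert_pt:
#if 0
struct node { int gtid; node *next; };
static node *pool = nullptr;      // stands in for __kmp_thread_pool
static node *insert_pt = nullptr; // stands in for __kmp_thread_pool_insert_pt

static void pool_insert(node *n) {
  // If the cached point is already past the new gtid, rescan from the head.
  if (insert_pt != nullptr && insert_pt->gtid > n->gtid)
    insert_pt = nullptr;
  node **scan = insert_pt ? &insert_pt->next : &pool;
  // With single-level parallelism this loop performs zero iterations.
  while (*scan != nullptr && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan; // splice in, keeping the list sorted by gtid
  *scan = n;
  insert_pt = n;   // remember the insertion point for the next call
}
#endif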
5673 if (__kmp_thread_pool_insert_pt != NULL) { 5674 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5675 } else { 5676 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5677 } 5678 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5679 scan = &((*scan)->th.th_next_pool)) 5680 ; 5681 5682 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5683 // to its address. 5684 TCW_PTR(this_th->th.th_next_pool, *scan); 5685 __kmp_thread_pool_insert_pt = *scan = this_th; 5686 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5687 (this_th->th.th_info.ds.ds_gtid < 5688 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5689 TCW_4(this_th->th.th_in_pool, TRUE); 5690 __kmp_suspend_initialize_thread(this_th); 5691 __kmp_lock_suspend_mx(this_th); 5692 if (this_th->th.th_active == TRUE) { 5693 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5694 this_th->th.th_active_in_pool = TRUE; 5695 } 5696 #if KMP_DEBUG 5697 else { 5698 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5699 } 5700 #endif 5701 __kmp_unlock_suspend_mx(this_th); 5702 5703 TCW_4(__kmp_nth, __kmp_nth - 1); 5704 5705 #ifdef KMP_ADJUST_BLOCKTIME 5706 /* Adjust blocktime back to user setting or default if necessary */ 5707 /* Middle initialization might never have occurred */ 5708 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5709 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5710 if (__kmp_nth <= __kmp_avail_proc) { 5711 __kmp_zero_bt = FALSE; 5712 } 5713 } 5714 #endif /* KMP_ADJUST_BLOCKTIME */ 5715 5716 KMP_MB(); 5717 } 5718 5719 /* ------------------------------------------------------------------------ */ 5720 5721 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5722 #if OMP_PROFILING_SUPPORT 5723 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5724 // TODO: add a configuration option for time granularity 5725 if (ProfileTraceFile) 5726 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5727 #endif 5728 5729 int gtid = this_thr->th.th_info.ds.ds_gtid; 5730 /* void *stack_data;*/ 5731 kmp_team_t **volatile pteam; 5732 5733 KMP_MB(); 5734 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5735 5736 if (__kmp_env_consistency_check) { 5737 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5738 } 5739 5740 #if OMPT_SUPPORT 5741 ompt_data_t *thread_data; 5742 if (ompt_enabled.enabled) { 5743 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5744 *thread_data = ompt_data_none; 5745 5746 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5747 this_thr->th.ompt_thread_info.wait_id = 0; 5748 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5749 this_thr->th.ompt_thread_info.parallel_flags = 0; 5750 if (ompt_enabled.ompt_callback_thread_begin) { 5751 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5752 ompt_thread_worker, thread_data); 5753 } 5754 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5755 } 5756 #endif 5757 5758 /* This is the place where threads wait for work */ 5759 while (!TCR_4(__kmp_global.g.g_done)) { 5760 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5761 KMP_MB(); 5762 5763 /* wait for work to do */ 5764 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5765 5766 /* No tid yet since not part of a team */ 5767 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5768 5769 #if OMPT_SUPPORT 5770 if (ompt_enabled.enabled) { 5771 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5772 } 5773 #endif 5774 5775 pteam = &this_thr->th.th_team; 5776 5777 /* have we been allocated? */ 5778 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5779 /* we were just woken up, so run our new task */ 5780 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5781 int rc; 5782 KA_TRACE(20, 5783 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5784 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5785 (*pteam)->t.t_pkfn)); 5786 5787 updateHWFPControl(*pteam); 5788 5789 #if OMPT_SUPPORT 5790 if (ompt_enabled.enabled) { 5791 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5792 } 5793 #endif 5794 5795 rc = (*pteam)->t.t_invoke(gtid); 5796 KMP_ASSERT(rc); 5797 5798 KMP_MB(); 5799 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5800 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5801 (*pteam)->t.t_pkfn)); 5802 } 5803 #if OMPT_SUPPORT 5804 if (ompt_enabled.enabled) { 5805 /* no frame set while outside task */ 5806 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5807 5808 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5809 } 5810 #endif 5811 /* join barrier after parallel region */ 5812 __kmp_join_barrier(gtid); 5813 } 5814 } 5815 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5816 5817 #if OMPT_SUPPORT 5818 if (ompt_enabled.ompt_callback_thread_end) { 5819 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5820 } 5821 #endif 5822 5823 this_thr->th.th_task_team = NULL; 5824 /* run the destructors for the threadprivate data for this thread */ 5825 __kmp_common_destroy_gtid(gtid); 5826 5827 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5828 KMP_MB(); 5829 5830 #if OMP_PROFILING_SUPPORT 5831 llvm::timeTraceProfilerFinishThread(); 5832 #endif 5833 return this_thr; 5834 } 5835 5836 /* ------------------------------------------------------------------------ */ 5837 5838 void __kmp_internal_end_dest(void *specific_gtid) { 5839 // Make sure no significant bits are lost 5840 int gtid; 5841 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5842 5843 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5844 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5845 * this is because 0 is reserved for the nothing-stored case */ 5846 5847 __kmp_internal_end_thread(gtid); 5848 } 5849 
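// Note on the gtid+1 encoding used above: thread-specific storage reports
// 0/NULL for "nothing stored", so a legitimate gtid of 0 would be
// indistinguishable from an empty slot. The runtime therefore stores gtid+1
// and subtracts 1 on retrieval, roughly (stand-in helper names):
//   store:    set_specific((void *)(kmp_intptr_t)(gtid + 1));
//   retrieve: gtid = (int)((kmp_intptr_t)get_specific()) - 1; // empty -> -1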
5850 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5851 5852 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5853 __kmp_internal_end_atexit(); 5854 } 5855 5856 #endif 5857 5858 /* [Windows] josh: when the atexit handler is called, there may still be more 5859 than one thread alive */ 5860 void __kmp_internal_end_atexit(void) { 5861 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5862 /* [Windows] 5863 josh: ideally, we want to completely shutdown the library in this atexit 5864 handler, but stat code that depends on thread specific data for gtid fails 5865 because that data becomes unavailable at some point during the shutdown, so 5866 we call __kmp_internal_end_thread instead. We should eventually remove the 5867 dependency on __kmp_get_specific_gtid in the stat code and use 5868 __kmp_internal_end_library to cleanly shutdown the library. 5869 5870 // TODO: Can some of this comment about GVS be removed? 5871 I suspect that the offending stat code is executed when the calling thread 5872 tries to clean up a dead root thread's data structures, resulting in GVS 5873 code trying to close the GVS structures for that thread, but since the stat 5874 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5875 the calling thread is cleaning up itself instead of another thread, it get 5876 confused. This happens because allowing a thread to unregister and cleanup 5877 another thread is a recent modification for addressing an issue. 5878 Based on the current design (20050722), a thread may end up 5879 trying to unregister another thread only if thread death does not trigger 5880 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5881 thread specific data destructor function to detect thread death. For 5882 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5883 is nothing. Thus, the workaround is applicable only for Windows static 5884 stat library. */ 5885 __kmp_internal_end_library(-1); 5886 #if KMP_OS_WINDOWS 5887 __kmp_close_console(); 5888 #endif 5889 } 5890 5891 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5892 // It is assumed __kmp_forkjoin_lock is acquired. 5893 5894 int gtid; 5895 5896 KMP_DEBUG_ASSERT(thread != NULL); 5897 5898 gtid = thread->th.th_info.ds.ds_gtid; 5899 5900 if (!is_root) { 5901 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5902 /* Assume the threads are at the fork barrier here */ 5903 KA_TRACE( 5904 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5905 gtid)); 5906 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5907 * (GEH) */ 5908 ANNOTATE_HAPPENS_BEFORE(thread); 5909 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5910 thread); 5911 __kmp_release_64(&flag); 5912 } 5913 5914 // Terminate OS thread. 5915 __kmp_reap_worker(thread); 5916 5917 // The thread was killed asynchronously. If it was actively 5918 // spinning in the thread pool, decrement the global count. 5919 // 5920 // There is a small timing hole here - if the worker thread was just waking 5921 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5922 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5923 // the global counter might not get updated. 5924 // 5925 // Currently, this can only happen as the library is unloaded, 5926 // so there are no harmful side effects. 
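// Note: __kmp_thread_pool_active_nth only counts pool threads that are still
// actively spin-waiting (th_active_in_pool == TRUE). The decrement below keeps
// that global in step once this worker has been reaped, modulo the small
// wake-up window described in the comment above.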
5927 if (thread->th.th_active_in_pool) { 5928 thread->th.th_active_in_pool = FALSE; 5929 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5930 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5931 } 5932 } 5933 5934 __kmp_free_implicit_task(thread); 5935 5936 // Free the fast memory for tasking 5937 #if USE_FAST_MEMORY 5938 __kmp_free_fast_memory(thread); 5939 #endif /* USE_FAST_MEMORY */ 5940 5941 __kmp_suspend_uninitialize_thread(thread); 5942 5943 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5944 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5945 5946 --__kmp_all_nth; 5947 // __kmp_nth was decremented when thread is added to the pool. 5948 5949 #ifdef KMP_ADJUST_BLOCKTIME 5950 /* Adjust blocktime back to user setting or default if necessary */ 5951 /* Middle initialization might never have occurred */ 5952 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5953 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5954 if (__kmp_nth <= __kmp_avail_proc) { 5955 __kmp_zero_bt = FALSE; 5956 } 5957 } 5958 #endif /* KMP_ADJUST_BLOCKTIME */ 5959 5960 /* free the memory being used */ 5961 if (__kmp_env_consistency_check) { 5962 if (thread->th.th_cons) { 5963 __kmp_free_cons_stack(thread->th.th_cons); 5964 thread->th.th_cons = NULL; 5965 } 5966 } 5967 5968 if (thread->th.th_pri_common != NULL) { 5969 __kmp_free(thread->th.th_pri_common); 5970 thread->th.th_pri_common = NULL; 5971 } 5972 5973 if (thread->th.th_task_state_memo_stack != NULL) { 5974 __kmp_free(thread->th.th_task_state_memo_stack); 5975 thread->th.th_task_state_memo_stack = NULL; 5976 } 5977 5978 #if KMP_USE_BGET 5979 if (thread->th.th_local.bget_data != NULL) { 5980 __kmp_finalize_bget(thread); 5981 } 5982 #endif 5983 5984 #if KMP_AFFINITY_SUPPORTED 5985 if (thread->th.th_affin_mask != NULL) { 5986 KMP_CPU_FREE(thread->th.th_affin_mask); 5987 thread->th.th_affin_mask = NULL; 5988 } 5989 #endif /* KMP_AFFINITY_SUPPORTED */ 5990 5991 #if KMP_USE_HIER_SCHED 5992 if (thread->th.th_hier_bar_data != NULL) { 5993 __kmp_free(thread->th.th_hier_bar_data); 5994 thread->th.th_hier_bar_data = NULL; 5995 } 5996 #endif 5997 5998 __kmp_reap_team(thread->th.th_serial_team); 5999 thread->th.th_serial_team = NULL; 6000 __kmp_free(thread); 6001 6002 KMP_MB(); 6003 6004 } // __kmp_reap_thread 6005 6006 static void __kmp_internal_end(void) { 6007 int i; 6008 6009 /* First, unregister the library */ 6010 __kmp_unregister_library(); 6011 6012 #if KMP_OS_WINDOWS 6013 /* In Win static library, we can't tell when a root actually dies, so we 6014 reclaim the data structures for any root threads that have died but not 6015 unregistered themselves, in order to shut down cleanly. 6016 In Win dynamic library we also can't tell when a thread dies. */ 6017 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6018 // dead roots 6019 #endif 6020 6021 for (i = 0; i < __kmp_threads_capacity; i++) 6022 if (__kmp_root[i]) 6023 if (__kmp_root[i]->r.r_active) 6024 break; 6025 KMP_MB(); /* Flush all pending memory write invalidates. */ 6026 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6027 6028 if (i < __kmp_threads_capacity) { 6029 #if KMP_USE_MONITOR 6030 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6031 KMP_MB(); /* Flush all pending memory write invalidates. */ 6032 6033 // Need to check that monitor was initialized before reaping it. 
If we are 6034 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6035 // __kmp_monitor will appear to contain valid data, but it is only valid in 6036 // the parent process, not the child. 6037 // New behavior (201008): instead of keying off of the flag 6038 // __kmp_init_parallel, the monitor thread creation is keyed off 6039 // of the new flag __kmp_init_monitor. 6040 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6041 if (TCR_4(__kmp_init_monitor)) { 6042 __kmp_reap_monitor(&__kmp_monitor); 6043 TCW_4(__kmp_init_monitor, 0); 6044 } 6045 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6046 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6047 #endif // KMP_USE_MONITOR 6048 } else { 6049 /* TODO move this to cleanup code */ 6050 #ifdef KMP_DEBUG 6051 /* make sure that everything has properly ended */ 6052 for (i = 0; i < __kmp_threads_capacity; i++) { 6053 if (__kmp_root[i]) { 6054 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6055 // there can be uber threads alive here 6056 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6057 } 6058 } 6059 #endif 6060 6061 KMP_MB(); 6062 6063 // Reap the worker threads. 6064 // This is valid for now, but be careful if threads are reaped sooner. 6065 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6066 // Get the next thread from the pool. 6067 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6068 __kmp_thread_pool = thread->th.th_next_pool; 6069 // Reap it. 6070 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6071 thread->th.th_next_pool = NULL; 6072 thread->th.th_in_pool = FALSE; 6073 __kmp_reap_thread(thread, 0); 6074 } 6075 __kmp_thread_pool_insert_pt = NULL; 6076 6077 // Reap teams. 6078 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6079 // Get the next team from the pool. 6080 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6081 __kmp_team_pool = team->t.t_next_pool; 6082 // Reap it. 6083 team->t.t_next_pool = NULL; 6084 __kmp_reap_team(team); 6085 } 6086 6087 __kmp_reap_task_teams(); 6088 6089 #if KMP_OS_UNIX 6090 // Threads that are not reaped should not access any resources since they 6091 // are going to be deallocated soon, so the shutdown sequence should wait 6092 // until all threads either exit the final spin-waiting loop or begin 6093 // sleeping after the given blocktime. 6094 for (i = 0; i < __kmp_threads_capacity; i++) { 6095 kmp_info_t *thr = __kmp_threads[i]; 6096 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6097 KMP_CPU_PAUSE(); 6098 } 6099 #endif 6100 6101 for (i = 0; i < __kmp_threads_capacity; ++i) { 6102 // TBD: Add some checking... 6103 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6104 } 6105 6106 /* Make sure all threadprivate destructors get run by joining with all 6107 worker threads before resetting this flag */ 6108 TCW_SYNC_4(__kmp_init_common, FALSE); 6109 6110 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6111 KMP_MB(); 6112 6113 #if KMP_USE_MONITOR 6114 // See note above: One of the possible fixes for CQ138434 / CQ140126 6115 // 6116 // FIXME: push both code fragments down and CSE them? 6117 // push them into __kmp_cleanup() ? 
6118 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6119 if (TCR_4(__kmp_init_monitor)) { 6120 __kmp_reap_monitor(&__kmp_monitor); 6121 TCW_4(__kmp_init_monitor, 0); 6122 } 6123 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6124 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6125 #endif 6126 } /* else !__kmp_global.t_active */ 6127 TCW_4(__kmp_init_gtid, FALSE); 6128 KMP_MB(); /* Flush all pending memory write invalidates. */ 6129 6130 __kmp_cleanup(); 6131 #if OMPT_SUPPORT 6132 ompt_fini(); 6133 #endif 6134 } 6135 6136 void __kmp_internal_end_library(int gtid_req) { 6137 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6138 /* this shouldn't be a race condition because __kmp_internal_end() is the 6139 only place to clear __kmp_serial_init */ 6140 /* we'll check this later too, after we get the lock */ 6141 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6142 // redundant, because the next check will work in any case. 6143 if (__kmp_global.g.g_abort) { 6144 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6145 /* TODO abort? */ 6146 return; 6147 } 6148 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6149 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6150 return; 6151 } 6152 6153 KMP_MB(); /* Flush all pending memory write invalidates. */ 6154 /* find out who we are and what we should do */ 6155 { 6156 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6157 KA_TRACE( 6158 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6159 if (gtid == KMP_GTID_SHUTDOWN) { 6160 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6161 "already shutdown\n")); 6162 return; 6163 } else if (gtid == KMP_GTID_MONITOR) { 6164 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6165 "registered, or system shutdown\n")); 6166 return; 6167 } else if (gtid == KMP_GTID_DNE) { 6168 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6169 "shutdown\n")); 6170 /* we don't know who we are, but we may still shutdown the library */ 6171 } else if (KMP_UBER_GTID(gtid)) { 6172 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6173 if (__kmp_root[gtid]->r.r_active) { 6174 __kmp_global.g.g_abort = -1; 6175 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6176 __kmp_unregister_library(); 6177 KA_TRACE(10, 6178 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6179 gtid)); 6180 return; 6181 } else { 6182 KA_TRACE( 6183 10, 6184 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6185 __kmp_unregister_root_current_thread(gtid); 6186 } 6187 } else { 6188 /* worker threads may call this function through the atexit handler, if they 6189 * call exit() */ 6190 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6191 TODO: do a thorough shutdown instead */ 6192 #ifdef DUMP_DEBUG_ON_EXIT 6193 if (__kmp_debug_buf) 6194 __kmp_dump_debug_buffer(); 6195 #endif 6196 // added unregister library call here when we switch to shm linux 6197 // if we don't, it will leave lots of files in /dev/shm 6198 // cleanup shared memory file before exiting. 
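// Note: on Unix builds of the dynamic library the registration record lives in
// a POSIX shared-memory object (see __kmp_register_library_startup below)
// rather than an environment variable, so skipping this call on the worker
// exit path would leave stale entries behind in /dev/shm after the process
// exits.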
6199 __kmp_unregister_library(); 6200 return; 6201 } 6202 } 6203 /* synchronize the termination process */ 6204 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6205 6206 /* have we already finished */ 6207 if (__kmp_global.g.g_abort) { 6208 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6209 /* TODO abort? */ 6210 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6211 return; 6212 } 6213 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6214 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6215 return; 6216 } 6217 6218 /* We need this lock to enforce mutex between this reading of 6219 __kmp_threads_capacity and the writing by __kmp_register_root. 6220 Alternatively, we can use a counter of roots that is atomically updated by 6221 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6222 __kmp_internal_end_*. */ 6223 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6224 6225 /* now we can safely conduct the actual termination */ 6226 __kmp_internal_end(); 6227 6228 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6229 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6230 6231 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6232 6233 #ifdef DUMP_DEBUG_ON_EXIT 6234 if (__kmp_debug_buf) 6235 __kmp_dump_debug_buffer(); 6236 #endif 6237 6238 #if KMP_OS_WINDOWS 6239 __kmp_close_console(); 6240 #endif 6241 6242 __kmp_fini_allocator(); 6243 6244 } // __kmp_internal_end_library 6245 6246 void __kmp_internal_end_thread(int gtid_req) { 6247 int i; 6248 6249 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6250 /* this shouldn't be a race condition because __kmp_internal_end() is the 6251 * only place to clear __kmp_serial_init */ 6252 /* we'll check this later too, after we get the lock */ 6253 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6254 // redundant, because the next check will work in any case. 6255 if (__kmp_global.g.g_abort) { 6256 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6257 /* TODO abort? */ 6258 return; 6259 } 6260 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6261 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6262 return; 6263 } 6264 6265 // If hidden helper team has been initialized, we need to deinit it 6266 if (TCR_4(__kmp_init_hidden_helper)) { 6267 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6268 // First release the main thread to let it continue its work 6269 __kmp_hidden_helper_main_thread_release(); 6270 // Wait until the hidden helper team has been destroyed 6271 __kmp_hidden_helper_threads_deinitz_wait(); 6272 } 6273 6274 KMP_MB(); /* Flush all pending memory write invalidates. */ 6275 6276 /* find out who we are and what we should do */ 6277 { 6278 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6279 KA_TRACE(10, 6280 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6281 if (gtid == KMP_GTID_SHUTDOWN) { 6282 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6283 "already shutdown\n")); 6284 return; 6285 } else if (gtid == KMP_GTID_MONITOR) { 6286 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6287 "registered, or system shutdown\n")); 6288 return; 6289 } else if (gtid == KMP_GTID_DNE) { 6290 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6291 "shutdown\n")); 6292 return; 6293 /* we don't know who we are */ 6294 } else if (KMP_UBER_GTID(gtid)) { 6295 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6296 if (__kmp_root[gtid]->r.r_active) { 6297 __kmp_global.g.g_abort = -1; 6298 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6299 KA_TRACE(10, 6300 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6301 gtid)); 6302 return; 6303 } else { 6304 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6305 gtid)); 6306 __kmp_unregister_root_current_thread(gtid); 6307 } 6308 } else { 6309 /* just a worker thread, let's leave */ 6310 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6311 6312 if (gtid >= 0) { 6313 __kmp_threads[gtid]->th.th_task_team = NULL; 6314 } 6315 6316 KA_TRACE(10, 6317 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6318 gtid)); 6319 return; 6320 } 6321 } 6322 #if KMP_DYNAMIC_LIB 6323 if (__kmp_pause_status != kmp_hard_paused) 6324 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6325 // because we will better shutdown later in the library destructor. 6326 { 6327 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6328 return; 6329 } 6330 #endif 6331 /* synchronize the termination process */ 6332 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6333 6334 /* have we already finished */ 6335 if (__kmp_global.g.g_abort) { 6336 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6337 /* TODO abort? */ 6338 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6339 return; 6340 } 6341 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6342 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6343 return; 6344 } 6345 6346 /* We need this lock to enforce mutex between this reading of 6347 __kmp_threads_capacity and the writing by __kmp_register_root. 6348 Alternatively, we can use a counter of roots that is atomically updated by 6349 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6350 __kmp_internal_end_*. */ 6351 6352 /* should we finish the run-time? are all siblings done? */ 6353 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6354 6355 for (i = 0; i < __kmp_threads_capacity; ++i) { 6356 if (KMP_UBER_GTID(i)) { 6357 KA_TRACE( 6358 10, 6359 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6360 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6361 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6362 return; 6363 } 6364 } 6365 6366 /* now we can safely conduct the actual termination */ 6367 6368 __kmp_internal_end(); 6369 6370 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6371 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6372 6373 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6374 6375 #ifdef DUMP_DEBUG_ON_EXIT 6376 if (__kmp_debug_buf) 6377 __kmp_dump_debug_buffer(); 6378 #endif 6379 } // __kmp_internal_end_thread 6380 6381 // ----------------------------------------------------------------------------- 6382 // Library registration stuff. 6383 6384 static long __kmp_registration_flag = 0; 6385 // Random value used to indicate library initialization. 6386 static char *__kmp_registration_str = NULL; 6387 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6388 6389 static inline char *__kmp_reg_status_name() { 6390 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6391 each thread. If registration and unregistration go in different threads 6392 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6393 env var can not be found, because the name will contain different pid. 
*/ 6394 // macOS* complains about name being too long with additional getuid() 6395 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6396 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6397 (int)getuid()); 6398 #else 6399 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6400 #endif 6401 } // __kmp_reg_status_get 6402 6403 void __kmp_register_library_startup(void) { 6404 6405 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6406 int done = 0; 6407 union { 6408 double dtime; 6409 long ltime; 6410 } time; 6411 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6412 __kmp_initialize_system_tick(); 6413 #endif 6414 __kmp_read_system_time(&time.dtime); 6415 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6416 __kmp_registration_str = 6417 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6418 __kmp_registration_flag, KMP_LIBRARY_FILE); 6419 6420 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6421 __kmp_registration_str)); 6422 6423 while (!done) { 6424 6425 char *value = NULL; // Actual value of the environment variable. 6426 6427 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6428 char *shm_name = __kmp_str_format("/%s", name); 6429 int shm_preexist = 0; 6430 char *data1; 6431 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6432 if ((fd1 == -1) && (errno == EEXIST)) { 6433 // file didn't open because it already exists. 6434 // try opening existing file 6435 fd1 = shm_open(shm_name, O_RDWR, 0666); 6436 if (fd1 == -1) { // file didn't open 6437 // error out here 6438 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6439 __kmp_msg_null); 6440 } else { 6441 // able to open existing file 6442 shm_preexist = 1; 6443 } 6444 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6445 // already exists. 6446 // error out here. 6447 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6448 __kmp_msg_null); 6449 } 6450 if (shm_preexist == 0) { 6451 // we created SHM now set size 6452 if (ftruncate(fd1, SHM_SIZE) == -1) { 6453 // error occured setting size; 6454 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6455 KMP_ERR(errno), __kmp_msg_null); 6456 } 6457 } 6458 data1 = 6459 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6460 if (data1 == MAP_FAILED) { 6461 // failed to map shared memory 6462 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6463 __kmp_msg_null); 6464 } 6465 if (shm_preexist == 0) { // set data to SHM, set value 6466 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6467 } 6468 // Read value from either what we just wrote or existing file. 6469 value = __kmp_str_format("%s", data1); // read value from SHM 6470 munmap(data1, SHM_SIZE); 6471 close(fd1); 6472 #else // Windows and unix with static library 6473 // Set environment variable, but do not overwrite if it is exist. 6474 __kmp_env_set(name, __kmp_registration_str, 0); 6475 // read value to see if it got set 6476 value = __kmp_env_get(name); 6477 #endif 6478 6479 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6480 done = 1; // Ok, environment variable set successfully, exit the loop. 6481 } else { 6482 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6483 // Check whether it alive or dead. 6484 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 
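// Sketch of the liveness test performed below (informal, based on the parsing
// code that follows). The value read back has the form
// "<flag address>-<flag value>-<library file>", produced by
// __kmp_register_library_startup via "%p-%lx-%s". If that address is still
// mapped in this process and still holds the recorded value, another copy of
// the runtime is taken to be alive (typically a duplicate load into the same
// process, since the record name is keyed by pid); otherwise the record is
// treated as left over from a dead process and discarded:
//   split on '-' -> flag_addr_str, flag_val_str, file_name
//   alive  <=>  __kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val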
6485 char *tail = value; 6486 char *flag_addr_str = NULL; 6487 char *flag_val_str = NULL; 6488 char const *file_name = NULL; 6489 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6490 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6491 file_name = tail; 6492 if (tail != NULL) { 6493 long *flag_addr = 0; 6494 unsigned long flag_val = 0; 6495 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6496 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6497 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6498 // First, check whether environment-encoded address is mapped into 6499 // addr space. 6500 // If so, dereference it to see if it still has the right value. 6501 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6502 neighbor = 1; 6503 } else { 6504 // If not, then we know the other copy of the library is no longer 6505 // running. 6506 neighbor = 2; 6507 } 6508 } 6509 } 6510 switch (neighbor) { 6511 case 0: // Cannot parse environment variable -- neighbor status unknown. 6512 // Assume it is the incompatible format of future version of the 6513 // library. Assume the other library is alive. 6514 // WARN( ... ); // TODO: Issue a warning. 6515 file_name = "unknown library"; 6516 KMP_FALLTHROUGH(); 6517 // Attention! Falling to the next case. That's intentional. 6518 case 1: { // Neighbor is alive. 6519 // Check it is allowed. 6520 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6521 if (!__kmp_str_match_true(duplicate_ok)) { 6522 // That's not allowed. Issue fatal error. 6523 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6524 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6525 } 6526 KMP_INTERNAL_FREE(duplicate_ok); 6527 __kmp_duplicate_library_ok = 1; 6528 done = 1; // Exit the loop. 6529 } break; 6530 case 2: { // Neighbor is dead. 6531 6532 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6533 // close shared memory. 6534 shm_unlink(shm_name); // this removes file in /dev/shm 6535 #else 6536 // Clear the variable and try to register library again. 6537 __kmp_env_unset(name); 6538 #endif 6539 } break; 6540 default: { 6541 KMP_DEBUG_ASSERT(0); 6542 } break; 6543 } 6544 } 6545 KMP_INTERNAL_FREE((void *)value); 6546 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6547 KMP_INTERNAL_FREE((void *)shm_name); 6548 #endif 6549 } // while 6550 KMP_INTERNAL_FREE((void *)name); 6551 6552 } // func __kmp_register_library_startup 6553 6554 void __kmp_unregister_library(void) { 6555 6556 char *name = __kmp_reg_status_name(); 6557 char *value = NULL; 6558 6559 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6560 char *shm_name = __kmp_str_format("/%s", name); 6561 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6562 if (fd1 == -1) { 6563 // file did not open. return. 6564 return; 6565 } 6566 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6567 if (data1 != MAP_FAILED) { 6568 value = __kmp_str_format("%s", data1); // read value from SHM 6569 munmap(data1, SHM_SIZE); 6570 } 6571 close(fd1); 6572 #else 6573 value = __kmp_env_get(name); 6574 #endif 6575 6576 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6577 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6578 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6579 // Ok, this is our variable. Delete it. 
6580 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6581 shm_unlink(shm_name); // this removes file in /dev/shm 6582 #else 6583 __kmp_env_unset(name); 6584 #endif 6585 } 6586 6587 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6588 KMP_INTERNAL_FREE(shm_name); 6589 #endif 6590 6591 KMP_INTERNAL_FREE(__kmp_registration_str); 6592 KMP_INTERNAL_FREE(value); 6593 KMP_INTERNAL_FREE(name); 6594 6595 __kmp_registration_flag = 0; 6596 __kmp_registration_str = NULL; 6597 6598 } // __kmp_unregister_library 6599 6600 // End of Library registration stuff. 6601 // ----------------------------------------------------------------------------- 6602 6603 #if KMP_MIC_SUPPORTED 6604 6605 static void __kmp_check_mic_type() { 6606 kmp_cpuid_t cpuid_state = {0}; 6607 kmp_cpuid_t *cs_p = &cpuid_state; 6608 __kmp_x86_cpuid(1, 0, cs_p); 6609 // We don't support mic1 at the moment 6610 if ((cs_p->eax & 0xff0) == 0xB10) { 6611 __kmp_mic_type = mic2; 6612 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6613 __kmp_mic_type = mic3; 6614 } else { 6615 __kmp_mic_type = non_mic; 6616 } 6617 } 6618 6619 #endif /* KMP_MIC_SUPPORTED */ 6620 6621 #if KMP_HAVE_UMWAIT 6622 static void __kmp_user_level_mwait_init() { 6623 struct kmp_cpuid buf; 6624 __kmp_x86_cpuid(7, 0, &buf); 6625 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6626 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6627 __kmp_umwait_enabled)); 6628 } 6629 #elif KMP_HAVE_MWAIT 6630 #ifndef AT_INTELPHIUSERMWAIT 6631 // Spurious, non-existent value that should always fail to return anything. 6632 // Will be replaced with the correct value when we know that. 6633 #define AT_INTELPHIUSERMWAIT 10000 6634 #endif 6635 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6636 // earlier OS is used to build the RTL, we'll use the following internal 6637 // function when the entry is not found. 6638 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6639 unsigned long getauxval(unsigned long) { return 0; } 6640 6641 static void __kmp_user_level_mwait_init() { 6642 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6643 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6644 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6645 // KMP_USER_LEVEL_MWAIT was set to TRUE. 
6646 if (__kmp_mic_type == mic3) { 6647 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6648 if ((res & 0x1) || __kmp_user_level_mwait) { 6649 __kmp_mwait_enabled = TRUE; 6650 if (__kmp_user_level_mwait) { 6651 KMP_INFORM(EnvMwaitWarn); 6652 } 6653 } else { 6654 __kmp_mwait_enabled = FALSE; 6655 } 6656 } 6657 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6658 "__kmp_mwait_enabled = %d\n", 6659 __kmp_mic_type, __kmp_mwait_enabled)); 6660 } 6661 #endif /* KMP_HAVE_UMWAIT */ 6662 6663 static void __kmp_do_serial_initialize(void) { 6664 int i, gtid; 6665 size_t size; 6666 6667 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6668 6669 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6670 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6671 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6672 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6673 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6674 6675 #if OMPT_SUPPORT 6676 ompt_pre_init(); 6677 #endif 6678 6679 __kmp_validate_locks(); 6680 6681 /* Initialize internal memory allocator */ 6682 __kmp_init_allocator(); 6683 6684 /* Register the library startup via an environment variable and check to see 6685 whether another copy of the library is already registered. */ 6686 6687 __kmp_register_library_startup(); 6688 6689 /* TODO reinitialization of library */ 6690 if (TCR_4(__kmp_global.g.g_done)) { 6691 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6692 } 6693 6694 __kmp_global.g.g_abort = 0; 6695 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6696 6697 /* initialize the locks */ 6698 #if KMP_USE_ADAPTIVE_LOCKS 6699 #if KMP_DEBUG_ADAPTIVE_LOCKS 6700 __kmp_init_speculative_stats(); 6701 #endif 6702 #endif 6703 #if KMP_STATS_ENABLED 6704 __kmp_stats_init(); 6705 #endif 6706 __kmp_init_lock(&__kmp_global_lock); 6707 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6708 __kmp_init_lock(&__kmp_debug_lock); 6709 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6710 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6711 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6712 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6713 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6714 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6715 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6716 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6717 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6718 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6719 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6720 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6721 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6722 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6723 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6724 #if KMP_USE_MONITOR 6725 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6726 #endif 6727 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6728 6729 /* conduct initialization and initial setup of configuration */ 6730 6731 __kmp_runtime_initialize(); 6732 6733 #if KMP_MIC_SUPPORTED 6734 __kmp_check_mic_type(); 6735 #endif 6736 6737 // Some global variable initialization moved here from kmp_env_initialize() 6738 #ifdef KMP_DEBUG 6739 kmp_diag = 0; 6740 #endif 6741 __kmp_abort_delay = 0; 6742 6743 // From __kmp_init_dflt_team_nth() 6744 /* assume the entire machine will be used */ 6745 __kmp_dflt_team_nth_ub = __kmp_xproc; 6746 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6747 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6748 } 6749 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6750 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6751 } 6752 
__kmp_max_nth = __kmp_sys_max_nth; 6753 __kmp_cg_max_nth = __kmp_sys_max_nth; 6754 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6755 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6756 __kmp_teams_max_nth = __kmp_sys_max_nth; 6757 } 6758 6759 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6760 // part 6761 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6762 #if KMP_USE_MONITOR 6763 __kmp_monitor_wakeups = 6764 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6765 __kmp_bt_intervals = 6766 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6767 #endif 6768 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6769 __kmp_library = library_throughput; 6770 // From KMP_SCHEDULE initialization 6771 __kmp_static = kmp_sch_static_balanced; 6772 // AC: do not use analytical here, because it is non-monotonous 6773 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6774 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6775 // need to repeat assignment 6776 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6777 // bit control and barrier method control parts 6778 #if KMP_FAST_REDUCTION_BARRIER 6779 #define kmp_reduction_barrier_gather_bb ((int)1) 6780 #define kmp_reduction_barrier_release_bb ((int)1) 6781 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6782 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6783 #endif // KMP_FAST_REDUCTION_BARRIER 6784 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6785 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6786 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6787 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6788 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6789 #if KMP_FAST_REDUCTION_BARRIER 6790 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6791 // lin_64 ): hyper,1 6792 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6793 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6794 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6795 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6796 } 6797 #endif // KMP_FAST_REDUCTION_BARRIER 6798 } 6799 #if KMP_FAST_REDUCTION_BARRIER 6800 #undef kmp_reduction_barrier_release_pat 6801 #undef kmp_reduction_barrier_gather_pat 6802 #undef kmp_reduction_barrier_release_bb 6803 #undef kmp_reduction_barrier_gather_bb 6804 #endif // KMP_FAST_REDUCTION_BARRIER 6805 #if KMP_MIC_SUPPORTED 6806 if (__kmp_mic_type == mic2) { // KNC 6807 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6808 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6809 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6810 1; // forkjoin release 6811 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6812 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6813 } 6814 #if KMP_FAST_REDUCTION_BARRIER 6815 if (__kmp_mic_type == mic2) { // KNC 6816 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6817 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6818 } 6819 #endif // KMP_FAST_REDUCTION_BARRIER 6820 #endif // KMP_MIC_SUPPORTED 6821 6822 // From KMP_CHECKS initialization 6823 #ifdef KMP_DEBUG 6824 __kmp_env_checks = TRUE; /* development versions have the 
extra checks */ 6825 #else 6826 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6827 #endif 6828 6829 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6830 __kmp_foreign_tp = TRUE; 6831 6832 __kmp_global.g.g_dynamic = FALSE; 6833 __kmp_global.g.g_dynamic_mode = dynamic_default; 6834 6835 __kmp_env_initialize(NULL); 6836 6837 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6838 __kmp_user_level_mwait_init(); 6839 #endif 6840 // Print all messages in message catalog for testing purposes. 6841 #ifdef KMP_DEBUG 6842 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6843 if (__kmp_str_match_true(val)) { 6844 kmp_str_buf_t buffer; 6845 __kmp_str_buf_init(&buffer); 6846 __kmp_i18n_dump_catalog(&buffer); 6847 __kmp_printf("%s", buffer.str); 6848 __kmp_str_buf_free(&buffer); 6849 } 6850 __kmp_env_free(&val); 6851 #endif 6852 6853 __kmp_threads_capacity = 6854 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6855 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6856 __kmp_tp_capacity = __kmp_default_tp_capacity( 6857 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6858 6859 // If the library is shut down properly, both pools must be NULL. Just in 6860 // case, set them to NULL -- some memory may leak, but subsequent code will 6861 // work even if pools are not freed. 6862 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6863 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6864 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6865 __kmp_thread_pool = NULL; 6866 __kmp_thread_pool_insert_pt = NULL; 6867 __kmp_team_pool = NULL; 6868 6869 /* Allocate all of the variable sized records */ 6870 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6871 * expandable */ 6872 /* Since allocation is cache-aligned, just add extra padding at the end */ 6873 size = 6874 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6875 CACHE_LINE; 6876 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6877 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6878 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6879 6880 /* init thread counts */ 6881 KMP_DEBUG_ASSERT(__kmp_all_nth == 6882 0); // Asserts fail if the library is reinitializing and 6883 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6884 __kmp_all_nth = 0; 6885 __kmp_nth = 0; 6886 6887 /* setup the uber master thread and hierarchy */ 6888 gtid = __kmp_register_root(TRUE); 6889 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6890 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6891 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6892 6893 KMP_MB(); /* Flush all pending memory write invalidates. */ 6894 6895 __kmp_common_initialize(); 6896 6897 #if KMP_OS_UNIX 6898 /* invoke the child fork handler */ 6899 __kmp_register_atfork(); 6900 #endif 6901 6902 #if !KMP_DYNAMIC_LIB 6903 { 6904 /* Invoke the exit handler when the program finishes, only for static 6905 library. For dynamic library, we already have _fini and DllMain. */ 6906 int rc = atexit(__kmp_internal_end_atexit); 6907 if (rc != 0) { 6908 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6909 __kmp_msg_null); 6910 } 6911 } 6912 #endif 6913 6914 #if KMP_HANDLE_SIGNALS 6915 #if KMP_OS_UNIX 6916 /* NOTE: make sure that this is called before the user installs their own 6917 signal handlers so that the user handlers are called first. this way they 6918 can return false, not call our handler, avoid terminating the library, and 6919 continue execution where they left off. 
*/ 6920 __kmp_install_signals(FALSE); 6921 #endif /* KMP_OS_UNIX */ 6922 #if KMP_OS_WINDOWS 6923 __kmp_install_signals(TRUE); 6924 #endif /* KMP_OS_WINDOWS */ 6925 #endif 6926 6927 /* we have finished the serial initialization */ 6928 __kmp_init_counter++; 6929 6930 __kmp_init_serial = TRUE; 6931 6932 if (__kmp_settings) { 6933 __kmp_env_print(); 6934 } 6935 6936 if (__kmp_display_env || __kmp_display_env_verbose) { 6937 __kmp_env_print_2(); 6938 } 6939 6940 #if OMPT_SUPPORT 6941 ompt_post_init(); 6942 #endif 6943 6944 KMP_MB(); 6945 6946 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6947 } 6948 6949 void __kmp_serial_initialize(void) { 6950 if (__kmp_init_serial) { 6951 return; 6952 } 6953 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6954 if (__kmp_init_serial) { 6955 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6956 return; 6957 } 6958 __kmp_do_serial_initialize(); 6959 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6960 } 6961 6962 static void __kmp_do_middle_initialize(void) { 6963 int i, j; 6964 int prev_dflt_team_nth; 6965 6966 if (!__kmp_init_serial) { 6967 __kmp_do_serial_initialize(); 6968 } 6969 6970 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6971 6972 // Save the previous value for the __kmp_dflt_team_nth so that 6973 // we can avoid some reinitialization if it hasn't changed. 6974 prev_dflt_team_nth = __kmp_dflt_team_nth; 6975 6976 #if KMP_AFFINITY_SUPPORTED 6977 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6978 // number of cores on the machine. 6979 __kmp_affinity_initialize(); 6980 6981 // Run through the __kmp_threads array and set the affinity mask 6982 // for each root thread that is currently registered with the RTL. 6983 for (i = 0; i < __kmp_threads_capacity; i++) { 6984 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6985 __kmp_affinity_set_init_mask(i, TRUE); 6986 } 6987 } 6988 #endif /* KMP_AFFINITY_SUPPORTED */ 6989 6990 KMP_ASSERT(__kmp_xproc > 0); 6991 if (__kmp_avail_proc == 0) { 6992 __kmp_avail_proc = __kmp_xproc; 6993 } 6994 6995 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6996 // correct them now 6997 j = 0; 6998 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6999 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7000 __kmp_avail_proc; 7001 j++; 7002 } 7003 7004 if (__kmp_dflt_team_nth == 0) { 7005 #ifdef KMP_DFLT_NTH_CORES 7006 // Default #threads = #cores 7007 __kmp_dflt_team_nth = __kmp_ncores; 7008 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7009 "__kmp_ncores (%d)\n", 7010 __kmp_dflt_team_nth)); 7011 #else 7012 // Default #threads = #available OS procs 7013 __kmp_dflt_team_nth = __kmp_avail_proc; 7014 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7015 "__kmp_avail_proc(%d)\n", 7016 __kmp_dflt_team_nth)); 7017 #endif /* KMP_DFLT_NTH_CORES */ 7018 } 7019 7020 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7021 __kmp_dflt_team_nth = KMP_MIN_NTH; 7022 } 7023 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7024 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7025 } 7026 7027 // There's no harm in continuing if the following check fails, 7028 // but it indicates an error in the previous logic. 
7029 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7030 7031 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7032 // Run through the __kmp_threads array and set the num threads icv for each 7033 // root thread that is currently registered with the RTL (which has not 7034 // already explicitly set its nthreads-var with a call to 7035 // omp_set_num_threads()). 7036 for (i = 0; i < __kmp_threads_capacity; i++) { 7037 kmp_info_t *thread = __kmp_threads[i]; 7038 if (thread == NULL) 7039 continue; 7040 if (thread->th.th_current_task->td_icvs.nproc != 0) 7041 continue; 7042 7043 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7044 } 7045 } 7046 KA_TRACE( 7047 20, 7048 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7049 __kmp_dflt_team_nth)); 7050 7051 #ifdef KMP_ADJUST_BLOCKTIME 7052 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7053 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7054 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7055 if (__kmp_nth > __kmp_avail_proc) { 7056 __kmp_zero_bt = TRUE; 7057 } 7058 } 7059 #endif /* KMP_ADJUST_BLOCKTIME */ 7060 7061 /* we have finished middle initialization */ 7062 TCW_SYNC_4(__kmp_init_middle, TRUE); 7063 7064 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7065 } 7066 7067 void __kmp_middle_initialize(void) { 7068 if (__kmp_init_middle) { 7069 return; 7070 } 7071 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7072 if (__kmp_init_middle) { 7073 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7074 return; 7075 } 7076 __kmp_do_middle_initialize(); 7077 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7078 } 7079 7080 void __kmp_parallel_initialize(void) { 7081 int gtid = __kmp_entry_gtid(); // this might be a new root 7082 7083 /* synchronize parallel initialization (for sibling) */ 7084 if (TCR_4(__kmp_init_parallel)) 7085 return; 7086 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7087 if (TCR_4(__kmp_init_parallel)) { 7088 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7089 return; 7090 } 7091 7092 /* TODO reinitialization after we have already shut down */ 7093 if (TCR_4(__kmp_global.g.g_done)) { 7094 KA_TRACE( 7095 10, 7096 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7097 __kmp_infinite_loop(); 7098 } 7099 7100 /* jc: The lock __kmp_initz_lock is already held, so calling 7101 __kmp_serial_initialize would cause a deadlock. So we call 7102 __kmp_do_serial_initialize directly. */ 7103 if (!__kmp_init_middle) { 7104 __kmp_do_middle_initialize(); 7105 } 7106 __kmp_resume_if_hard_paused(); 7107 7108 /* begin initialization */ 7109 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7110 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7111 7112 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7113 // Save the FP control regs. 7114 // Worker threads will set theirs to these values at thread startup. 
7115 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7116 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7117 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7118 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7119 7120 #if KMP_OS_UNIX 7121 #if KMP_HANDLE_SIGNALS 7122 /* must be after __kmp_serial_initialize */ 7123 __kmp_install_signals(TRUE); 7124 #endif 7125 #endif 7126 7127 __kmp_suspend_initialize(); 7128 7129 #if defined(USE_LOAD_BALANCE) 7130 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7131 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7132 } 7133 #else 7134 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7135 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7136 } 7137 #endif 7138 7139 if (__kmp_version) { 7140 __kmp_print_version_2(); 7141 } 7142 7143 /* we have finished parallel initialization */ 7144 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7145 7146 KMP_MB(); 7147 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7148 7149 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7150 } 7151 7152 void __kmp_hidden_helper_initialize() { 7153 if (TCR_4(__kmp_init_hidden_helper)) 7154 return; 7155 7156 // __kmp_parallel_initialize is required before we initialize hidden helper 7157 if (!TCR_4(__kmp_init_parallel)) 7158 __kmp_parallel_initialize(); 7159 7160 // Double check. Note that this double check should not be placed before 7161 // __kmp_parallel_initialize as it will cause dead lock. 7162 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7163 if (TCR_4(__kmp_init_hidden_helper)) { 7164 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7165 return; 7166 } 7167 7168 // Set the count of hidden helper tasks to be executed to zero 7169 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7170 7171 // Set the global variable indicating that we're initializing hidden helper 7172 // team/threads 7173 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7174 7175 // Platform independent initialization 7176 __kmp_do_initialize_hidden_helper_threads(); 7177 7178 // Wait here for the finish of initialization of hidden helper teams 7179 __kmp_hidden_helper_threads_initz_wait(); 7180 7181 // We have finished hidden helper initialization 7182 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7183 7184 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7185 } 7186 7187 /* ------------------------------------------------------------------------ */ 7188 7189 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7190 kmp_team_t *team) { 7191 kmp_disp_t *dispatch; 7192 7193 KMP_MB(); 7194 7195 /* none of the threads have encountered any constructs, yet. */ 7196 this_thr->th.th_local.this_construct = 0; 7197 #if KMP_CACHE_MANAGE 7198 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7199 #endif /* KMP_CACHE_MANAGE */ 7200 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7201 KMP_DEBUG_ASSERT(dispatch); 7202 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7203 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7204 // this_thr->th.th_info.ds.ds_tid ] ); 7205 7206 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7207 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7208 if (__kmp_env_consistency_check) 7209 __kmp_push_parallel(gtid, team->t.t_ident); 7210 7211 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7212 } 7213 7214 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7215 kmp_team_t *team) { 7216 if (__kmp_env_consistency_check) 7217 __kmp_pop_parallel(gtid, team->t.t_ident); 7218 7219 __kmp_finish_implicit_task(this_thr); 7220 } 7221 7222 int __kmp_invoke_task_func(int gtid) { 7223 int rc; 7224 int tid = __kmp_tid_from_gtid(gtid); 7225 kmp_info_t *this_thr = __kmp_threads[gtid]; 7226 kmp_team_t *team = this_thr->th.th_team; 7227 7228 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7229 #if USE_ITT_BUILD 7230 if (__itt_stack_caller_create_ptr) { 7231 // inform ittnotify about entering user's code 7232 if (team->t.t_stack_id != NULL) { 7233 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7234 } else { 7235 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7236 __kmp_itt_stack_callee_enter( 7237 (__itt_caller)team->t.t_parent->t.t_stack_id); 7238 } 7239 } 7240 #endif /* USE_ITT_BUILD */ 7241 #if INCLUDE_SSC_MARKS 7242 SSC_MARK_INVOKING(); 7243 #endif 7244 7245 #if OMPT_SUPPORT 7246 void *dummy; 7247 void **exit_frame_p; 7248 ompt_data_t *my_task_data; 7249 ompt_data_t *my_parallel_data; 7250 int ompt_team_size; 7251 7252 if (ompt_enabled.enabled) { 7253 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7254 .ompt_task_info.frame.exit_frame.ptr); 7255 } else { 7256 exit_frame_p = &dummy; 7257 } 7258 7259 my_task_data = 7260 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7261 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7262 if (ompt_enabled.ompt_callback_implicit_task) { 7263 ompt_team_size = team->t.t_nproc; 7264 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7265 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7266 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7267 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7268 } 7269 #endif 7270 7271 #if KMP_STATS_ENABLED 7272 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7273 if (previous_state == stats_state_e::TEAMS_REGION) { 7274 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7275 } else { 7276 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7277 } 7278 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7279 #endif 7280 7281 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7282 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7283 #if OMPT_SUPPORT 7284 , 7285 exit_frame_p 7286 #endif 7287 ); 7288 #if OMPT_SUPPORT 7289 *exit_frame_p = NULL; 7290 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7291 #endif 7292 7293 #if KMP_STATS_ENABLED 7294 if (previous_state == stats_state_e::TEAMS_REGION) { 7295 KMP_SET_THREAD_STATE(previous_state); 7296 } 7297 KMP_POP_PARTITIONED_TIMER(); 7298 #endif 7299 7300 #if USE_ITT_BUILD 7301 if (__itt_stack_caller_create_ptr) { 7302 // inform ittnotify about leaving user's code 7303 if (team->t.t_stack_id != NULL) { 7304 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7305 } else { 7306 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7307 __kmp_itt_stack_callee_leave( 7308 (__itt_caller)team->t.t_parent->t.t_stack_id); 7309 } 7310 } 7311 #endif /* USE_ITT_BUILD */ 7312 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7313 7314 return rc; 7315 } 7316 7317 void __kmp_teams_master(int gtid) { 7318 // This routine is called by all primary threads in teams construct 7319 kmp_info_t *thr = __kmp_threads[gtid]; 7320 kmp_team_t *team = thr->th.th_team; 7321 ident_t *loc = team->t.t_ident; 7322 
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7323 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7324 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7325 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7326 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7327 7328 // This thread is a new CG root. Set up the proper variables. 7329 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7330 tmp->cg_root = thr; // Make thr the CG root 7331 // Init to thread limit stored when league primary threads were forked 7332 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7333 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7334 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7335 " cg_nthreads to 1\n", 7336 thr, tmp)); 7337 tmp->up = thr->th.th_cg_roots; 7338 thr->th.th_cg_roots = tmp; 7339 7340 // Launch league of teams now, but not let workers execute 7341 // (they hang on fork barrier until next parallel) 7342 #if INCLUDE_SSC_MARKS 7343 SSC_MARK_FORKING(); 7344 #endif 7345 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7346 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7347 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7348 #if INCLUDE_SSC_MARKS 7349 SSC_MARK_JOINING(); 7350 #endif 7351 // If the team size was reduced from the limit, set it to the new size 7352 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7353 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7354 // AC: last parameter "1" eliminates join barrier which won't work because 7355 // worker threads are in a fork barrier waiting for more parallel regions 7356 __kmp_join_call(loc, gtid 7357 #if OMPT_SUPPORT 7358 , 7359 fork_context_intel 7360 #endif 7361 , 7362 1); 7363 } 7364 7365 int __kmp_invoke_teams_master(int gtid) { 7366 kmp_info_t *this_thr = __kmp_threads[gtid]; 7367 kmp_team_t *team = this_thr->th.th_team; 7368 #if KMP_DEBUG 7369 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7370 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7371 (void *)__kmp_teams_master); 7372 #endif 7373 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7374 #if OMPT_SUPPORT 7375 int tid = __kmp_tid_from_gtid(gtid); 7376 ompt_data_t *task_data = 7377 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7378 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7379 if (ompt_enabled.ompt_callback_implicit_task) { 7380 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7381 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7382 ompt_task_initial); 7383 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7384 } 7385 #endif 7386 __kmp_teams_master(gtid); 7387 #if OMPT_SUPPORT 7388 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7389 #endif 7390 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7391 return 1; 7392 } 7393 7394 /* this sets the requested number of threads for the next parallel region 7395 encountered by this team. 
since this should be enclosed in the forkjoin 7396 critical section it should avoid race conditions with asymmetrical nested 7397 parallelism */ 7398 7399 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7400 kmp_info_t *thr = __kmp_threads[gtid]; 7401 7402 if (num_threads > 0) 7403 thr->th.th_set_nproc = num_threads; 7404 } 7405 7406 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7407 int num_threads) { 7408 KMP_DEBUG_ASSERT(thr); 7409 // Remember the number of threads for inner parallel regions 7410 if (!TCR_4(__kmp_init_middle)) 7411 __kmp_middle_initialize(); // get internal globals calculated 7412 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7413 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7414 7415 if (num_threads == 0) { 7416 if (__kmp_teams_thread_limit > 0) { 7417 num_threads = __kmp_teams_thread_limit; 7418 } else { 7419 num_threads = __kmp_avail_proc / num_teams; 7420 } 7421 // adjust num_threads w/o warning as it is not user setting 7422 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7423 // no thread_limit clause specified - do not change thread-limit-var ICV 7424 if (num_threads > __kmp_dflt_team_nth) { 7425 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7426 } 7427 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7428 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7429 } // prevent team size to exceed thread-limit-var 7430 if (num_teams * num_threads > __kmp_teams_max_nth) { 7431 num_threads = __kmp_teams_max_nth / num_teams; 7432 } 7433 if (num_threads == 0) { 7434 num_threads = 1; 7435 } 7436 } else { 7437 // This thread will be the primary thread of the league primary threads 7438 // Store new thread limit; old limit is saved in th_cg_roots list 7439 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7440 // num_threads = min(num_threads, nthreads-var) 7441 if (num_threads > __kmp_dflt_team_nth) { 7442 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7443 } 7444 if (num_teams * num_threads > __kmp_teams_max_nth) { 7445 int new_threads = __kmp_teams_max_nth / num_teams; 7446 if (new_threads == 0) { 7447 new_threads = 1; 7448 } 7449 if (new_threads != num_threads) { 7450 if (!__kmp_reserve_warn) { // user asked for too many threads 7451 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7452 __kmp_msg(kmp_ms_warning, 7453 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7454 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7455 } 7456 } 7457 num_threads = new_threads; 7458 } 7459 } 7460 thr->th.th_teams_size.nth = num_threads; 7461 } 7462 7463 /* this sets the requested number of teams for the teams region and/or 7464 the number of threads for the next parallel region encountered */ 7465 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7466 int num_threads) { 7467 kmp_info_t *thr = __kmp_threads[gtid]; 7468 KMP_DEBUG_ASSERT(num_teams >= 0); 7469 KMP_DEBUG_ASSERT(num_threads >= 0); 7470 7471 if (num_teams == 0) { 7472 if (__kmp_nteams > 0) { 7473 num_teams = __kmp_nteams; 7474 } else { 7475 num_teams = 1; // default number of teams is 1. 7476 } 7477 } 7478 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
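    // Illustration with made-up numbers: if __kmp_teams_max_nth == 256 and the
    // user requested 512 teams, num_teams is clamped to 256 below and a
    // one-time CantFormThrTeam warning is issued.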
7479 if (!__kmp_reserve_warn) { 7480 __kmp_reserve_warn = 1; 7481 __kmp_msg(kmp_ms_warning, 7482 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7483 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7484 } 7485 num_teams = __kmp_teams_max_nth; 7486 } 7487 // Set number of teams (number of threads in the outer "parallel" of the 7488 // teams) 7489 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7490 7491 __kmp_push_thread_limit(thr, num_teams, num_threads); 7492 } 7493 7494 /* This sets the requested number of teams for the teams region and/or 7495 the number of threads for the next parallel region encountered */ 7496 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7497 int num_teams_ub, int num_threads) { 7498 kmp_info_t *thr = __kmp_threads[gtid]; 7499 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7500 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7501 KMP_DEBUG_ASSERT(num_threads >= 0); 7502 7503 if (num_teams_lb > num_teams_ub) { 7504 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7505 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7506 } 7507 7508 int num_teams = 1; // defalt number of teams is 1. 7509 7510 if (num_teams_lb == 0 && num_teams_ub > 0) 7511 num_teams_lb = num_teams_ub; 7512 7513 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7514 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7515 if (num_teams > __kmp_teams_max_nth) { 7516 if (!__kmp_reserve_warn) { 7517 __kmp_reserve_warn = 1; 7518 __kmp_msg(kmp_ms_warning, 7519 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7520 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7521 } 7522 num_teams = __kmp_teams_max_nth; 7523 } 7524 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7525 num_teams = num_teams_ub; 7526 } else { // num_teams_lb <= num_teams <= num_teams_ub 7527 if (num_threads == 0) { 7528 if (num_teams_ub > __kmp_teams_max_nth) { 7529 num_teams = num_teams_lb; 7530 } else { 7531 num_teams = num_teams_ub; 7532 } 7533 } else { 7534 num_teams = (num_threads > __kmp_teams_max_nth) 7535 ? num_teams 7536 : __kmp_teams_max_nth / num_threads; 7537 if (num_teams < num_teams_lb) { 7538 num_teams = num_teams_lb; 7539 } else if (num_teams > num_teams_ub) { 7540 num_teams = num_teams_ub; 7541 } 7542 } 7543 } 7544 // Set number of teams (number of threads in the outer "parallel" of the 7545 // teams) 7546 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7547 7548 __kmp_push_thread_limit(thr, num_teams, num_threads); 7549 } 7550 7551 // Set the proc_bind var to use in the following parallel region. 7552 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7553 kmp_info_t *thr = __kmp_threads[gtid]; 7554 thr->th.th_set_proc_bind = proc_bind; 7555 } 7556 7557 /* Launch the worker threads into the microtask. */ 7558 7559 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7560 kmp_info_t *this_thr = __kmp_threads[gtid]; 7561 7562 #ifdef KMP_DEBUG 7563 int f; 7564 #endif /* KMP_DEBUG */ 7565 7566 KMP_DEBUG_ASSERT(team); 7567 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7568 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7569 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7570 7571 team->t.t_construct = 0; /* no single directives seen yet */ 7572 team->t.t_ordered.dt.t_value = 7573 0; /* thread 0 enters the ordered section first */ 7574 7575 /* Reset the identifiers on the dispatch buffer */ 7576 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7577 if (team->t.t_max_nproc > 1) { 7578 int i; 7579 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7580 team->t.t_disp_buffer[i].buffer_index = i; 7581 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7582 } 7583 } else { 7584 team->t.t_disp_buffer[0].buffer_index = 0; 7585 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7586 } 7587 7588 KMP_MB(); /* Flush all pending memory write invalidates. */ 7589 KMP_ASSERT(this_thr->th.th_team == team); 7590 7591 #ifdef KMP_DEBUG 7592 for (f = 0; f < team->t.t_nproc; f++) { 7593 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7594 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7595 } 7596 #endif /* KMP_DEBUG */ 7597 7598 /* release the worker threads so they may begin working */ 7599 __kmp_fork_barrier(gtid, 0); 7600 } 7601 7602 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7603 kmp_info_t *this_thr = __kmp_threads[gtid]; 7604 7605 KMP_DEBUG_ASSERT(team); 7606 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7607 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7608 KMP_MB(); /* Flush all pending memory write invalidates. */ 7609 7610 /* Join barrier after fork */ 7611 7612 #ifdef KMP_DEBUG 7613 if (__kmp_threads[gtid] && 7614 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7615 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7616 __kmp_threads[gtid]); 7617 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7618 "team->t.t_nproc=%d\n", 7619 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7620 team->t.t_nproc); 7621 __kmp_print_structure(); 7622 } 7623 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7624 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7625 #endif /* KMP_DEBUG */ 7626 7627 __kmp_join_barrier(gtid); /* wait for everyone */ 7628 #if OMPT_SUPPORT 7629 if (ompt_enabled.enabled && 7630 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7631 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7632 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7633 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7634 #if OMPT_OPTIONAL 7635 void *codeptr = NULL; 7636 if (KMP_MASTER_TID(ds_tid) && 7637 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7638 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7639 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7640 7641 if (ompt_enabled.ompt_callback_sync_region_wait) { 7642 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7643 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7644 codeptr); 7645 } 7646 if (ompt_enabled.ompt_callback_sync_region) { 7647 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7648 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7649 codeptr); 7650 } 7651 #endif 7652 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7653 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7654 ompt_scope_end, NULL, task_data, 0, ds_tid, 7655 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7656 } 7657 } 7658 #endif 7659 7660 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7661 KMP_ASSERT(this_thr->th.th_team == team); 7662 } 7663 7664 /* ------------------------------------------------------------------------ */ 7665 7666 #ifdef USE_LOAD_BALANCE 7667 7668 // Return the worker threads actively spinning in the hot team, if we 7669 // are at the outermost level of parallelism. Otherwise, return 0. 7670 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7671 int i; 7672 int retval; 7673 kmp_team_t *hot_team; 7674 7675 if (root->r.r_active) { 7676 return 0; 7677 } 7678 hot_team = root->r.r_hot_team; 7679 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7680 return hot_team->t.t_nproc - 1; // Don't count primary thread 7681 } 7682 7683 // Skip the primary thread - it is accounted for elsewhere. 7684 retval = 0; 7685 for (i = 1; i < hot_team->t.t_nproc; i++) { 7686 if (hot_team->t.t_threads[i]->th.th_active) { 7687 retval++; 7688 } 7689 } 7690 return retval; 7691 } 7692 7693 // Perform an automatic adjustment to the number of 7694 // threads used by the next parallel region. 7695 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7696 int retval; 7697 int pool_active; 7698 int hot_team_active; 7699 int team_curr_active; 7700 int system_active; 7701 7702 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7703 set_nproc)); 7704 KMP_DEBUG_ASSERT(root); 7705 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7706 ->th.th_current_task->td_icvs.dynamic == TRUE); 7707 KMP_DEBUG_ASSERT(set_nproc > 1); 7708 7709 if (set_nproc == 1) { 7710 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7711 return 1; 7712 } 7713 7714 // Threads that are active in the thread pool, active in the hot team for this 7715 // particular root (if we are at the outer par level), and the currently 7716 // executing thread (to become the primary thread) are available to add to the 7717 // new team, but are currently contributing to the system load, and must be 7718 // accounted for. 7719 pool_active = __kmp_thread_pool_active_nth; 7720 hot_team_active = __kmp_active_hot_team_nproc(root); 7721 team_curr_active = pool_active + hot_team_active + 1; 7722 7723 // Check the system load. 7724 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7725 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7726 "hot team active = %d\n", 7727 system_active, pool_active, hot_team_active)); 7728 7729 if (system_active < 0) { 7730 // There was an error reading the necessary info from /proc, so use the 7731 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7732 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7733 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7734 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7735 7736 // Make this call behave like the thread limit algorithm. 7737 retval = __kmp_avail_proc - __kmp_nth + 7738 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7739 if (retval > set_nproc) { 7740 retval = set_nproc; 7741 } 7742 if (retval < KMP_MIN_NTH) { 7743 retval = KMP_MIN_NTH; 7744 } 7745 7746 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7747 retval)); 7748 return retval; 7749 } 7750 7751 // There is a slight delay in the load balance algorithm in detecting new 7752 // running procs. The real system load at this instant should be at least as 7753 // large as the #active omp thread that are available to add to the team. 
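  // Illustration with made-up numbers: if __kmp_avail_proc == 8,
  // system_active == 5 and team_curr_active == 3, the computation below gives
  // retval = 8 - 5 + 3 = 6 candidate threads, which is then clamped into the
  // range [KMP_MIN_NTH, set_nproc].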
7754 if (system_active < team_curr_active) { 7755 system_active = team_curr_active; 7756 } 7757 retval = __kmp_avail_proc - system_active + team_curr_active; 7758 if (retval > set_nproc) { 7759 retval = set_nproc; 7760 } 7761 if (retval < KMP_MIN_NTH) { 7762 retval = KMP_MIN_NTH; 7763 } 7764 7765 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7766 return retval; 7767 } // __kmp_load_balance_nproc() 7768 7769 #endif /* USE_LOAD_BALANCE */ 7770 7771 /* ------------------------------------------------------------------------ */ 7772 7773 /* NOTE: this is called with the __kmp_init_lock held */ 7774 void __kmp_cleanup(void) { 7775 int f; 7776 7777 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7778 7779 if (TCR_4(__kmp_init_parallel)) { 7780 #if KMP_HANDLE_SIGNALS 7781 __kmp_remove_signals(); 7782 #endif 7783 TCW_4(__kmp_init_parallel, FALSE); 7784 } 7785 7786 if (TCR_4(__kmp_init_middle)) { 7787 #if KMP_AFFINITY_SUPPORTED 7788 __kmp_affinity_uninitialize(); 7789 #endif /* KMP_AFFINITY_SUPPORTED */ 7790 __kmp_cleanup_hierarchy(); 7791 TCW_4(__kmp_init_middle, FALSE); 7792 } 7793 7794 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7795 7796 if (__kmp_init_serial) { 7797 __kmp_runtime_destroy(); 7798 __kmp_init_serial = FALSE; 7799 } 7800 7801 __kmp_cleanup_threadprivate_caches(); 7802 7803 for (f = 0; f < __kmp_threads_capacity; f++) { 7804 if (__kmp_root[f] != NULL) { 7805 __kmp_free(__kmp_root[f]); 7806 __kmp_root[f] = NULL; 7807 } 7808 } 7809 __kmp_free(__kmp_threads); 7810 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7811 // there is no need in freeing __kmp_root. 7812 __kmp_threads = NULL; 7813 __kmp_root = NULL; 7814 __kmp_threads_capacity = 0; 7815 7816 #if KMP_USE_DYNAMIC_LOCK 7817 __kmp_cleanup_indirect_user_locks(); 7818 #else 7819 __kmp_cleanup_user_locks(); 7820 #endif 7821 7822 #if KMP_AFFINITY_SUPPORTED 7823 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7824 __kmp_cpuinfo_file = NULL; 7825 #endif /* KMP_AFFINITY_SUPPORTED */ 7826 7827 #if KMP_USE_ADAPTIVE_LOCKS 7828 #if KMP_DEBUG_ADAPTIVE_LOCKS 7829 __kmp_print_speculative_stats(); 7830 #endif 7831 #endif 7832 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7833 __kmp_nested_nth.nth = NULL; 7834 __kmp_nested_nth.size = 0; 7835 __kmp_nested_nth.used = 0; 7836 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7837 __kmp_nested_proc_bind.bind_types = NULL; 7838 __kmp_nested_proc_bind.size = 0; 7839 __kmp_nested_proc_bind.used = 0; 7840 if (__kmp_affinity_format) { 7841 KMP_INTERNAL_FREE(__kmp_affinity_format); 7842 __kmp_affinity_format = NULL; 7843 } 7844 7845 __kmp_i18n_catclose(); 7846 7847 #if KMP_USE_HIER_SCHED 7848 __kmp_hier_scheds.deallocate(); 7849 #endif 7850 7851 #if KMP_STATS_ENABLED 7852 __kmp_stats_fini(); 7853 #endif 7854 7855 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7856 } 7857 7858 /* ------------------------------------------------------------------------ */ 7859 7860 int __kmp_ignore_mppbeg(void) { 7861 char *env; 7862 7863 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7864 if (__kmp_str_match_false(env)) 7865 return FALSE; 7866 } 7867 // By default __kmpc_begin() is no-op. 7868 return TRUE; 7869 } 7870 7871 int __kmp_ignore_mppend(void) { 7872 char *env; 7873 7874 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7875 if (__kmp_str_match_false(env)) 7876 return FALSE; 7877 } 7878 // By default __kmpc_end() is no-op. 
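  // (So a user who sets KMP_IGNORE_MPPEND to a "false" value such as 0 gets
  // FALSE from the early return above, and __kmpc_end() is then no longer
  // treated as a no-op; the value 0 here is just an example.)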
7879 return TRUE; 7880 } 7881 7882 void __kmp_internal_begin(void) { 7883 int gtid; 7884 kmp_root_t *root; 7885 7886 /* this is a very important step as it will register new sibling threads 7887 and assign these new uber threads a new gtid */ 7888 gtid = __kmp_entry_gtid(); 7889 root = __kmp_threads[gtid]->th.th_root; 7890 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7891 7892 if (root->r.r_begin) 7893 return; 7894 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7895 if (root->r.r_begin) { 7896 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7897 return; 7898 } 7899 7900 root->r.r_begin = TRUE; 7901 7902 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7903 } 7904 7905 /* ------------------------------------------------------------------------ */ 7906 7907 void __kmp_user_set_library(enum library_type arg) { 7908 int gtid; 7909 kmp_root_t *root; 7910 kmp_info_t *thread; 7911 7912 /* first, make sure we are initialized so we can get our gtid */ 7913 7914 gtid = __kmp_entry_gtid(); 7915 thread = __kmp_threads[gtid]; 7916 7917 root = thread->th.th_root; 7918 7919 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7920 library_serial)); 7921 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7922 thread */ 7923 KMP_WARNING(SetLibraryIncorrectCall); 7924 return; 7925 } 7926 7927 switch (arg) { 7928 case library_serial: 7929 thread->th.th_set_nproc = 0; 7930 set__nproc(thread, 1); 7931 break; 7932 case library_turnaround: 7933 thread->th.th_set_nproc = 0; 7934 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7935 : __kmp_dflt_team_nth_ub); 7936 break; 7937 case library_throughput: 7938 thread->th.th_set_nproc = 0; 7939 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7940 : __kmp_dflt_team_nth_ub); 7941 break; 7942 default: 7943 KMP_FATAL(UnknownLibraryType, arg); 7944 } 7945 7946 __kmp_aux_set_library(arg); 7947 } 7948 7949 void __kmp_aux_set_stacksize(size_t arg) { 7950 if (!__kmp_init_serial) 7951 __kmp_serial_initialize(); 7952 7953 #if KMP_OS_DARWIN 7954 if (arg & (0x1000 - 1)) { 7955 arg &= ~(0x1000 - 1); 7956 if (arg + 0x1000) /* check for overflow if we round up */ 7957 arg += 0x1000; 7958 } 7959 #endif 7960 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7961 7962 /* only change the default stacksize before the first parallel region */ 7963 if (!TCR_4(__kmp_init_parallel)) { 7964 size_t value = arg; /* argument is in bytes */ 7965 7966 if (value < __kmp_sys_min_stksize) 7967 value = __kmp_sys_min_stksize; 7968 else if (value > KMP_MAX_STKSIZE) 7969 value = KMP_MAX_STKSIZE; 7970 7971 __kmp_stksize = value; 7972 7973 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7974 } 7975 7976 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7977 } 7978 7979 /* set the behaviour of the runtime library */ 7980 /* TODO this can cause some odd behaviour with sibling parallelism... 
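   For reference: __kmp_user_set_library() above forwards here after adjusting
   the nproc setting of the calling thread, and the throughput case below drops
   the default blocktime to 200 ms when it is still KMP_MAX_BLOCKTIME
   (effectively "infinite").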
*/
void __kmp_aux_set_library(enum library_type arg) {
  __kmp_library = arg;

  switch (__kmp_library) {
  case library_serial: {
    KMP_INFORM(LibraryIsSerial);
  } break;
  case library_turnaround:
    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
      __kmp_use_yield = 2; // only yield when oversubscribed
    break;
  case library_throughput:
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
      __kmp_dflt_blocktime = 200;
    break;
  default:
    KMP_FATAL(UnknownLibraryType, arg);
  }
}

/* Get team information common to all teams API routines */
// Returns NULL if not in a teams construct
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
  kmp_info_t *thr = __kmp_entry_thread();
  teams_serialized = 0;
  if (thr->th.th_teams_microtask) {
    kmp_team_t *team = thr->th.th_team;
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    int ii = team->t.t_level;
    teams_serialized = team->t.t_serialized;
    int level = tlevel + 1;
    KMP_DEBUG_ASSERT(ii >= tlevel);
    while (ii > level) {
      for (teams_serialized = team->t.t_serialized;
           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
      }
      if (team->t.t_serialized && (!teams_serialized)) {
        team = team->t.t_parent;
        continue;
      }
      if (ii > level) {
        team = team->t.t_parent;
        ii--;
      }
    }
    return team;
  }
  return NULL;
}

int __kmp_aux_get_team_num() {
  int serialized;
  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
  if (team) {
    if (serialized > 1) {
      return 0; // teams region is serialized (1 team of 1 thread)
    } else {
      return team->t.t_master_tid;
    }
  }
  return 0;
}

int __kmp_aux_get_num_teams() {
  int serialized;
  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
  if (team) {
    if (serialized > 1) {
      return 1;
    } else {
      return team->t.t_parent->t.t_nproc;
    }
  }
  return 1;
}

/* ------------------------------------------------------------------------ */

/*
 * Affinity Format Parser
 *
 * A field has the form: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either a single char or a long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => pad with leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types (see __kmp_affinity_format_table below):
 * t {team_num}         - omp_get_team_num()
 * T {num_teams}        - omp_get_num_teams()
 * L {nesting_level}    - omp_get_level()
 * n {thread_num}       - omp_get_thread_num()
 * N {num_threads}      - omp_get_num_threads()
 * a {ancestor_tnum}    - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}             - name of host machine
 * P {process_id}       - process id (integer)
 * i {native_thread_id} - native thread identifier (integer)
 * A {thread_affinity}  - comma separated list of integers or integer ranges
 *                        (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */

// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword
// field types.
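// A purely illustrative example (this exact string is not defined anywhere in
// the runtime): the format "OMP: host %H pid %P thread %0.4n affinity %A"
// would print the host name, the process id, the thread number right-justified
// and zero-padded to at least 4 characters, and the affinity mask of the
// calling thread, resolving each %-field through the table defined below.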
8090 typedef struct kmp_affinity_format_field_t { 8091 char short_name; // from spec e.g., L -> thread level 8092 const char *long_name; // from spec thread_level -> thread level 8093 char field_format; // data type for snprintf (typically 'd' or 's' 8094 // for integer or string) 8095 } kmp_affinity_format_field_t; 8096 8097 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8098 #if KMP_AFFINITY_SUPPORTED 8099 {'A', "thread_affinity", 's'}, 8100 #endif 8101 {'t', "team_num", 'd'}, 8102 {'T', "num_teams", 'd'}, 8103 {'L', "nesting_level", 'd'}, 8104 {'n', "thread_num", 'd'}, 8105 {'N', "num_threads", 'd'}, 8106 {'a', "ancestor_tnum", 'd'}, 8107 {'H', "host", 's'}, 8108 {'P', "process_id", 'd'}, 8109 {'i', "native_thread_id", 'd'}}; 8110 8111 // Return the number of characters it takes to hold field 8112 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8113 const char **ptr, 8114 kmp_str_buf_t *field_buffer) { 8115 int rc, format_index, field_value; 8116 const char *width_left, *width_right; 8117 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8118 static const int FORMAT_SIZE = 20; 8119 char format[FORMAT_SIZE] = {0}; 8120 char absolute_short_name = 0; 8121 8122 KMP_DEBUG_ASSERT(gtid >= 0); 8123 KMP_DEBUG_ASSERT(th); 8124 KMP_DEBUG_ASSERT(**ptr == '%'); 8125 KMP_DEBUG_ASSERT(field_buffer); 8126 8127 __kmp_str_buf_clear(field_buffer); 8128 8129 // Skip the initial % 8130 (*ptr)++; 8131 8132 // Check for %% first 8133 if (**ptr == '%') { 8134 __kmp_str_buf_cat(field_buffer, "%", 1); 8135 (*ptr)++; // skip over the second % 8136 return 1; 8137 } 8138 8139 // Parse field modifiers if they are present 8140 pad_zeros = false; 8141 if (**ptr == '0') { 8142 pad_zeros = true; 8143 (*ptr)++; // skip over 0 8144 } 8145 right_justify = false; 8146 if (**ptr == '.') { 8147 right_justify = true; 8148 (*ptr)++; // skip over . 8149 } 8150 // Parse width of field: [width_left, width_right) 8151 width_left = width_right = NULL; 8152 if (**ptr >= '0' && **ptr <= '9') { 8153 width_left = *ptr; 8154 SKIP_DIGITS(*ptr); 8155 width_right = *ptr; 8156 } 8157 8158 // Create the format for KMP_SNPRINTF based on flags parsed above 8159 format_index = 0; 8160 format[format_index++] = '%'; 8161 if (!right_justify) 8162 format[format_index++] = '-'; 8163 if (pad_zeros) 8164 format[format_index++] = '0'; 8165 if (width_left && width_right) { 8166 int i = 0; 8167 // Only allow 8 digit number widths. 
    // This also prevents overflowing the format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}

/*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null byte).
 * The resulting string is printed to the buffer,
 * which the caller can then handle afterwards.
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or a zero-length string, then we use the
  // affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Record that blocktime has been explicitly set */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
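/* Illustrative sketch (not part of the runtime): the capture/display helpers
   above back the OpenMP 5.0 affinity-format routines in this runtime, so a
   user program can exercise them roughly as follows. The format string below
   is made up for the example; the field types are the ones accepted by the
   table above.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       // Overrides the affinity-format-var ICV (OMP_AFFINITY_FORMAT).
       omp_set_affinity_format("host=%H pid=%P thread=%0.4n of %N");
       #pragma omp parallel
       {
         char buf[256];
         // Passing NULL for the format falls back to the ICV, mirroring
         // __kmp_aux_capture_affinity above; the return value is the number
         // of characters needed, not counting the terminating null byte.
         size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
         printf("%s (needed=%zu)\n", buf, needed);
         omp_display_affinity(NULL); // prints the same string to stdout
       }
       return 0;
     }
*/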

/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume resets __kmp_pause_status to not-paused and wakes up all
// threads.
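// A thread that currently holds its suspend mutex may be just about to go to
// sleep, so the wake-up loop below keeps retrying until that thread either
// sleeps (and can be resumed) or the mutex is acquired (showing it will not
// sleep).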
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads, in case
  // a regular thread pushes a hidden helper task to a hidden helper thread
  // that has not yet been awakened since being released by the main thread
  // after the team was created.
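  // The atomic increment and spin below therefore act as a simple arrival
  // barrier: every hidden helper thread bumps the counter and then waits
  // until all of them have checked in before proceeding.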
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If this is the main thread, wait for the signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for the hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
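/* Illustrative sketch (not part of the runtime): in this runtime the pause
   handling earlier in this file (__kmp_soft_pause, __kmp_hard_pause, and
   __kmp_pause_resource, reached via __kmpc_pause_resource) sits behind the
   OpenMP 5.0 pause API, which a program can exercise roughly as follows.
   Error handling is minimal and the device argument is simplified for the
   example.

     #include <omp.h>
     #include <stdio.h>

     int main(void) {
       #pragma omp parallel
       {
         // warm up the runtime and its worker threads
       }

       // Soft pause: worker threads go to sleep, runtime state is retained.
       if (omp_pause_resource_all(omp_pause_soft) != 0)
         printf("soft pause rejected (runtime already paused?)\n");

       #pragma omp parallel
       {
         // using OpenMP again resumes the soft-paused threads
       }

       // Hard pause: shuts the runtime down; it re-initializes on next use.
       if (omp_pause_resource(omp_pause_hard, omp_get_initial_device()) != 0)
         printf("hard pause rejected\n");
       return 0;
     }
*/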