1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 #if OMPD_SUPPORT 35 #include "ompd-specific.h" 36 #endif 37 38 #if OMP_PROFILING_SUPPORT 39 #include "llvm/Support/TimeProfiler.h" 40 static char *ProfileTraceFile = nullptr; 41 #endif 42 43 /* these are temporary issues to be dealt with */ 44 #define KMP_USE_PRCTL 0 45 46 #if KMP_OS_WINDOWS 47 #include <process.h> 48 #endif 49 50 #include "tsan_annotations.h" 51 52 #if KMP_OS_WINDOWS 53 // windows does not need include files as it doesn't use shared memory 54 #else 55 #include <sys/mman.h> 56 #include <sys/stat.h> 57 #include <fcntl.h> 58 #define SHM_SIZE 1024 59 #endif 60 61 #if defined(KMP_GOMP_COMPAT) 62 char const __kmp_version_alt_comp[] = 63 KMP_VERSION_PREFIX "alternative compiler support: yes"; 64 #endif /* defined(KMP_GOMP_COMPAT) */ 65 66 char const __kmp_version_omp_api[] = 67 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 68 69 #ifdef KMP_DEBUG 70 char const __kmp_version_lock[] = 71 KMP_VERSION_PREFIX "lock type: run time selectable"; 72 #endif /* KMP_DEBUG */ 73 74 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 75 76 /* ------------------------------------------------------------------------ */ 77 78 #if KMP_USE_MONITOR 79 kmp_info_t __kmp_monitor; 80 #endif 81 82 /* Forward declarations */ 83 84 void __kmp_cleanup(void); 85 86 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 87 int gtid); 88 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 89 kmp_internal_control_t *new_icvs, 90 ident_t *loc); 91 #if KMP_AFFINITY_SUPPORTED 92 static void __kmp_partition_places(kmp_team_t *team, 93 int update_master_only = 0); 94 #endif 95 static void __kmp_do_serial_initialize(void); 96 void __kmp_fork_barrier(int gtid, int tid); 97 void __kmp_join_barrier(int gtid); 98 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 99 kmp_internal_control_t *new_icvs, ident_t *loc); 100 101 #ifdef USE_LOAD_BALANCE 102 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 103 #endif 104 105 static int __kmp_expand_threads(int nNeed); 106 #if KMP_OS_WINDOWS 107 static int __kmp_unregister_root_other_thread(int gtid); 108 #endif 109 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 110 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 111 112 /* Calculate the identifier of the current thread */ 113 /* fast (and somewhat portable) way to get unique identifier of executing 114 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
*/ 115 int __kmp_get_global_thread_id() { 116 int i; 117 kmp_info_t **other_threads; 118 size_t stack_data; 119 char *stack_addr; 120 size_t stack_size; 121 char *stack_base; 122 123 KA_TRACE( 124 1000, 125 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 126 __kmp_nth, __kmp_all_nth)); 127 128 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 129 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 130 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 131 __kmp_init_gtid for this to work. */ 132 133 if (!TCR_4(__kmp_init_gtid)) 134 return KMP_GTID_DNE; 135 136 #ifdef KMP_TDATA_GTID 137 if (TCR_4(__kmp_gtid_mode) >= 3) { 138 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 139 return __kmp_gtid; 140 } 141 #endif 142 if (TCR_4(__kmp_gtid_mode) >= 2) { 143 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 144 return __kmp_gtid_get_specific(); 145 } 146 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 147 148 stack_addr = (char *)&stack_data; 149 other_threads = __kmp_threads; 150 151 /* ATT: The code below is a source of potential bugs due to unsynchronized 152 access to __kmp_threads array. For example: 153 1. Current thread loads other_threads[i] to thr and checks it, it is 154 non-NULL. 155 2. Current thread is suspended by OS. 156 3. Another thread unregisters and finishes (debug versions of free() 157 may fill memory with something like 0xEF). 158 4. Current thread is resumed. 159 5. Current thread reads junk from *thr. 160 TODO: Fix it. --ln */ 161 162 for (i = 0; i < __kmp_threads_capacity; i++) { 163 164 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 165 if (!thr) 166 continue; 167 168 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 169 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 170 171 /* stack grows down -- search through all of the active threads */ 172 173 if (stack_addr <= stack_base) { 174 size_t stack_diff = stack_base - stack_addr; 175 176 if (stack_diff <= stack_size) { 177 /* The only way we can be closer than the allocated */ 178 /* stack size is if we are running on this thread. */ 179 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 180 return i; 181 } 182 } 183 } 184 185 /* get specific to try and determine our gtid */ 186 KA_TRACE(1000, 187 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 188 "thread, using TLS\n")); 189 i = __kmp_gtid_get_specific(); 190 191 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 192 193 /* if we havn't been assigned a gtid, then return code */ 194 if (i < 0) 195 return i; 196 197 /* dynamically updated stack window for uber threads to avoid get_specific 198 call */ 199 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 200 KMP_FATAL(StackOverflow, i); 201 } 202 203 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 204 if (stack_addr > stack_base) { 205 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 206 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 207 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 208 stack_base); 209 } else { 210 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 211 stack_base - stack_addr); 212 } 213 214 /* Reprint stack bounds for ubermaster since they have been refined */ 215 if (__kmp_storage_map) { 216 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 217 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 218 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 219 other_threads[i]->th.th_info.ds.ds_stacksize, 220 "th_%d stack (refinement)", i); 221 } 222 return i; 223 } 224 225 int __kmp_get_global_thread_id_reg() { 226 int gtid; 227 228 if (!__kmp_init_serial) { 229 gtid = KMP_GTID_DNE; 230 } else 231 #ifdef KMP_TDATA_GTID 232 if (TCR_4(__kmp_gtid_mode) >= 3) { 233 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 234 gtid = __kmp_gtid; 235 } else 236 #endif 237 if (TCR_4(__kmp_gtid_mode) >= 2) { 238 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 239 gtid = __kmp_gtid_get_specific(); 240 } else { 241 KA_TRACE(1000, 242 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 243 gtid = __kmp_get_global_thread_id(); 244 } 245 246 /* we must be a new uber master sibling thread */ 247 if (gtid == KMP_GTID_DNE) { 248 KA_TRACE(10, 249 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 250 "Registering a new gtid.\n")); 251 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 252 if (!__kmp_init_serial) { 253 __kmp_do_serial_initialize(); 254 gtid = __kmp_gtid_get_specific(); 255 } else { 256 gtid = __kmp_register_root(FALSE); 257 } 258 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 259 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 260 } 261 262 KMP_DEBUG_ASSERT(gtid >= 0); 263 264 return gtid; 265 } 266 267 /* caller must hold forkjoin_lock */ 268 void __kmp_check_stack_overlap(kmp_info_t *th) { 269 int f; 270 char *stack_beg = NULL; 271 char *stack_end = NULL; 272 int gtid; 273 274 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 275 if (__kmp_storage_map) { 276 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 277 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 278 279 gtid = __kmp_gtid_from_thread(th); 280 281 if (gtid == KMP_GTID_MONITOR) { 282 __kmp_print_storage_map_gtid( 283 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 284 "th_%s stack (%s)", "mon", 285 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 286 } else { 287 __kmp_print_storage_map_gtid( 288 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 289 "th_%d stack (%s)", gtid, 290 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 291 } 292 } 293 294 /* No point in checking ubermaster threads since they use refinement and 295 * cannot overlap */ 296 gtid = __kmp_gtid_from_thread(th); 297 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 298 KA_TRACE(10, 299 ("__kmp_check_stack_overlap: performing extensive checking\n")); 300 if (stack_beg == NULL) { 301 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 302 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 303 } 304 305 for (f = 0; f < __kmp_threads_capacity; f++) { 306 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 307 308 if (f_th && f_th != th) { 309 char *other_stack_end = 310 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 311 char *other_stack_beg = 312 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 313 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 314 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 315 316 /* Print the other stack values before the abort */ 317 if (__kmp_storage_map) 318 __kmp_print_storage_map_gtid( 319 -1, other_stack_beg, other_stack_end, 320 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 321 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 322 323 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 324 __kmp_msg_null); 325 } 326 } 327 } 328 } 329 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 330 } 331 332 /* ------------------------------------------------------------------------ */ 333 334 void __kmp_infinite_loop(void) { 335 static int done = FALSE; 336 337 while (!done) { 338 KMP_YIELD(TRUE); 339 } 340 } 341 342 #define MAX_MESSAGE 512 343 344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 345 char const *format, ...) { 346 char buffer[MAX_MESSAGE]; 347 va_list ap; 348 349 va_start(ap, format); 350 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 351 p2, (unsigned long)size, format); 352 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 353 __kmp_vprintf(kmp_err, buffer, ap); 354 #if KMP_PRINT_DATA_PLACEMENT 355 int node; 356 if (gtid >= 0) { 357 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 358 if (__kmp_storage_map_verbose) { 359 node = __kmp_get_host_node(p1); 360 if (node < 0) /* doesn't work, so don't try this next time */ 361 __kmp_storage_map_verbose = FALSE; 362 else { 363 char *last; 364 int lastNode; 365 int localProc = __kmp_get_cpu_from_gtid(gtid); 366 367 const int page_size = KMP_GET_PAGE_SIZE(); 368 369 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 370 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 371 if (localProc >= 0) 372 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 373 localProc >> 1); 374 else 375 __kmp_printf_no_lock(" GTID %d\n", gtid); 376 #if KMP_USE_PRCTL 377 /* The more elaborate format is disabled for now because of the prctl 378 * hanging bug. */ 379 do { 380 last = p1; 381 lastNode = node; 382 /* This loop collates adjacent pages with the same host node. 
*/ 383 do { 384 (char *)p1 += page_size; 385 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 386 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 387 lastNode); 388 } while (p1 <= p2); 389 #else 390 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 391 (char *)p1 + (page_size - 1), 392 __kmp_get_host_node(p1)); 393 if (p1 < p2) { 394 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 395 (char *)p2 + (page_size - 1), 396 __kmp_get_host_node(p2)); 397 } 398 #endif 399 } 400 } 401 } else 402 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 403 } 404 #endif /* KMP_PRINT_DATA_PLACEMENT */ 405 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 406 } 407 408 void __kmp_warn(char const *format, ...) { 409 char buffer[MAX_MESSAGE]; 410 va_list ap; 411 412 if (__kmp_generate_warnings == kmp_warnings_off) { 413 return; 414 } 415 416 va_start(ap, format); 417 418 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 419 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 420 __kmp_vprintf(kmp_err, buffer, ap); 421 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 422 423 va_end(ap); 424 } 425 426 void __kmp_abort_process() { 427 // Later threads may stall here, but that's ok because abort() will kill them. 428 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 429 430 if (__kmp_debug_buf) { 431 __kmp_dump_debug_buffer(); 432 } 433 434 if (KMP_OS_WINDOWS) { 435 // Let other threads know of abnormal termination and prevent deadlock 436 // if abort happened during library initialization or shutdown 437 __kmp_global.g.g_abort = SIGABRT; 438 439 /* On Windows* OS by default abort() causes pop-up error box, which stalls 440 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 441 boxes. _set_abort_behavior() works well, but this function is not 442 available in VS7 (this is not problem for DLL, but it is a problem for 443 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 444 help, at least in some versions of MS C RTL. 445 446 It seems following sequence is the only way to simulate abort() and 447 avoid pop-up error box. */ 448 raise(SIGABRT); 449 _exit(3); // Just in case, if signal ignored, exit anyway. 450 } else { 451 __kmp_unregister_library(); 452 abort(); 453 } 454 455 __kmp_infinite_loop(); 456 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 457 458 } // __kmp_abort_process 459 460 void __kmp_abort_thread(void) { 461 // TODO: Eliminate g_abort global variable and this function. 462 // In case of abort just call abort(), it will kill all the threads. 463 __kmp_infinite_loop(); 464 } // __kmp_abort_thread 465 466 /* Print out the storage map for the major kmp_info_t thread data structures 467 that are allocated together. 
*/ 468 469 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 470 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 471 gtid); 472 473 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 474 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 475 476 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 477 sizeof(kmp_local_t), "th_%d.th_local", gtid); 478 479 __kmp_print_storage_map_gtid( 480 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 481 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 482 483 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 484 &thr->th.th_bar[bs_plain_barrier + 1], 485 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 486 gtid); 487 488 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 489 &thr->th.th_bar[bs_forkjoin_barrier + 1], 490 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 491 gtid); 492 493 #if KMP_FAST_REDUCTION_BARRIER 494 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 495 &thr->th.th_bar[bs_reduction_barrier + 1], 496 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 497 gtid); 498 #endif // KMP_FAST_REDUCTION_BARRIER 499 } 500 501 /* Print out the storage map for the major kmp_team_t team data structures 502 that are allocated together. */ 503 504 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 505 int team_id, int num_thr) { 506 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 507 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 508 header, team_id); 509 510 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 511 &team->t.t_bar[bs_last_barrier], 512 sizeof(kmp_balign_team_t) * bs_last_barrier, 513 "%s_%d.t_bar", header, team_id); 514 515 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 516 &team->t.t_bar[bs_plain_barrier + 1], 517 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 518 header, team_id); 519 520 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 521 &team->t.t_bar[bs_forkjoin_barrier + 1], 522 sizeof(kmp_balign_team_t), 523 "%s_%d.t_bar[forkjoin]", header, team_id); 524 525 #if KMP_FAST_REDUCTION_BARRIER 526 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 527 &team->t.t_bar[bs_reduction_barrier + 1], 528 sizeof(kmp_balign_team_t), 529 "%s_%d.t_bar[reduction]", header, team_id); 530 #endif // KMP_FAST_REDUCTION_BARRIER 531 532 __kmp_print_storage_map_gtid( 533 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 534 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 535 536 __kmp_print_storage_map_gtid( 537 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 538 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 539 540 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 541 &team->t.t_disp_buffer[num_disp_buff], 542 sizeof(dispatch_shared_info_t) * num_disp_buff, 543 "%s_%d.t_disp_buffer", header, team_id); 544 } 545 546 static void __kmp_init_allocator() { 547 __kmp_init_memkind(); 548 __kmp_init_target_mem(); 549 } 550 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 551 552 /* ------------------------------------------------------------------------ */ 553 554 #if KMP_DYNAMIC_LIB 555 #if KMP_OS_WINDOWS 556 557 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 558 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 559 560 
switch (fdwReason) { 561 562 case DLL_PROCESS_ATTACH: 563 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 564 565 return TRUE; 566 567 case DLL_PROCESS_DETACH: 568 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 569 570 // According to Windows* documentation for DllMain entry point: 571 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 572 // lpReserved == NULL when FreeLibrary() is called, 573 // lpReserved != NULL when the process is terminated. 574 // When FreeLibrary() is called, worker threads remain alive. So the 575 // runtime's state is consistent and executing proper shutdown is OK. 576 // When the process is terminated, worker threads have exited or been 577 // forcefully terminated by the OS and only the shutdown thread remains. 578 // This can leave the runtime in an inconsistent state. 579 // Hence, only attempt proper cleanup when FreeLibrary() is called. 580 // Otherwise, rely on OS to reclaim resources. 581 if (lpReserved == NULL) 582 __kmp_internal_end_library(__kmp_gtid_get_specific()); 583 584 return TRUE; 585 586 case DLL_THREAD_ATTACH: 587 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 588 589 /* if we want to register new siblings all the time here call 590 * __kmp_get_gtid(); */ 591 return TRUE; 592 593 case DLL_THREAD_DETACH: 594 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 595 596 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 597 return TRUE; 598 } 599 600 return TRUE; 601 } 602 603 #endif /* KMP_OS_WINDOWS */ 604 #endif /* KMP_DYNAMIC_LIB */ 605 606 /* __kmp_parallel_deo -- Wait until it's our turn. */ 607 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 608 int gtid = *gtid_ref; 609 #ifdef BUILD_PARALLEL_ORDERED 610 kmp_team_t *team = __kmp_team_from_gtid(gtid); 611 #endif /* BUILD_PARALLEL_ORDERED */ 612 613 if (__kmp_env_consistency_check) { 614 if (__kmp_threads[gtid]->th.th_root->r.r_active) 615 #if KMP_USE_DYNAMIC_LOCK 616 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 617 #else 618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 619 #endif 620 } 621 #ifdef BUILD_PARALLEL_ORDERED 622 if (!team->t.t_serialized) { 623 KMP_MB(); 624 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 625 NULL); 626 KMP_MB(); 627 } 628 #endif /* BUILD_PARALLEL_ORDERED */ 629 } 630 631 /* __kmp_parallel_dxo -- Signal the next task. */ 632 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 633 int gtid = *gtid_ref; 634 #ifdef BUILD_PARALLEL_ORDERED 635 int tid = __kmp_tid_from_gtid(gtid); 636 kmp_team_t *team = __kmp_team_from_gtid(gtid); 637 #endif /* BUILD_PARALLEL_ORDERED */ 638 639 if (__kmp_env_consistency_check) { 640 if (__kmp_threads[gtid]->th.th_root->r.r_active) 641 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 642 } 643 #ifdef BUILD_PARALLEL_ORDERED 644 if (!team->t.t_serialized) { 645 KMP_MB(); /* Flush all pending memory write invalidates. */ 646 647 /* use the tid of the next thread in this team */ 648 /* TODO replace with general release procedure */ 649 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 650 651 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 652 } 653 #endif /* BUILD_PARALLEL_ORDERED */ 654 } 655 656 /* ------------------------------------------------------------------------ */ 657 /* The BARRIER for a SINGLE process section is always explicit */ 658 659 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 660 int status; 661 kmp_info_t *th; 662 kmp_team_t *team; 663 664 if (!TCR_4(__kmp_init_parallel)) 665 __kmp_parallel_initialize(); 666 __kmp_resume_if_soft_paused(); 667 668 th = __kmp_threads[gtid]; 669 team = th->th.th_team; 670 status = 0; 671 672 th->th.th_ident = id_ref; 673 674 if (team->t.t_serialized) { 675 status = 1; 676 } else { 677 kmp_int32 old_this = th->th.th_local.this_construct; 678 679 ++th->th.th_local.this_construct; 680 /* try to set team count to thread count--success means thread got the 681 single block */ 682 /* TODO: Should this be acquire or release? */ 683 if (team->t.t_construct == old_this) { 684 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 685 th->th.th_local.this_construct); 686 } 687 #if USE_ITT_BUILD 688 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 689 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 690 team->t.t_active_level == 1) { 691 // Only report metadata by primary thread of active team at level 1 692 __kmp_itt_metadata_single(id_ref); 693 } 694 #endif /* USE_ITT_BUILD */ 695 } 696 697 if (__kmp_env_consistency_check) { 698 if (status && push_ws) { 699 __kmp_push_workshare(gtid, ct_psingle, id_ref); 700 } else { 701 __kmp_check_workshare(gtid, ct_psingle, id_ref); 702 } 703 } 704 #if USE_ITT_BUILD 705 if (status) { 706 __kmp_itt_single_start(gtid); 707 } 708 #endif /* USE_ITT_BUILD */ 709 return status; 710 } 711 712 void __kmp_exit_single(int gtid) { 713 #if USE_ITT_BUILD 714 __kmp_itt_single_end(gtid); 715 #endif /* USE_ITT_BUILD */ 716 if (__kmp_env_consistency_check) 717 __kmp_pop_workshare(gtid, ct_psingle, NULL); 718 } 719 720 /* determine if we can go parallel or must use a serialized parallel region and 721 * how many threads we can use 722 * set_nproc is the number of threads requested for the team 723 * returns 0 if we should serialize or only use one thread, 724 * otherwise the number of threads to use 725 * The forkjoin lock is held by the caller. */ 726 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 727 int master_tid, int set_nthreads, 728 int enter_teams) { 729 int capacity; 730 int new_nthreads; 731 KMP_DEBUG_ASSERT(__kmp_init_serial); 732 KMP_DEBUG_ASSERT(root && parent_team); 733 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 734 735 // If dyn-var is set, dynamically adjust the number of desired threads, 736 // according to the method specified by dynamic_mode. 737 new_nthreads = set_nthreads; 738 if (!get__dynamic_2(parent_team, master_tid)) { 739 ; 740 } 741 #ifdef USE_LOAD_BALANCE 742 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 743 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 744 if (new_nthreads == 1) { 745 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 746 "reservation to 1 thread\n", 747 master_tid)); 748 return 1; 749 } 750 if (new_nthreads < set_nthreads) { 751 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 752 "reservation to %d threads\n", 753 master_tid, new_nthreads)); 754 } 755 } 756 #endif /* USE_LOAD_BALANCE */ 757 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 758 new_nthreads = __kmp_avail_proc - __kmp_nth + 759 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 760 if (new_nthreads <= 1) { 761 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 762 "reservation to 1 thread\n", 763 master_tid)); 764 return 1; 765 } 766 if (new_nthreads < set_nthreads) { 767 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 768 "reservation to %d threads\n", 769 master_tid, new_nthreads)); 770 } else { 771 new_nthreads = set_nthreads; 772 } 773 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 774 if (set_nthreads > 2) { 775 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 776 new_nthreads = (new_nthreads % set_nthreads) + 1; 777 if (new_nthreads == 1) { 778 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 779 "reservation to 1 thread\n", 780 master_tid)); 781 return 1; 782 } 783 if (new_nthreads < set_nthreads) { 784 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 785 "reservation to %d threads\n", 786 master_tid, new_nthreads)); 787 } 788 } 789 } else { 790 KMP_ASSERT(0); 791 } 792 793 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 794 if (__kmp_nth + new_nthreads - 795 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 796 __kmp_max_nth) { 797 int tl_nthreads = __kmp_max_nth - __kmp_nth + 798 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 799 if (tl_nthreads <= 0) { 800 tl_nthreads = 1; 801 } 802 803 // If dyn-var is false, emit a 1-time warning. 804 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 805 __kmp_reserve_warn = 1; 806 __kmp_msg(kmp_ms_warning, 807 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 808 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 809 } 810 if (tl_nthreads == 1) { 811 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 812 "reduced reservation to 1 thread\n", 813 master_tid)); 814 return 1; 815 } 816 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 817 "reservation to %d threads\n", 818 master_tid, tl_nthreads)); 819 new_nthreads = tl_nthreads; 820 } 821 822 // Respect OMP_THREAD_LIMIT 823 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 824 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 825 if (cg_nthreads + new_nthreads - 826 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 827 max_cg_threads) { 828 int tl_nthreads = max_cg_threads - cg_nthreads + 829 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 830 if (tl_nthreads <= 0) { 831 tl_nthreads = 1; 832 } 833 834 // If dyn-var is false, emit a 1-time warning. 835 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 836 __kmp_reserve_warn = 1; 837 __kmp_msg(kmp_ms_warning, 838 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 839 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 840 } 841 if (tl_nthreads == 1) { 842 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 843 "reduced reservation to 1 thread\n", 844 master_tid)); 845 return 1; 846 } 847 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 848 "reservation to %d threads\n", 849 master_tid, tl_nthreads)); 850 new_nthreads = tl_nthreads; 851 } 852 853 // Check if the threads array is large enough, or needs expanding. 854 // See comment in __kmp_register_root() about the adjustment if 855 // __kmp_threads[0] == NULL. 
856 capacity = __kmp_threads_capacity; 857 if (TCR_PTR(__kmp_threads[0]) == NULL) { 858 --capacity; 859 } 860 // If it is not for initializing the hidden helper team, we need to take 861 // __kmp_hidden_helper_threads_num out of the capacity because it is included 862 // in __kmp_threads_capacity. 863 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 864 capacity -= __kmp_hidden_helper_threads_num; 865 } 866 if (__kmp_nth + new_nthreads - 867 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 868 capacity) { 869 // Expand the threads array. 870 int slotsRequired = __kmp_nth + new_nthreads - 871 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 872 capacity; 873 int slotsAdded = __kmp_expand_threads(slotsRequired); 874 if (slotsAdded < slotsRequired) { 875 // The threads array was not expanded enough. 876 new_nthreads -= (slotsRequired - slotsAdded); 877 KMP_ASSERT(new_nthreads >= 1); 878 879 // If dyn-var is false, emit a 1-time warning. 880 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 881 __kmp_reserve_warn = 1; 882 if (__kmp_tp_cached) { 883 __kmp_msg(kmp_ms_warning, 884 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 885 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 886 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 887 } else { 888 __kmp_msg(kmp_ms_warning, 889 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 890 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 891 } 892 } 893 } 894 } 895 896 #ifdef KMP_DEBUG 897 if (new_nthreads == 1) { 898 KC_TRACE(10, 899 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 900 "dead roots and rechecking; requested %d threads\n", 901 __kmp_get_gtid(), set_nthreads)); 902 } else { 903 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 904 " %d threads\n", 905 __kmp_get_gtid(), new_nthreads, set_nthreads)); 906 } 907 #endif // KMP_DEBUG 908 return new_nthreads; 909 } 910 911 /* Allocate threads from the thread pool and assign them to the new team. We are 912 assured that there are enough threads available, because we checked on that 913 earlier within critical section forkjoin */ 914 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 915 kmp_info_t *master_th, int master_gtid) { 916 int i; 917 int use_hot_team; 918 919 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 920 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 921 KMP_MB(); 922 923 /* first, let's setup the primary thread */ 924 master_th->th.th_info.ds.ds_tid = 0; 925 master_th->th.th_team = team; 926 master_th->th.th_team_nproc = team->t.t_nproc; 927 master_th->th.th_team_master = master_th; 928 master_th->th.th_team_serialized = FALSE; 929 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 930 931 /* make sure we are not the optimized hot team */ 932 #if KMP_NESTED_HOT_TEAMS 933 use_hot_team = 0; 934 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 935 if (hot_teams) { // hot teams array is not allocated if 936 // KMP_HOT_TEAMS_MAX_LEVEL=0 937 int level = team->t.t_active_level - 1; // index in array of hot teams 938 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
939 if (master_th->th.th_teams_size.nteams > 1) { 940 ++level; // level was not increased in teams construct for 941 // team_of_masters 942 } 943 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 944 master_th->th.th_teams_level == team->t.t_level) { 945 ++level; // level was not increased in teams construct for 946 // team_of_workers before the parallel 947 } // team->t.t_level will be increased inside parallel 948 } 949 if (level < __kmp_hot_teams_max_level) { 950 if (hot_teams[level].hot_team) { 951 // hot team has already been allocated for given level 952 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 953 use_hot_team = 1; // the team is ready to use 954 } else { 955 use_hot_team = 0; // AC: threads are not allocated yet 956 hot_teams[level].hot_team = team; // remember new hot team 957 hot_teams[level].hot_team_nth = team->t.t_nproc; 958 } 959 } else { 960 use_hot_team = 0; 961 } 962 } 963 #else 964 use_hot_team = team == root->r.r_hot_team; 965 #endif 966 if (!use_hot_team) { 967 968 /* install the primary thread */ 969 team->t.t_threads[0] = master_th; 970 __kmp_initialize_info(master_th, team, 0, master_gtid); 971 972 /* now, install the worker threads */ 973 for (i = 1; i < team->t.t_nproc; i++) { 974 975 /* fork or reallocate a new thread and install it in team */ 976 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 977 team->t.t_threads[i] = thr; 978 KMP_DEBUG_ASSERT(thr); 979 KMP_DEBUG_ASSERT(thr->th.th_team == team); 980 /* align team and thread arrived states */ 981 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 982 "T#%d(%d:%d) join =%llu, plain=%llu\n", 983 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 984 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 985 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 986 team->t.t_bar[bs_plain_barrier].b_arrived)); 987 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 988 thr->th.th_teams_level = master_th->th.th_teams_level; 989 thr->th.th_teams_size = master_th->th.th_teams_size; 990 { // Initialize threads' barrier data. 991 int b; 992 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 993 for (b = 0; b < bs_last_barrier; ++b) { 994 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 995 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 996 #if USE_DEBUGGER 997 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 998 #endif 999 } 1000 } 1001 } 1002 1003 #if KMP_AFFINITY_SUPPORTED 1004 __kmp_partition_places(team); 1005 #endif 1006 } 1007 1008 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1009 for (i = 0; i < team->t.t_nproc; i++) { 1010 kmp_info_t *thr = team->t.t_threads[i]; 1011 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1012 thr->th.th_prev_level != team->t.t_level) { 1013 team->t.t_display_affinity = 1; 1014 break; 1015 } 1016 } 1017 } 1018 1019 KMP_MB(); 1020 } 1021 1022 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1023 // Propagate any changes to the floating point control registers out to the team 1024 // We try to avoid unnecessary writes to the relevant cache line in the team 1025 // structure, so we don't make changes unless they are needed. 
1026 inline static void propagateFPControl(kmp_team_t *team) { 1027 if (__kmp_inherit_fp_control) { 1028 kmp_int16 x87_fpu_control_word; 1029 kmp_uint32 mxcsr; 1030 1031 // Get primary thread's values of FPU control flags (both X87 and vector) 1032 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1033 __kmp_store_mxcsr(&mxcsr); 1034 mxcsr &= KMP_X86_MXCSR_MASK; 1035 1036 // There is no point looking at t_fp_control_saved here. 1037 // If it is TRUE, we still have to update the values if they are different 1038 // from those we now have. If it is FALSE we didn't save anything yet, but 1039 // our objective is the same. We have to ensure that the values in the team 1040 // are the same as those we have. 1041 // So, this code achieves what we need whether or not t_fp_control_saved is 1042 // true. By checking whether the value needs updating we avoid unnecessary 1043 // writes that would put the cache-line into a written state, causing all 1044 // threads in the team to have to read it again. 1045 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1046 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1047 // Although we don't use this value, other code in the runtime wants to know 1048 // whether it should restore them. So we must ensure it is correct. 1049 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1050 } else { 1051 // Similarly here. Don't write to this cache-line in the team structure 1052 // unless we have to. 1053 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1054 } 1055 } 1056 1057 // Do the opposite, setting the hardware registers to the updated values from 1058 // the team. 1059 inline static void updateHWFPControl(kmp_team_t *team) { 1060 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1061 // Only reset the fp control regs if they have been changed in the team. 1062 // the parallel region that we are exiting. 1063 kmp_int16 x87_fpu_control_word; 1064 kmp_uint32 mxcsr; 1065 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1066 __kmp_store_mxcsr(&mxcsr); 1067 mxcsr &= KMP_X86_MXCSR_MASK; 1068 1069 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1070 __kmp_clear_x87_fpu_status_word(); 1071 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1072 } 1073 1074 if (team->t.t_mxcsr != mxcsr) { 1075 __kmp_load_mxcsr(&team->t.t_mxcsr); 1076 } 1077 } 1078 } 1079 #else 1080 #define propagateFPControl(x) ((void)0) 1081 #define updateHWFPControl(x) ((void)0) 1082 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1083 1084 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1085 int realloc); // forward declaration 1086 1087 /* Run a parallel region that has been serialized, so runs only in a team of the 1088 single primary thread. 
*/ 1089 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1090 kmp_info_t *this_thr; 1091 kmp_team_t *serial_team; 1092 1093 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1094 1095 /* Skip all this code for autopar serialized loops since it results in 1096 unacceptable overhead */ 1097 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1098 return; 1099 1100 if (!TCR_4(__kmp_init_parallel)) 1101 __kmp_parallel_initialize(); 1102 __kmp_resume_if_soft_paused(); 1103 1104 this_thr = __kmp_threads[global_tid]; 1105 serial_team = this_thr->th.th_serial_team; 1106 1107 /* utilize the serialized team held by this thread */ 1108 KMP_DEBUG_ASSERT(serial_team); 1109 KMP_MB(); 1110 1111 if (__kmp_tasking_mode != tskm_immediate_exec) { 1112 KMP_DEBUG_ASSERT( 1113 this_thr->th.th_task_team == 1114 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1115 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1116 NULL); 1117 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1118 "team %p, new task_team = NULL\n", 1119 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1120 this_thr->th.th_task_team = NULL; 1121 } 1122 1123 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1124 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1125 proc_bind = proc_bind_false; 1126 } else if (proc_bind == proc_bind_default) { 1127 // No proc_bind clause was specified, so use the current value 1128 // of proc-bind-var for this parallel region. 1129 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1130 } 1131 // Reset for next parallel region 1132 this_thr->th.th_set_proc_bind = proc_bind_default; 1133 1134 #if OMPT_SUPPORT 1135 ompt_data_t ompt_parallel_data = ompt_data_none; 1136 ompt_data_t *implicit_task_data; 1137 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1138 if (ompt_enabled.enabled && 1139 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1140 1141 ompt_task_info_t *parent_task_info; 1142 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1143 1144 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1145 if (ompt_enabled.ompt_callback_parallel_begin) { 1146 int team_size = 1; 1147 1148 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1149 &(parent_task_info->task_data), &(parent_task_info->frame), 1150 &ompt_parallel_data, team_size, 1151 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1152 } 1153 } 1154 #endif // OMPT_SUPPORT 1155 1156 if (this_thr->th.th_team != serial_team) { 1157 // Nested level will be an index in the nested nthreads array 1158 int level = this_thr->th.th_team->t.t_level; 1159 1160 if (serial_team->t.t_serialized) { 1161 /* this serial team was already used 1162 TODO increase performance by making this locks more specific */ 1163 kmp_team_t *new_team; 1164 1165 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1166 1167 new_team = 1168 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1169 #if OMPT_SUPPORT 1170 ompt_parallel_data, 1171 #endif 1172 proc_bind, &this_thr->th.th_current_task->td_icvs, 1173 0 USE_NESTED_HOT_ARG(NULL)); 1174 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1175 KMP_ASSERT(new_team); 1176 1177 /* setup new serialized team and install it */ 1178 new_team->t.t_threads[0] = this_thr; 1179 new_team->t.t_parent = this_thr->th.th_team; 1180 serial_team = new_team; 1181 this_thr->th.th_serial_team = serial_team; 1182 1183 KF_TRACE( 1184 10, 1185 
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1186 global_tid, serial_team)); 1187 1188 /* TODO the above breaks the requirement that if we run out of resources, 1189 then we can still guarantee that serialized teams are ok, since we may 1190 need to allocate a new one */ 1191 } else { 1192 KF_TRACE( 1193 10, 1194 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1195 global_tid, serial_team)); 1196 } 1197 1198 /* we have to initialize this serial team */ 1199 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1200 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1201 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1202 serial_team->t.t_ident = loc; 1203 serial_team->t.t_serialized = 1; 1204 serial_team->t.t_nproc = 1; 1205 serial_team->t.t_parent = this_thr->th.th_team; 1206 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1207 this_thr->th.th_team = serial_team; 1208 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1209 1210 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1211 this_thr->th.th_current_task)); 1212 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1213 this_thr->th.th_current_task->td_flags.executing = 0; 1214 1215 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1216 1217 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1218 implicit task for each serialized task represented by 1219 team->t.t_serialized? */ 1220 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1221 &this_thr->th.th_current_task->td_parent->td_icvs); 1222 1223 // Thread value exists in the nested nthreads array for the next nested 1224 // level 1225 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1226 this_thr->th.th_current_task->td_icvs.nproc = 1227 __kmp_nested_nth.nth[level + 1]; 1228 } 1229 1230 if (__kmp_nested_proc_bind.used && 1231 (level + 1 < __kmp_nested_proc_bind.used)) { 1232 this_thr->th.th_current_task->td_icvs.proc_bind = 1233 __kmp_nested_proc_bind.bind_types[level + 1]; 1234 } 1235 1236 #if USE_DEBUGGER 1237 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1238 #endif 1239 this_thr->th.th_info.ds.ds_tid = 0; 1240 1241 /* set thread cache values */ 1242 this_thr->th.th_team_nproc = 1; 1243 this_thr->th.th_team_master = this_thr; 1244 this_thr->th.th_team_serialized = 1; 1245 1246 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1247 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1248 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1249 1250 propagateFPControl(serial_team); 1251 1252 /* check if we need to allocate dispatch buffers stack */ 1253 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1254 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1255 serial_team->t.t_dispatch->th_disp_buffer = 1256 (dispatch_private_info_t *)__kmp_allocate( 1257 sizeof(dispatch_private_info_t)); 1258 } 1259 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1260 1261 KMP_MB(); 1262 1263 } else { 1264 /* this serialized team is already being used, 1265 * that's fine, just add another nested level */ 1266 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1267 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1268 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1269 ++serial_team->t.t_serialized; 1270 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1271 1272 // Nested level will be an index in the nested nthreads array 1273 int level = this_thr->th.th_team->t.t_level; 1274 // Thread value exists in the nested nthreads array for the next nested 1275 // level 1276 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1277 this_thr->th.th_current_task->td_icvs.nproc = 1278 __kmp_nested_nth.nth[level + 1]; 1279 } 1280 serial_team->t.t_level++; 1281 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1282 "of serial team %p to %d\n", 1283 global_tid, serial_team, serial_team->t.t_level)); 1284 1285 /* allocate/push dispatch buffers stack */ 1286 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1287 { 1288 dispatch_private_info_t *disp_buffer = 1289 (dispatch_private_info_t *)__kmp_allocate( 1290 sizeof(dispatch_private_info_t)); 1291 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1292 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1293 } 1294 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1295 1296 KMP_MB(); 1297 } 1298 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1299 1300 // Perform the display affinity functionality for 1301 // serialized parallel regions 1302 if (__kmp_display_affinity) { 1303 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1304 this_thr->th.th_prev_num_threads != 1) { 1305 // NULL means use the affinity-format-var ICV 1306 __kmp_aux_display_affinity(global_tid, NULL); 1307 this_thr->th.th_prev_level = serial_team->t.t_level; 1308 this_thr->th.th_prev_num_threads = 1; 1309 } 1310 } 1311 1312 if (__kmp_env_consistency_check) 1313 __kmp_push_parallel(global_tid, NULL); 1314 #if OMPT_SUPPORT 1315 serial_team->t.ompt_team_info.master_return_address = codeptr; 1316 if (ompt_enabled.enabled && 1317 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1318 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1319 OMPT_GET_FRAME_ADDRESS(0); 1320 1321 ompt_lw_taskteam_t lw_taskteam; 1322 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1323 &ompt_parallel_data, codeptr); 1324 1325 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1326 // don't use lw_taskteam after linking. 
content was swaped 1327 1328 /* OMPT implicit task begin */ 1329 implicit_task_data = OMPT_CUR_TASK_DATA(this_thr); 1330 if (ompt_enabled.ompt_callback_implicit_task) { 1331 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1332 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1333 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1334 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1335 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1336 __kmp_tid_from_gtid(global_tid); 1337 } 1338 1339 /* OMPT state */ 1340 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1341 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1342 OMPT_GET_FRAME_ADDRESS(0); 1343 } 1344 #endif 1345 } 1346 1347 /* most of the work for a fork */ 1348 /* return true if we really went parallel, false if serialized */ 1349 int __kmp_fork_call(ident_t *loc, int gtid, 1350 enum fork_context_e call_context, // Intel, GNU, ... 1351 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1352 kmp_va_list ap) { 1353 void **argv; 1354 int i; 1355 int master_tid; 1356 int master_this_cons; 1357 kmp_team_t *team; 1358 kmp_team_t *parent_team; 1359 kmp_info_t *master_th; 1360 kmp_root_t *root; 1361 int nthreads; 1362 int master_active; 1363 int master_set_numthreads; 1364 int level; 1365 int active_level; 1366 int teams_level; 1367 #if KMP_NESTED_HOT_TEAMS 1368 kmp_hot_team_ptr_t **p_hot_teams; 1369 #endif 1370 { // KMP_TIME_BLOCK 1371 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1372 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1373 1374 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1375 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1376 /* Some systems prefer the stack for the root thread(s) to start with */ 1377 /* some gap from the parent stack to prevent false sharing. 
*/ 1378 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1379 /* These 2 lines below are so this does not get optimized out */ 1380 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1381 __kmp_stkpadding += (short)((kmp_int64)dummy); 1382 } 1383 1384 /* initialize if needed */ 1385 KMP_DEBUG_ASSERT( 1386 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1387 if (!TCR_4(__kmp_init_parallel)) 1388 __kmp_parallel_initialize(); 1389 __kmp_resume_if_soft_paused(); 1390 1391 /* setup current data */ 1392 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1393 // shutdown 1394 parent_team = master_th->th.th_team; 1395 master_tid = master_th->th.th_info.ds.ds_tid; 1396 master_this_cons = master_th->th.th_local.this_construct; 1397 root = master_th->th.th_root; 1398 master_active = root->r.r_active; 1399 master_set_numthreads = master_th->th.th_set_nproc; 1400 1401 #if OMPT_SUPPORT 1402 ompt_data_t ompt_parallel_data = ompt_data_none; 1403 ompt_data_t *parent_task_data; 1404 ompt_frame_t *ompt_frame; 1405 ompt_data_t *implicit_task_data; 1406 void *return_address = NULL; 1407 1408 if (ompt_enabled.enabled) { 1409 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1410 NULL, NULL); 1411 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1412 } 1413 #endif 1414 1415 // Nested level will be an index in the nested nthreads array 1416 level = parent_team->t.t_level; 1417 // used to launch non-serial teams even if nested is not allowed 1418 active_level = parent_team->t.t_active_level; 1419 // needed to check nesting inside the teams 1420 teams_level = master_th->th.th_teams_level; 1421 #if KMP_NESTED_HOT_TEAMS 1422 p_hot_teams = &master_th->th.th_hot_teams; 1423 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1424 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1425 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1426 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1427 // it is either actual or not needed (when active_level > 0) 1428 (*p_hot_teams)[0].hot_team_nth = 1; 1429 } 1430 #endif 1431 1432 #if OMPT_SUPPORT 1433 if (ompt_enabled.enabled) { 1434 if (ompt_enabled.ompt_callback_parallel_begin) { 1435 int team_size = master_set_numthreads 1436 ? master_set_numthreads 1437 : get__nproc_2(parent_team, master_tid); 1438 int flags = OMPT_INVOKER(call_context) | 1439 ((microtask == (microtask_t)__kmp_teams_master) 1440 ? ompt_parallel_league 1441 : ompt_parallel_team); 1442 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1443 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1444 return_address); 1445 } 1446 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1447 } 1448 #endif 1449 1450 master_th->th.th_ident = loc; 1451 1452 if (master_th->th.th_teams_microtask && ap && 1453 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1454 // AC: This is start of parallel that is nested inside teams construct. 1455 // The team is actual (hot), all workers are ready at the fork barrier. 1456 // No lock needed to initialize the team a bit, then free workers. 
1457 parent_team->t.t_ident = loc; 1458 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1459 parent_team->t.t_argc = argc; 1460 argv = (void **)parent_team->t.t_argv; 1461 for (i = argc - 1; i >= 0; --i) 1462 *argv++ = va_arg(kmp_va_deref(ap), void *); 1463 // Increment our nested depth levels, but not increase the serialization 1464 if (parent_team == master_th->th.th_serial_team) { 1465 // AC: we are in serialized parallel 1466 __kmpc_serialized_parallel(loc, gtid); 1467 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1468 1469 if (call_context == fork_context_gnu) { 1470 // AC: need to decrement t_serialized for enquiry functions to work 1471 // correctly, will restore at join time 1472 parent_team->t.t_serialized--; 1473 return TRUE; 1474 } 1475 1476 #if OMPD_SUPPORT 1477 parent_team->t.t_pkfn = microtask; 1478 #endif 1479 1480 #if OMPT_SUPPORT 1481 void *dummy; 1482 void **exit_frame_p; 1483 1484 ompt_lw_taskteam_t lw_taskteam; 1485 1486 if (ompt_enabled.enabled) { 1487 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1488 &ompt_parallel_data, return_address); 1489 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1490 1491 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1492 // don't use lw_taskteam after linking. content was swaped 1493 1494 /* OMPT implicit task begin */ 1495 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1496 if (ompt_enabled.ompt_callback_implicit_task) { 1497 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1498 __kmp_tid_from_gtid(gtid); 1499 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1500 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1501 implicit_task_data, 1, 1502 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1503 } 1504 1505 /* OMPT state */ 1506 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1507 } else { 1508 exit_frame_p = &dummy; 1509 } 1510 #endif 1511 // AC: need to decrement t_serialized for enquiry functions to work 1512 // correctly, will restore at join time 1513 parent_team->t.t_serialized--; 1514 1515 { 1516 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1517 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1518 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1519 #if OMPT_SUPPORT 1520 , 1521 exit_frame_p 1522 #endif 1523 ); 1524 } 1525 1526 #if OMPT_SUPPORT 1527 if (ompt_enabled.enabled) { 1528 *exit_frame_p = NULL; 1529 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1530 if (ompt_enabled.ompt_callback_implicit_task) { 1531 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1532 ompt_scope_end, NULL, implicit_task_data, 1, 1533 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1534 } 1535 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1536 __ompt_lw_taskteam_unlink(master_th); 1537 if (ompt_enabled.ompt_callback_parallel_end) { 1538 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1539 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1540 OMPT_INVOKER(call_context) | ompt_parallel_team, 1541 return_address); 1542 } 1543 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1544 } 1545 #endif 1546 return TRUE; 1547 } 1548 1549 parent_team->t.t_pkfn = microtask; 1550 parent_team->t.t_invoke = invoker; 1551 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1552 parent_team->t.t_active_level++; 1553 parent_team->t.t_level++; 1554 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1555 1556 #if OMPT_SUPPORT 1557 if (ompt_enabled.enabled) { 1558 ompt_lw_taskteam_t 
lw_taskteam; 1559 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1560 &ompt_parallel_data, return_address); 1561 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1562 } 1563 #endif 1564 1565 /* Change number of threads in the team if requested */ 1566 if (master_set_numthreads) { // The parallel has num_threads clause 1567 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1568 // AC: only can reduce number of threads dynamically, can't increase 1569 kmp_info_t **other_threads = parent_team->t.t_threads; 1570 parent_team->t.t_nproc = master_set_numthreads; 1571 for (i = 0; i < master_set_numthreads; ++i) { 1572 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1573 } 1574 // Keep extra threads hot in the team for possible next parallels 1575 } 1576 master_th->th.th_set_nproc = 0; 1577 } 1578 1579 #if USE_DEBUGGER 1580 if (__kmp_debugging) { // Let debugger override number of threads. 1581 int nth = __kmp_omp_num_threads(loc); 1582 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1583 master_set_numthreads = nth; 1584 } 1585 } 1586 #endif 1587 1588 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1589 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1590 KMP_ITT_DEBUG) && 1591 __kmp_forkjoin_frames_mode == 3 && 1592 parent_team->t.t_active_level == 1 // only report frames at level 1 1593 && master_th->th.th_teams_size.nteams == 1) { 1594 kmp_uint64 tmp_time = __itt_get_timestamp(); 1595 master_th->th.th_frame_time = tmp_time; 1596 parent_team->t.t_region_time = tmp_time; 1597 } 1598 if (__itt_stack_caller_create_ptr) { 1599 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1600 // create new stack stitching id before entering fork barrier 1601 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1602 } 1603 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1604 1605 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1606 "master_th=%p, gtid=%d\n", 1607 root, parent_team, master_th, gtid)); 1608 __kmp_internal_fork(loc, gtid, parent_team); 1609 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1610 "master_th=%p, gtid=%d\n", 1611 root, parent_team, master_th, gtid)); 1612 1613 if (call_context == fork_context_gnu) 1614 return TRUE; 1615 1616 /* Invoke microtask for PRIMARY thread */ 1617 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1618 parent_team->t.t_id, parent_team->t.t_pkfn)); 1619 1620 if (!parent_team->t.t_invoke(gtid)) { 1621 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1622 } 1623 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1624 parent_team->t.t_id, parent_team->t.t_pkfn)); 1625 KMP_MB(); /* Flush all pending memory write invalidates. */ 1626 1627 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1628 1629 return TRUE; 1630 } // Parallel closely nested in teams construct 1631 1632 #if KMP_DEBUG 1633 if (__kmp_tasking_mode != tskm_immediate_exec) { 1634 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1635 parent_team->t.t_task_team[master_th->th.th_task_state]); 1636 } 1637 #endif 1638 1639 int enter_teams = 0; 1640 if (parent_team->t.t_active_level >= 1641 master_th->th.th_current_task->td_icvs.max_active_levels) { 1642 nthreads = 1; 1643 } else { 1644 enter_teams = ((ap == NULL && active_level == 0) || 1645 (ap && teams_level > 0 && teams_level == level)); 1646 nthreads = 1647 master_set_numthreads 1648 ? 
master_set_numthreads 1649 : get__nproc_2( 1650 parent_team, 1651 master_tid); // TODO: get nproc directly from current task 1652 1653 // Check if we need to take forkjoin lock? (no need for serialized 1654 // parallel out of teams construct). This code moved here from 1655 // __kmp_reserve_threads() to speedup nested serialized parallels. 1656 if (nthreads > 1) { 1657 if ((get__max_active_levels(master_th) == 1 && 1658 (root->r.r_in_parallel && !enter_teams)) || 1659 (__kmp_library == library_serial)) { 1660 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1661 " threads\n", 1662 gtid, nthreads)); 1663 nthreads = 1; 1664 } 1665 } 1666 if (nthreads > 1) { 1667 /* determine how many new threads we can use */ 1668 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1669 /* AC: If we execute teams from parallel region (on host), then teams 1670 should be created but each can only have 1 thread if nesting is 1671 disabled. If teams called from serial region, then teams and their 1672 threads should be created regardless of the nesting setting. */ 1673 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1674 nthreads, enter_teams); 1675 if (nthreads == 1) { 1676 // Free lock for single thread execution here; for multi-thread 1677 // execution it will be freed later after team of threads created 1678 // and initialized 1679 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1680 } 1681 } 1682 } 1683 KMP_DEBUG_ASSERT(nthreads > 0); 1684 1685 // If we temporarily changed the set number of threads then restore it now 1686 master_th->th.th_set_nproc = 0; 1687 1688 /* create a serialized parallel region? */ 1689 if (nthreads == 1) { 1690 /* josh todo: hypothetical question: what do we do for OS X*? */ 1691 #if KMP_OS_LINUX && \ 1692 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1693 void *args[argc]; 1694 #else 1695 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1696 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1697 KMP_ARCH_AARCH64) */ 1698 1699 KA_TRACE(20, 1700 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1701 1702 __kmpc_serialized_parallel(loc, gtid); 1703 1704 #if OMPD_SUPPORT 1705 master_th->th.th_serial_team->t.t_pkfn = microtask; 1706 #endif 1707 1708 if (call_context == fork_context_intel) { 1709 /* TODO this sucks, use the compiler itself to pass args! :) */ 1710 master_th->th.th_serial_team->t.t_ident = loc; 1711 if (!ap) { 1712 // revert change made in __kmpc_serialized_parallel() 1713 master_th->th.th_serial_team->t.t_level--; 1714 // Get args from parent team for teams construct 1715 1716 #if OMPT_SUPPORT 1717 void *dummy; 1718 void **exit_frame_p; 1719 ompt_task_info_t *task_info; 1720 1721 ompt_lw_taskteam_t lw_taskteam; 1722 1723 if (ompt_enabled.enabled) { 1724 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1725 &ompt_parallel_data, return_address); 1726 1727 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1728 // don't use lw_taskteam after linking. 
content was swaped 1729 1730 task_info = OMPT_CUR_TASK_INFO(master_th); 1731 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1732 if (ompt_enabled.ompt_callback_implicit_task) { 1733 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1734 __kmp_tid_from_gtid(gtid); 1735 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1736 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1737 &(task_info->task_data), 1, 1738 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1739 ompt_task_implicit); 1740 } 1741 1742 /* OMPT state */ 1743 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1744 } else { 1745 exit_frame_p = &dummy; 1746 } 1747 #endif 1748 1749 { 1750 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1751 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1752 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1753 parent_team->t.t_argv 1754 #if OMPT_SUPPORT 1755 , 1756 exit_frame_p 1757 #endif 1758 ); 1759 } 1760 1761 #if OMPT_SUPPORT 1762 if (ompt_enabled.enabled) { 1763 *exit_frame_p = NULL; 1764 if (ompt_enabled.ompt_callback_implicit_task) { 1765 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1766 ompt_scope_end, NULL, &(task_info->task_data), 1, 1767 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1768 ompt_task_implicit); 1769 } 1770 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1771 __ompt_lw_taskteam_unlink(master_th); 1772 if (ompt_enabled.ompt_callback_parallel_end) { 1773 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1774 &ompt_parallel_data, parent_task_data, 1775 OMPT_INVOKER(call_context) | ompt_parallel_team, 1776 return_address); 1777 } 1778 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1779 } 1780 #endif 1781 } else if (microtask == (microtask_t)__kmp_teams_master) { 1782 KMP_DEBUG_ASSERT(master_th->th.th_team == 1783 master_th->th.th_serial_team); 1784 team = master_th->th.th_team; 1785 // team->t.t_pkfn = microtask; 1786 team->t.t_invoke = invoker; 1787 __kmp_alloc_argv_entries(argc, team, TRUE); 1788 team->t.t_argc = argc; 1789 argv = (void **)team->t.t_argv; 1790 if (ap) { 1791 for (i = argc - 1; i >= 0; --i) 1792 *argv++ = va_arg(kmp_va_deref(ap), void *); 1793 } else { 1794 for (i = 0; i < argc; ++i) 1795 // Get args from parent team for teams construct 1796 argv[i] = parent_team->t.t_argv[i]; 1797 } 1798 // AC: revert change made in __kmpc_serialized_parallel() 1799 // because initial code in teams should have level=0 1800 team->t.t_level--; 1801 // AC: call special invoker for outer "parallel" of teams construct 1802 invoker(gtid); 1803 #if OMPT_SUPPORT 1804 if (ompt_enabled.enabled) { 1805 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1806 if (ompt_enabled.ompt_callback_implicit_task) { 1807 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1808 ompt_scope_end, NULL, &(task_info->task_data), 0, 1809 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1810 } 1811 if (ompt_enabled.ompt_callback_parallel_end) { 1812 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1813 &ompt_parallel_data, parent_task_data, 1814 OMPT_INVOKER(call_context) | ompt_parallel_league, 1815 return_address); 1816 } 1817 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1818 } 1819 #endif 1820 } else { 1821 argv = args; 1822 for (i = argc - 1; i >= 0; --i) 1823 *argv++ = va_arg(kmp_va_deref(ap), void *); 1824 KMP_MB(); 1825 1826 #if OMPT_SUPPORT 1827 void *dummy; 1828 void **exit_frame_p; 1829 ompt_task_info_t *task_info; 1830 1831 ompt_lw_taskteam_t lw_taskteam; 1832 1833 if (ompt_enabled.enabled) { 1834 
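// Clarifying note (descriptive, based on the surrounding code): this is the serialized path, so the microtask is invoked directly on this thread below; the lightweight task team initialized here stands in for a real team so that OMPT callbacks still observe a properly nested parallel region and implicit task.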
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1835 &ompt_parallel_data, return_address); 1836 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1837 // don't use lw_taskteam after linking. content was swaped 1838 task_info = OMPT_CUR_TASK_INFO(master_th); 1839 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1840 1841 /* OMPT implicit task begin */ 1842 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1843 if (ompt_enabled.ompt_callback_implicit_task) { 1844 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1845 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1846 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1847 ompt_task_implicit); 1848 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1849 __kmp_tid_from_gtid(gtid); 1850 } 1851 1852 /* OMPT state */ 1853 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1854 } else { 1855 exit_frame_p = &dummy; 1856 } 1857 #endif 1858 1859 { 1860 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1861 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1862 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1863 #if OMPT_SUPPORT 1864 , 1865 exit_frame_p 1866 #endif 1867 ); 1868 } 1869 1870 #if OMPT_SUPPORT 1871 if (ompt_enabled.enabled) { 1872 *exit_frame_p = NULL; 1873 if (ompt_enabled.ompt_callback_implicit_task) { 1874 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1875 ompt_scope_end, NULL, &(task_info->task_data), 1, 1876 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1877 ompt_task_implicit); 1878 } 1879 1880 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1881 __ompt_lw_taskteam_unlink(master_th); 1882 if (ompt_enabled.ompt_callback_parallel_end) { 1883 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1884 &ompt_parallel_data, parent_task_data, 1885 OMPT_INVOKER(call_context) | ompt_parallel_team, 1886 return_address); 1887 } 1888 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1889 } 1890 #endif 1891 } 1892 } else if (call_context == fork_context_gnu) { 1893 #if OMPT_SUPPORT 1894 ompt_lw_taskteam_t lwt; 1895 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1896 return_address); 1897 1898 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1899 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1900 // don't use lw_taskteam after linking. 
content was swapped 1901 #endif 1902 1903 // we were called from GNU native code 1904 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1905 return FALSE; 1906 } else { 1907 KMP_ASSERT2(call_context < fork_context_last, 1908 "__kmp_fork_call: unknown fork_context parameter"); 1909 } 1910 1911 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1912 KMP_MB(); 1913 return FALSE; 1914 } // if (nthreads == 1) 1915 1916 // GEH: only modify the executing flag in the case when not serialized; 1917 // the serialized case is handled in __kmpc_serialized_parallel 1918 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1919 "curtask=%p, curtask_max_aclevel=%d\n", 1920 parent_team->t.t_active_level, master_th, 1921 master_th->th.th_current_task, 1922 master_th->th.th_current_task->td_icvs.max_active_levels)); 1923 // TODO: GEH - cannot do this assertion because root thread not set up as 1924 // executing 1925 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1926 master_th->th.th_current_task->td_flags.executing = 0; 1927 1928 if (!master_th->th.th_teams_microtask || level > teams_level) { 1929 /* Increment our nested depth level */ 1930 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1931 } 1932 1933 // See if we need to make a copy of the ICVs. 1934 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1935 if ((level + 1 < __kmp_nested_nth.used) && 1936 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1937 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1938 } else { 1939 nthreads_icv = 0; // don't update 1940 } 1941 1942 // Figure out the proc_bind_policy for the new team. 1943 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1944 kmp_proc_bind_t proc_bind_icv = 1945 proc_bind_default; // proc_bind_default means don't update 1946 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1947 proc_bind = proc_bind_false; 1948 } else { 1949 if (proc_bind == proc_bind_default) { 1950 // No proc_bind clause specified; use current proc-bind-var for this 1951 // parallel region 1952 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1953 } 1954 /* else: The proc_bind policy was specified explicitly on the parallel clause. 1955 This overrides proc-bind-var for this parallel region, but does not 1956 change proc-bind-var. */ 1957 // Figure the value of proc-bind-var for the child threads. 
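// Example (illustrative): with OMP_PROC_BIND=spread,close the nesting list
// __kmp_nested_proc_bind holds {spread, close}; for a level-0 parallel the
// check below selects 'close' as the proc-bind-var of the child threads,
// unless the current ICV is already 'close'.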
1958 if ((level + 1 < __kmp_nested_proc_bind.used) && 1959 (__kmp_nested_proc_bind.bind_types[level + 1] != 1960 master_th->th.th_current_task->td_icvs.proc_bind)) { 1961 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1962 } 1963 } 1964 1965 // Reset for next parallel region 1966 master_th->th.th_set_proc_bind = proc_bind_default; 1967 1968 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1969 kmp_internal_control_t new_icvs; 1970 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1971 new_icvs.next = NULL; 1972 if (nthreads_icv > 0) { 1973 new_icvs.nproc = nthreads_icv; 1974 } 1975 if (proc_bind_icv != proc_bind_default) { 1976 new_icvs.proc_bind = proc_bind_icv; 1977 } 1978 1979 /* allocate a new parallel team */ 1980 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1981 team = __kmp_allocate_team(root, nthreads, nthreads, 1982 #if OMPT_SUPPORT 1983 ompt_parallel_data, 1984 #endif 1985 proc_bind, &new_icvs, 1986 argc USE_NESTED_HOT_ARG(master_th)); 1987 } else { 1988 /* allocate a new parallel team */ 1989 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1990 team = __kmp_allocate_team(root, nthreads, nthreads, 1991 #if OMPT_SUPPORT 1992 ompt_parallel_data, 1993 #endif 1994 proc_bind, 1995 &master_th->th.th_current_task->td_icvs, 1996 argc USE_NESTED_HOT_ARG(master_th)); 1997 } 1998 KF_TRACE( 1999 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2000 2001 /* setup the new team */ 2002 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2003 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2004 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2005 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2006 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2007 #if OMPT_SUPPORT 2008 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2009 return_address); 2010 #endif 2011 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2012 // TODO: parent_team->t.t_level == INT_MAX ??? 2013 if (!master_th->th.th_teams_microtask || level > teams_level) { 2014 int new_level = parent_team->t.t_level + 1; 2015 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2016 new_level = parent_team->t.t_active_level + 1; 2017 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2018 } else { 2019 // AC: Do not increase parallel level at start of the teams construct 2020 int new_level = parent_team->t.t_level; 2021 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2022 new_level = parent_team->t.t_active_level; 2023 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2024 } 2025 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2026 // set primary thread's schedule as new run-time schedule 2027 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2028 2029 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2030 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2031 2032 // Update the floating point rounding in the team if required. 2033 propagateFPControl(team); 2034 #if OMPD_SUPPORT 2035 if (ompd_state & OMPD_ENABLE_BP) 2036 ompd_bp_parallel_begin(); 2037 #endif 2038 2039 if (__kmp_tasking_mode != tskm_immediate_exec) { 2040 // Set primary thread's task team to team's task team. Unless this is hot 2041 // team, it should be NULL. 
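// The block below records the primary thread's task_state on its memo stack
// (the stack is grown by doubling when full) so that the nested parallel can
// start with its own task_state and the outer value can be restored at join.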
2042 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2043 parent_team->t.t_task_team[master_th->th.th_task_state]); 2044 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2045 "%p, new task_team %p / team %p\n", 2046 __kmp_gtid_from_thread(master_th), 2047 master_th->th.th_task_team, parent_team, 2048 team->t.t_task_team[master_th->th.th_task_state], team)); 2049 2050 if (active_level || master_th->th.th_task_team) { 2051 // Take a memo of primary thread's task_state 2052 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2053 if (master_th->th.th_task_state_top >= 2054 master_th->th.th_task_state_stack_sz) { // increase size 2055 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2056 kmp_uint8 *old_stack, *new_stack; 2057 kmp_uint32 i; 2058 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2059 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2060 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2061 } 2062 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2063 ++i) { // zero-init rest of stack 2064 new_stack[i] = 0; 2065 } 2066 old_stack = master_th->th.th_task_state_memo_stack; 2067 master_th->th.th_task_state_memo_stack = new_stack; 2068 master_th->th.th_task_state_stack_sz = new_size; 2069 __kmp_free(old_stack); 2070 } 2071 // Store primary thread's task_state on stack 2072 master_th->th 2073 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2074 master_th->th.th_task_state; 2075 master_th->th.th_task_state_top++; 2076 #if KMP_NESTED_HOT_TEAMS 2077 if (master_th->th.th_hot_teams && 2078 active_level < __kmp_hot_teams_max_level && 2079 team == master_th->th.th_hot_teams[active_level].hot_team) { 2080 // Restore primary thread's nested state if nested hot team 2081 master_th->th.th_task_state = 2082 master_th->th 2083 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2084 } else { 2085 #endif 2086 master_th->th.th_task_state = 0; 2087 #if KMP_NESTED_HOT_TEAMS 2088 } 2089 #endif 2090 } 2091 #if !KMP_NESTED_HOT_TEAMS 2092 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2093 (team == root->r.r_hot_team)); 2094 #endif 2095 } 2096 2097 KA_TRACE( 2098 20, 2099 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2100 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2101 team->t.t_nproc)); 2102 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2103 (team->t.t_master_tid == 0 && 2104 (team->t.t_parent == root->r.r_root_team || 2105 team->t.t_parent->t.t_serialized))); 2106 KMP_MB(); 2107 2108 /* now, setup the arguments */ 2109 argv = (void **)team->t.t_argv; 2110 if (ap) { 2111 for (i = argc - 1; i >= 0; --i) { 2112 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2113 KMP_CHECK_UPDATE(*argv, new_argv); 2114 argv++; 2115 } 2116 } else { 2117 for (i = 0; i < argc; ++i) { 2118 // Get args from parent team for teams construct 2119 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2120 } 2121 } 2122 2123 /* now actually fork the threads */ 2124 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2125 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2126 root->r.r_active = TRUE; 2127 2128 __kmp_fork_team_threads(root, team, master_th, gtid); 2129 __kmp_setup_icv_copy(team, nthreads, 2130 &master_th->th.th_current_task->td_icvs, loc); 2131 2132 #if OMPT_SUPPORT 2133 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2134 #endif 2135 2136 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2137 2138 #if 
USE_ITT_BUILD 2139 if (team->t.t_active_level == 1 // only report frames at level 1 2140 && !master_th->th.th_teams_microtask) { // not in teams construct 2141 #if USE_ITT_NOTIFY 2142 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2143 (__kmp_forkjoin_frames_mode == 3 || 2144 __kmp_forkjoin_frames_mode == 1)) { 2145 kmp_uint64 tmp_time = 0; 2146 if (__itt_get_timestamp_ptr) 2147 tmp_time = __itt_get_timestamp(); 2148 // Internal fork - report frame begin 2149 master_th->th.th_frame_time = tmp_time; 2150 if (__kmp_forkjoin_frames_mode == 3) 2151 team->t.t_region_time = tmp_time; 2152 } else 2153 // only one notification scheme (either "submit" or "forking/joined", not both) 2154 #endif /* USE_ITT_NOTIFY */ 2155 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2156 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2157 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2158 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2159 } 2160 } 2161 #endif /* USE_ITT_BUILD */ 2162 2163 /* now go on and do the work */ 2164 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2165 KMP_MB(); 2166 KF_TRACE(10, 2167 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2168 root, team, master_th, gtid)); 2169 2170 #if USE_ITT_BUILD 2171 if (__itt_stack_caller_create_ptr) { 2172 // create new stack stitching id before entering fork barrier 2173 if (!enter_teams) { 2174 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2175 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2176 } else if (parent_team->t.t_serialized) { 2177 // keep stack stitching id in the serialized parent_team; 2178 // current team will be used for parallel inside the teams; 2179 // if parent_team is active, then it already keeps stack stitching id 2180 // for the league of teams 2181 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2182 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2183 } 2184 } 2185 #endif /* USE_ITT_BUILD */ 2186 2187 // AC: skip __kmp_internal_fork at teams construct, let only primary 2188 // threads execute 2189 if (ap) { 2190 __kmp_internal_fork(loc, gtid, team); 2191 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2192 "master_th=%p, gtid=%d\n", 2193 root, team, master_th, gtid)); 2194 } 2195 2196 if (call_context == fork_context_gnu) { 2197 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2198 return TRUE; 2199 } 2200 2201 /* Invoke microtask for PRIMARY thread */ 2202 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2203 team->t.t_id, team->t.t_pkfn)); 2204 } // END of timer KMP_fork_call block 2205 2206 #if KMP_STATS_ENABLED 2207 // If beginning a teams construct, then change thread state 2208 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2209 if (!ap) { 2210 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2211 } 2212 #endif 2213 2214 if (!team->t.t_invoke(gtid)) { 2215 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2216 } 2217 2218 #if KMP_STATS_ENABLED 2219 // If was beginning of a teams construct, then reset thread state 2220 if (!ap) { 2221 KMP_SET_THREAD_STATE(previous_state); 2222 } 2223 #endif 2224 2225 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2226 team->t.t_id, team->t.t_pkfn)); 2227 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2228 2229 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2230 #if OMPT_SUPPORT 2231 if (ompt_enabled.enabled) { 2232 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2233 } 2234 #endif 2235 2236 return TRUE; 2237 } 2238 2239 #if OMPT_SUPPORT 2240 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2241 kmp_team_t *team) { 2242 // restore state outside the region 2243 thread->th.ompt_thread_info.state = 2244 ((team->t.t_serialized) ? ompt_state_work_serial 2245 : ompt_state_work_parallel); 2246 } 2247 2248 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2249 kmp_team_t *team, ompt_data_t *parallel_data, 2250 int flags, void *codeptr) { 2251 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2252 if (ompt_enabled.ompt_callback_parallel_end) { 2253 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2254 parallel_data, &(task_info->task_data), flags, codeptr); 2255 } 2256 2257 task_info->frame.enter_frame = ompt_data_none; 2258 __kmp_join_restore_state(thread, team); 2259 } 2260 #endif 2261 2262 void __kmp_join_call(ident_t *loc, int gtid 2263 #if OMPT_SUPPORT 2264 , 2265 enum fork_context_e fork_context 2266 #endif 2267 , 2268 int exit_teams) { 2269 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2270 kmp_team_t *team; 2271 kmp_team_t *parent_team; 2272 kmp_info_t *master_th; 2273 kmp_root_t *root; 2274 int master_active; 2275 2276 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2277 2278 /* setup current data */ 2279 master_th = __kmp_threads[gtid]; 2280 root = master_th->th.th_root; 2281 team = master_th->th.th_team; 2282 parent_team = team->t.t_parent; 2283 2284 master_th->th.th_ident = loc; 2285 2286 #if OMPT_SUPPORT 2287 void *team_microtask = (void *)team->t.t_pkfn; 2288 // For GOMP interface with serialized parallel, need the 2289 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2290 // and end-parallel events. 
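// The remainder of __kmp_join_call undoes __kmp_fork_call: serialized teams
// are finished via __kmpc_end_serialized_parallel below; otherwise the join
// barrier runs (unless exiting a teams construct), OMPT/ITT end events are
// emitted, the primary thread's team state and task team are restored, and
// the worker team is released.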
2291 if (ompt_enabled.enabled && 2292 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2293 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2294 } 2295 #endif 2296 2297 #if KMP_DEBUG 2298 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2299 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2300 "th_task_team = %p\n", 2301 __kmp_gtid_from_thread(master_th), team, 2302 team->t.t_task_team[master_th->th.th_task_state], 2303 master_th->th.th_task_team)); 2304 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2305 team->t.t_task_team[master_th->th.th_task_state]); 2306 } 2307 #endif 2308 2309 if (team->t.t_serialized) { 2310 if (master_th->th.th_teams_microtask) { 2311 // We are in teams construct 2312 int level = team->t.t_level; 2313 int tlevel = master_th->th.th_teams_level; 2314 if (level == tlevel) { 2315 // AC: we haven't incremented it earlier at start of teams construct, 2316 // so do it here - at the end of teams construct 2317 team->t.t_level++; 2318 } else if (level == tlevel + 1) { 2319 // AC: we are exiting parallel inside teams, need to increment 2320 // serialization in order to restore it in the next call to 2321 // __kmpc_end_serialized_parallel 2322 team->t.t_serialized++; 2323 } 2324 } 2325 __kmpc_end_serialized_parallel(loc, gtid); 2326 2327 #if OMPT_SUPPORT 2328 if (ompt_enabled.enabled) { 2329 __kmp_join_restore_state(master_th, parent_team); 2330 } 2331 #endif 2332 2333 return; 2334 } 2335 2336 master_active = team->t.t_master_active; 2337 2338 if (!exit_teams) { 2339 // AC: No barrier for internal teams at exit from teams construct. 2340 // But there is barrier for external team (league). 2341 __kmp_internal_join(loc, gtid, team); 2342 #if USE_ITT_BUILD 2343 if (__itt_stack_caller_create_ptr) { 2344 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2345 // destroy the stack stitching id after join barrier 2346 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2347 team->t.t_stack_id = NULL; 2348 } 2349 #endif 2350 } else { 2351 master_th->th.th_task_state = 2352 0; // AC: no tasking in teams (out of any parallel) 2353 #if USE_ITT_BUILD 2354 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2355 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2356 // destroy the stack stitching id on exit from the teams construct 2357 // if parent_team is active, then the id will be destroyed later on 2358 // by master of the league of teams 2359 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2360 parent_team->t.t_stack_id = NULL; 2361 } 2362 #endif 2363 } 2364 2365 KMP_MB(); 2366 2367 #if OMPT_SUPPORT 2368 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2369 void *codeptr = team->t.ompt_team_info.master_return_address; 2370 #endif 2371 2372 #if USE_ITT_BUILD 2373 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
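// Frame reporting depends on __kmp_forkjoin_frames_mode: in mode 3 the region
// time recorded at fork is submitted to ITT as a frame here, while mode 0
// (with __kmp_forkjoin_frames set) only emits the region-joined notification.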
2374 if (team->t.t_active_level == 1 && 2375 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2376 master_th->th.th_teams_size.nteams == 1)) { 2377 master_th->th.th_ident = loc; 2378 // only one notification scheme (either "submit" or "forking/joined", not 2379 // both) 2380 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2381 __kmp_forkjoin_frames_mode == 3) 2382 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2383 master_th->th.th_frame_time, 0, loc, 2384 master_th->th.th_team_nproc, 1); 2385 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2386 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2387 __kmp_itt_region_joined(gtid); 2388 } // active_level == 1 2389 #endif /* USE_ITT_BUILD */ 2390 2391 if (master_th->th.th_teams_microtask && !exit_teams && 2392 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2393 team->t.t_level == master_th->th.th_teams_level + 1) { 2394 // AC: We need to leave the team structure intact at the end of parallel 2395 // inside the teams construct, so that at the next parallel same (hot) team 2396 // works, only adjust nesting levels 2397 #if OMPT_SUPPORT 2398 ompt_data_t ompt_parallel_data = ompt_data_none; 2399 if (ompt_enabled.enabled) { 2400 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2401 if (ompt_enabled.ompt_callback_implicit_task) { 2402 int ompt_team_size = team->t.t_nproc; 2403 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2404 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2405 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2406 } 2407 task_info->frame.exit_frame = ompt_data_none; 2408 task_info->task_data = ompt_data_none; 2409 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2410 __ompt_lw_taskteam_unlink(master_th); 2411 } 2412 #endif 2413 /* Decrement our nested depth level */ 2414 team->t.t_level--; 2415 team->t.t_active_level--; 2416 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2417 2418 // Restore number of threads in the team if needed. This code relies on 2419 // the proper adjustment of th_teams_size.nth after the fork in 2420 // __kmp_teams_master on each teams primary thread in the case that 2421 // __kmp_reserve_threads reduced it. 2422 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2423 int old_num = master_th->th.th_team_nproc; 2424 int new_num = master_th->th.th_teams_size.nth; 2425 kmp_info_t **other_threads = team->t.t_threads; 2426 team->t.t_nproc = new_num; 2427 for (int i = 0; i < old_num; ++i) { 2428 other_threads[i]->th.th_team_nproc = new_num; 2429 } 2430 // Adjust states of non-used threads of the team 2431 for (int i = old_num; i < new_num; ++i) { 2432 // Re-initialize thread's barrier data. 
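// Copy the team's current arrival counters into the re-activated thread's
// barrier data so it enters the next barrier consistently with the team.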
2433 KMP_DEBUG_ASSERT(other_threads[i]); 2434 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2435 for (int b = 0; b < bs_last_barrier; ++b) { 2436 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2437 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2438 #if USE_DEBUGGER 2439 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2440 #endif 2441 } 2442 if (__kmp_tasking_mode != tskm_immediate_exec) { 2443 // Synchronize thread's task state 2444 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2445 } 2446 } 2447 } 2448 2449 #if OMPT_SUPPORT 2450 if (ompt_enabled.enabled) { 2451 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2452 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2453 } 2454 #endif 2455 2456 return; 2457 } 2458 2459 /* do cleanup and restore the parent team */ 2460 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2461 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2462 2463 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2464 2465 /* jc: The following lock has instructions with REL and ACQ semantics, 2466 separating the parallel user code called in this parallel region 2467 from the serial user code called after this function returns. */ 2468 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2469 2470 if (!master_th->th.th_teams_microtask || 2471 team->t.t_level > master_th->th.th_teams_level) { 2472 /* Decrement our nested depth level */ 2473 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2474 } 2475 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2476 2477 #if OMPT_SUPPORT 2478 if (ompt_enabled.enabled) { 2479 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2480 if (ompt_enabled.ompt_callback_implicit_task) { 2481 int flags = (team_microtask == (void *)__kmp_teams_master) 2482 ? ompt_task_initial 2483 : ompt_task_implicit; 2484 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2485 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2486 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2487 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2488 } 2489 task_info->frame.exit_frame = ompt_data_none; 2490 task_info->task_data = ompt_data_none; 2491 } 2492 #endif 2493 2494 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2495 master_th, team)); 2496 __kmp_pop_current_task_from_thread(master_th); 2497 2498 #if KMP_AFFINITY_SUPPORTED 2499 // Restore master thread's partition. 2500 master_th->th.th_first_place = team->t.t_first_place; 2501 master_th->th.th_last_place = team->t.t_last_place; 2502 #endif // KMP_AFFINITY_SUPPORTED 2503 master_th->th.th_def_allocator = team->t.t_def_allocator; 2504 2505 #if OMPD_SUPPORT 2506 if (ompd_state & OMPD_ENABLE_BP) 2507 ompd_bp_parallel_end(); 2508 #endif 2509 updateHWFPControl(team); 2510 2511 if (root->r.r_active != master_active) 2512 root->r.r_active = master_active; 2513 2514 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2515 master_th)); // this will free worker threads 2516 2517 /* this race was fun to find. make sure the following is in the critical 2518 region otherwise assertions may fail occasionally since the old team may be 2519 reallocated and the hierarchy appears inconsistent. it is actually safe to 2520 run and won't cause any bugs, but will cause those assertion failures. 
it's 2521 only one deref&assign so might as well put this in the critical region */ 2522 master_th->th.th_team = parent_team; 2523 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2524 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2525 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2526 2527 /* restore serialized team, if need be */ 2528 if (parent_team->t.t_serialized && 2529 parent_team != master_th->th.th_serial_team && 2530 parent_team != root->r.r_root_team) { 2531 __kmp_free_team(root, 2532 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2533 master_th->th.th_serial_team = parent_team; 2534 } 2535 2536 if (__kmp_tasking_mode != tskm_immediate_exec) { 2537 if (master_th->th.th_task_state_top > 2538 0) { // Restore task state from memo stack 2539 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2540 // Remember primary thread's state if we re-use this nested hot team 2541 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2542 master_th->th.th_task_state; 2543 --master_th->th.th_task_state_top; // pop 2544 // Now restore state at this level 2545 master_th->th.th_task_state = 2546 master_th->th 2547 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2548 } 2549 // Copy the task team from the parent team to the primary thread 2550 master_th->th.th_task_team = 2551 parent_team->t.t_task_team[master_th->th.th_task_state]; 2552 KA_TRACE(20, 2553 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2554 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2555 parent_team)); 2556 } 2557 2558 // TODO: GEH - cannot do this assertion because root thread not set up as 2559 // executing 2560 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2561 master_th->th.th_current_task->td_flags.executing = 1; 2562 2563 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2564 2565 #if OMPT_SUPPORT 2566 int flags = 2567 OMPT_INVOKER(fork_context) | 2568 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2569 : ompt_parallel_team); 2570 if (ompt_enabled.enabled) { 2571 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2572 codeptr); 2573 } 2574 #endif 2575 2576 KMP_MB(); 2577 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2578 } 2579 2580 /* Check whether we should push an internal control record onto the 2581 serial team stack. If so, do it. 
*/ 2582 void __kmp_save_internal_controls(kmp_info_t *thread) { 2583 2584 if (thread->th.th_team != thread->th.th_serial_team) { 2585 return; 2586 } 2587 if (thread->th.th_team->t.t_serialized > 1) { 2588 int push = 0; 2589 2590 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2591 push = 1; 2592 } else { 2593 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2594 thread->th.th_team->t.t_serialized) { 2595 push = 1; 2596 } 2597 } 2598 if (push) { /* push a record on the serial team's stack */ 2599 kmp_internal_control_t *control = 2600 (kmp_internal_control_t *)__kmp_allocate( 2601 sizeof(kmp_internal_control_t)); 2602 2603 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2604 2605 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2606 2607 control->next = thread->th.th_team->t.t_control_stack_top; 2608 thread->th.th_team->t.t_control_stack_top = control; 2609 } 2610 } 2611 } 2612 2613 /* Changes set_nproc */ 2614 void __kmp_set_num_threads(int new_nth, int gtid) { 2615 kmp_info_t *thread; 2616 kmp_root_t *root; 2617 2618 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2619 KMP_DEBUG_ASSERT(__kmp_init_serial); 2620 2621 if (new_nth < 1) 2622 new_nth = 1; 2623 else if (new_nth > __kmp_max_nth) 2624 new_nth = __kmp_max_nth; 2625 2626 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2627 thread = __kmp_threads[gtid]; 2628 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2629 return; // nothing to do 2630 2631 __kmp_save_internal_controls(thread); 2632 2633 set__nproc(thread, new_nth); 2634 2635 // If this omp_set_num_threads() call will cause the hot team size to be 2636 // reduced (in the absence of a num_threads clause), then reduce it now, 2637 // rather than waiting for the next parallel region. 2638 root = thread->th.th_root; 2639 if (__kmp_init_parallel && (!root->r.r_active) && 2640 (root->r.r_hot_team->t.t_nproc > new_nth) 2641 #if KMP_NESTED_HOT_TEAMS 2642 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2643 #endif 2644 ) { 2645 kmp_team_t *hot_team = root->r.r_hot_team; 2646 int f; 2647 2648 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2649 2650 // Release the extra threads we don't need any more. 2651 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2652 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2653 if (__kmp_tasking_mode != tskm_immediate_exec) { 2654 // When decreasing team size, threads no longer in the team should unref 2655 // task team. 2656 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2657 } 2658 __kmp_free_thread(hot_team->t.t_threads[f]); 2659 hot_team->t.t_threads[f] = NULL; 2660 } 2661 hot_team->t.t_nproc = new_nth; 2662 #if KMP_NESTED_HOT_TEAMS 2663 if (thread->th.th_hot_teams) { 2664 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2665 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2666 } 2667 #endif 2668 2669 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2670 2671 // Update the t_nproc field in the threads that are still active. 
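// Example (illustrative): omp_set_num_threads(2) with an idle hot team of 8
// releases threads 2..7 back to the pool above, sets t_nproc to 2, and the
// loop below updates th_team_nproc for the two threads that remain.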
2672 for (f = 0; f < new_nth; f++) { 2673 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2674 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2675 } 2676 // Special flag to mark that the size change came from an omp_set_num_threads() call 2677 hot_team->t.t_size_changed = -1; 2678 } 2679 } 2680 2681 /* Changes max_active_levels */ 2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2683 kmp_info_t *thread; 2684 2685 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2686 "%d = (%d)\n", 2687 gtid, max_active_levels)); 2688 KMP_DEBUG_ASSERT(__kmp_init_serial); 2689 2690 // validate max_active_levels 2691 if (max_active_levels < 0) { 2692 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2693 // We ignore this call if the user has specified a negative value. 2694 // The current setting won't be changed. The last valid setting will be 2695 // used. A warning will be issued (if warnings are allowed as controlled by 2696 // the KMP_WARNINGS env var). 2697 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2698 "max_active_levels for thread %d = (%d)\n", 2699 gtid, max_active_levels)); 2700 return; 2701 } 2702 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2703 // OK: max_active_levels is within the valid range [ 0; 2704 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]. 2705 // We allow a zero value. (implementation-defined behavior) 2706 } else { 2707 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2708 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2709 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2710 // Current upper limit is MAX_INT. (implementation-defined behavior) 2711 // If the input exceeds the upper limit, we clamp the input to the 2712 // upper limit. (implementation-defined behavior) 2713 // In practice, control should never reach here while the limit is MAX_INT. 
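// Example (illustrative): if KMP_MAX_ACTIVE_LEVELS_LIMIT were lowered to,
// say, 127, a call with max_active_levels == 1000 would take this branch,
// issue the warning, and clamp the value to 127.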
2714 } 2715 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2716 "max_active_levels for thread %d = (%d)\n", 2717 gtid, max_active_levels)); 2718 2719 thread = __kmp_threads[gtid]; 2720 2721 __kmp_save_internal_controls(thread); 2722 2723 set__max_active_levels(thread, max_active_levels); 2724 } 2725 2726 /* Gets max_active_levels */ 2727 int __kmp_get_max_active_levels(int gtid) { 2728 kmp_info_t *thread; 2729 2730 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2731 KMP_DEBUG_ASSERT(__kmp_init_serial); 2732 2733 thread = __kmp_threads[gtid]; 2734 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2735 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2736 "curtask_maxaclevel=%d\n", 2737 gtid, thread->th.th_current_task, 2738 thread->th.th_current_task->td_icvs.max_active_levels)); 2739 return thread->th.th_current_task->td_icvs.max_active_levels; 2740 } 2741 2742 // nteams-var per-device ICV 2743 void __kmp_set_num_teams(int num_teams) { 2744 if (num_teams > 0) 2745 __kmp_nteams = num_teams; 2746 } 2747 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2748 // teams-thread-limit-var per-device ICV 2749 void __kmp_set_teams_thread_limit(int limit) { 2750 if (limit > 0) 2751 __kmp_teams_thread_limit = limit; 2752 } 2753 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2754 2755 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2756 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2757 2758 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2759 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2760 kmp_info_t *thread; 2761 kmp_sched_t orig_kind; 2762 // kmp_team_t *team; 2763 2764 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2765 gtid, (int)kind, chunk)); 2766 KMP_DEBUG_ASSERT(__kmp_init_serial); 2767 2768 // Check if the kind parameter is valid, correct if needed. 2769 // Valid parameters should fit in one of two intervals - standard or extended: 2770 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2771 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2772 orig_kind = kind; 2773 kind = __kmp_sched_without_mods(kind); 2774 2775 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2776 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2777 // TODO: Hint needs attention in case we change the default schedule. 2778 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2779 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2780 __kmp_msg_null); 2781 kind = kmp_sched_default; 2782 chunk = 0; // ignore chunk value in case of bad kind 2783 } 2784 2785 thread = __kmp_threads[gtid]; 2786 2787 __kmp_save_internal_controls(thread); 2788 2789 if (kind < kmp_sched_upper_std) { 2790 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2791 // differ static chunked vs. 
unchunked: chunk should be invalid to 2792 // indicate unchunked schedule (which is the default) 2793 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2794 } else { 2795 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2796 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2797 } 2798 } else { 2799 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2800 // kmp_sched_lower - 2 ]; 2801 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2802 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2803 kmp_sched_lower - 2]; 2804 } 2805 __kmp_sched_apply_mods_intkind( 2806 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2807 if (kind == kmp_sched_auto || chunk < 1) { 2808 // ignore parameter chunk for schedule auto 2809 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2810 } else { 2811 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2812 } 2813 } 2814 2815 /* Gets def_sched_var ICV values */ 2816 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2817 kmp_info_t *thread; 2818 enum sched_type th_type; 2819 2820 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2821 KMP_DEBUG_ASSERT(__kmp_init_serial); 2822 2823 thread = __kmp_threads[gtid]; 2824 2825 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2826 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2827 case kmp_sch_static: 2828 case kmp_sch_static_greedy: 2829 case kmp_sch_static_balanced: 2830 *kind = kmp_sched_static; 2831 __kmp_sched_apply_mods_stdkind(kind, th_type); 2832 *chunk = 0; // chunk was not set, try to show this fact via zero value 2833 return; 2834 case kmp_sch_static_chunked: 2835 *kind = kmp_sched_static; 2836 break; 2837 case kmp_sch_dynamic_chunked: 2838 *kind = kmp_sched_dynamic; 2839 break; 2840 case kmp_sch_guided_chunked: 2841 case kmp_sch_guided_iterative_chunked: 2842 case kmp_sch_guided_analytical_chunked: 2843 *kind = kmp_sched_guided; 2844 break; 2845 case kmp_sch_auto: 2846 *kind = kmp_sched_auto; 2847 break; 2848 case kmp_sch_trapezoidal: 2849 *kind = kmp_sched_trapezoidal; 2850 break; 2851 #if KMP_STATIC_STEAL_ENABLED 2852 case kmp_sch_static_steal: 2853 *kind = kmp_sched_static_steal; 2854 break; 2855 #endif 2856 default: 2857 KMP_FATAL(UnknownSchedulingType, th_type); 2858 } 2859 2860 __kmp_sched_apply_mods_stdkind(kind, th_type); 2861 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2862 } 2863 2864 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2865 2866 int ii, dd; 2867 kmp_team_t *team; 2868 kmp_info_t *thr; 2869 2870 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2871 KMP_DEBUG_ASSERT(__kmp_init_serial); 2872 2873 // validate level 2874 if (level == 0) 2875 return 0; 2876 if (level < 0) 2877 return -1; 2878 thr = __kmp_threads[gtid]; 2879 team = thr->th.th_team; 2880 ii = team->t.t_level; 2881 if (level > ii) 2882 return -1; 2883 2884 if (thr->th.th_teams_microtask) { 2885 // AC: we are in teams region where multiple nested teams have same level 2886 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2887 if (level <= 2888 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2889 KMP_DEBUG_ASSERT(ii >= tlevel); 2890 // AC: As we need to pass by the teams league, we need to artificially 2891 // increase ii 2892 if (ii == tlevel) { 2893 ii += 2; // three teams have same level 2894 } else { 2895 ii++; // two teams have same level 2896 } 2897 } 2898 } 2899 2900 if (ii == 
level) 2901 return __kmp_tid_from_gtid(gtid); 2902 2903 dd = team->t.t_serialized; 2904 level++; 2905 while (ii > level) { 2906 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2907 } 2908 if ((team->t.t_serialized) && (!dd)) { 2909 team = team->t.t_parent; 2910 continue; 2911 } 2912 if (ii > level) { 2913 team = team->t.t_parent; 2914 dd = team->t.t_serialized; 2915 ii--; 2916 } 2917 } 2918 2919 return (dd > 1) ? (0) : (team->t.t_master_tid); 2920 } 2921 2922 int __kmp_get_team_size(int gtid, int level) { 2923 2924 int ii, dd; 2925 kmp_team_t *team; 2926 kmp_info_t *thr; 2927 2928 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2929 KMP_DEBUG_ASSERT(__kmp_init_serial); 2930 2931 // validate level 2932 if (level == 0) 2933 return 1; 2934 if (level < 0) 2935 return -1; 2936 thr = __kmp_threads[gtid]; 2937 team = thr->th.th_team; 2938 ii = team->t.t_level; 2939 if (level > ii) 2940 return -1; 2941 2942 if (thr->th.th_teams_microtask) { 2943 // AC: we are in teams region where multiple nested teams have same level 2944 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2945 if (level <= 2946 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2947 KMP_DEBUG_ASSERT(ii >= tlevel); 2948 // AC: As we need to pass by the teams league, we need to artificially 2949 // increase ii 2950 if (ii == tlevel) { 2951 ii += 2; // three teams have same level 2952 } else { 2953 ii++; // two teams have same level 2954 } 2955 } 2956 } 2957 2958 while (ii > level) { 2959 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2960 } 2961 if (team->t.t_serialized && (!dd)) { 2962 team = team->t.t_parent; 2963 continue; 2964 } 2965 if (ii > level) { 2966 team = team->t.t_parent; 2967 ii--; 2968 } 2969 } 2970 2971 return team->t.t_nproc; 2972 } 2973 2974 kmp_r_sched_t __kmp_get_schedule_global() { 2975 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2976 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2977 // independently. So one can get the updated schedule here. 2978 2979 kmp_r_sched_t r_sched; 2980 2981 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2982 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2983 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2984 // different roots (even in OMP 2.5) 2985 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2986 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2987 if (s == kmp_sch_static) { 2988 // replace STATIC with more detailed schedule (balanced or greedy) 2989 r_sched.r_sched_type = __kmp_static; 2990 } else if (s == kmp_sch_guided_chunked) { 2991 // replace GUIDED with more detailed schedule (iterative or analytical) 2992 r_sched.r_sched_type = __kmp_guided; 2993 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2994 r_sched.r_sched_type = __kmp_sched; 2995 } 2996 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2997 2998 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2999 // __kmp_chunk may be wrong here (if it was not ever set) 3000 r_sched.chunk = KMP_DEFAULT_CHUNK; 3001 } else { 3002 r_sched.chunk = __kmp_chunk; 3003 } 3004 3005 return r_sched; 3006 } 3007 3008 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3009 at least argc number of *t_argv entries for the requested team. 
*/ 3010 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3011 3012 KMP_DEBUG_ASSERT(team); 3013 if (!realloc || argc > team->t.t_max_argc) { 3014 3015 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3016 "current entries=%d\n", 3017 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3018 /* if previously allocated heap space for args, free them */ 3019 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3020 __kmp_free((void *)team->t.t_argv); 3021 3022 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3023 /* use unused space in the cache line for arguments */ 3024 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3025 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3026 "argv entries\n", 3027 team->t.t_id, team->t.t_max_argc)); 3028 team->t.t_argv = &team->t.t_inline_argv[0]; 3029 if (__kmp_storage_map) { 3030 __kmp_print_storage_map_gtid( 3031 -1, &team->t.t_inline_argv[0], 3032 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3033 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3034 team->t.t_id); 3035 } 3036 } else { 3037 /* allocate space for arguments in the heap */ 3038 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3039 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3040 : 2 * argc; 3041 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3042 "argv entries\n", 3043 team->t.t_id, team->t.t_max_argc)); 3044 team->t.t_argv = 3045 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3046 if (__kmp_storage_map) { 3047 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3048 &team->t.t_argv[team->t.t_max_argc], 3049 sizeof(void *) * team->t.t_max_argc, 3050 "team_%d.t_argv", team->t.t_id); 3051 } 3052 } 3053 } 3054 } 3055 3056 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3057 int i; 3058 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3059 team->t.t_threads = 3060 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3061 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3062 sizeof(dispatch_shared_info_t) * num_disp_buff); 3063 team->t.t_dispatch = 3064 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3065 team->t.t_implicit_task_taskdata = 3066 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3067 team->t.t_max_nproc = max_nth; 3068 3069 /* setup dispatch buffers */ 3070 for (i = 0; i < num_disp_buff; ++i) { 3071 team->t.t_disp_buffer[i].buffer_index = i; 3072 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3073 } 3074 } 3075 3076 static void __kmp_free_team_arrays(kmp_team_t *team) { 3077 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3078 int i; 3079 for (i = 0; i < team->t.t_max_nproc; ++i) { 3080 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3081 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3082 team->t.t_dispatch[i].th_disp_buffer = NULL; 3083 } 3084 } 3085 #if KMP_USE_HIER_SCHED 3086 __kmp_dispatch_free_hierarchies(team); 3087 #endif 3088 __kmp_free(team->t.t_threads); 3089 __kmp_free(team->t.t_disp_buffer); 3090 __kmp_free(team->t.t_dispatch); 3091 __kmp_free(team->t.t_implicit_task_taskdata); 3092 team->t.t_threads = NULL; 3093 team->t.t_disp_buffer = NULL; 3094 team->t.t_dispatch = NULL; 3095 team->t.t_implicit_task_taskdata = 0; 3096 } 3097 3098 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3099 kmp_info_t **oldThreads = team->t.t_threads; 3100 3101 __kmp_free(team->t.t_disp_buffer); 3102 __kmp_free(team->t.t_dispatch); 3103 __kmp_free(team->t.t_implicit_task_taskdata); 3104 __kmp_allocate_team_arrays(team, max_nth); 3105 3106 KMP_MEMCPY(team->t.t_threads, oldThreads, 3107 team->t.t_nproc * sizeof(kmp_info_t *)); 3108 3109 __kmp_free(oldThreads); 3110 } 3111 3112 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3113 3114 kmp_r_sched_t r_sched = 3115 __kmp_get_schedule_global(); // get current state of scheduling globals 3116 3117 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3118 3119 kmp_internal_control_t g_icvs = { 3120 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3121 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3122 // adjustment of threads (per thread) 3123 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3124 // whether blocktime is explicitly set 3125 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3126 #if KMP_USE_MONITOR 3127 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3128 // intervals 3129 #endif 3130 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3131 // next parallel region (per thread) 3132 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3133 __kmp_cg_max_nth, // int thread_limit; 3134 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3135 // for max_active_levels 3136 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3137 // {sched,chunk} pair 3138 __kmp_nested_proc_bind.bind_types[0], 3139 __kmp_default_device, 3140 NULL // struct kmp_internal_control *next; 3141 }; 3142 3143 return g_icvs; 3144 } 3145 3146 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3147 3148 kmp_internal_control_t gx_icvs; 3149 gx_icvs.serial_nesting_level = 3150 0; // probably =team->t.t_serial 
like in save_inter_controls 3151 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3152 gx_icvs.next = NULL; 3153 3154 return gx_icvs; 3155 } 3156 3157 static void __kmp_initialize_root(kmp_root_t *root) { 3158 int f; 3159 kmp_team_t *root_team; 3160 kmp_team_t *hot_team; 3161 int hot_team_max_nth; 3162 kmp_r_sched_t r_sched = 3163 __kmp_get_schedule_global(); // get current state of scheduling globals 3164 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3165 KMP_DEBUG_ASSERT(root); 3166 KMP_ASSERT(!root->r.r_begin); 3167 3168 /* setup the root state structure */ 3169 __kmp_init_lock(&root->r.r_begin_lock); 3170 root->r.r_begin = FALSE; 3171 root->r.r_active = FALSE; 3172 root->r.r_in_parallel = 0; 3173 root->r.r_blocktime = __kmp_dflt_blocktime; 3174 3175 /* setup the root team for this task */ 3176 /* allocate the root team structure */ 3177 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3178 3179 root_team = 3180 __kmp_allocate_team(root, 3181 1, // new_nproc 3182 1, // max_nproc 3183 #if OMPT_SUPPORT 3184 ompt_data_none, // root parallel id 3185 #endif 3186 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3187 0 // argc 3188 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3189 ); 3190 #if USE_DEBUGGER 3191 // Non-NULL value should be assigned to make the debugger display the root 3192 // team. 3193 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3194 #endif 3195 3196 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3197 3198 root->r.r_root_team = root_team; 3199 root_team->t.t_control_stack_top = NULL; 3200 3201 /* initialize root team */ 3202 root_team->t.t_threads[0] = NULL; 3203 root_team->t.t_nproc = 1; 3204 root_team->t.t_serialized = 1; 3205 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3206 root_team->t.t_sched.sched = r_sched.sched; 3207 KA_TRACE( 3208 20, 3209 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3210 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3211 3212 /* setup the hot team for this task */ 3213 /* allocate the hot team structure */ 3214 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3215 3216 hot_team = 3217 __kmp_allocate_team(root, 3218 1, // new_nproc 3219 __kmp_dflt_team_nth_ub * 2, // max_nproc 3220 #if OMPT_SUPPORT 3221 ompt_data_none, // root parallel id 3222 #endif 3223 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3224 0 // argc 3225 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3226 ); 3227 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3228 3229 root->r.r_hot_team = hot_team; 3230 root_team->t.t_control_stack_top = NULL; 3231 3232 /* first-time initialization */ 3233 hot_team->t.t_parent = root_team; 3234 3235 /* initialize hot team */ 3236 hot_team_max_nth = hot_team->t.t_max_nproc; 3237 for (f = 0; f < hot_team_max_nth; ++f) { 3238 hot_team->t.t_threads[f] = NULL; 3239 } 3240 hot_team->t.t_nproc = 1; 3241 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3242 hot_team->t.t_sched.sched = r_sched.sched; 3243 hot_team->t.t_size_changed = 0; 3244 } 3245 3246 #ifdef KMP_DEBUG 3247 3248 typedef struct kmp_team_list_item { 3249 kmp_team_p const *entry; 3250 struct kmp_team_list_item *next; 3251 } kmp_team_list_item_t; 3252 typedef kmp_team_list_item_t *kmp_team_list_t; 3253 3254 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3255 kmp_team_list_t list, // List of teams. 
3256 kmp_team_p const *team // Team to add. 3257 ) { 3258 3259 // List must terminate with item where both entry and next are NULL. 3260 // Team is added to the list only once. 3261 // List is sorted in ascending order by team id. 3262 // Team id is *not* a key. 3263 3264 kmp_team_list_t l; 3265 3266 KMP_DEBUG_ASSERT(list != NULL); 3267 if (team == NULL) { 3268 return; 3269 } 3270 3271 __kmp_print_structure_team_accum(list, team->t.t_parent); 3272 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3273 3274 // Search list for the team. 3275 l = list; 3276 while (l->next != NULL && l->entry != team) { 3277 l = l->next; 3278 } 3279 if (l->next != NULL) { 3280 return; // Team has been added before, exit. 3281 } 3282 3283 // Team is not found. Search list again for insertion point. 3284 l = list; 3285 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3286 l = l->next; 3287 } 3288 3289 // Insert team. 3290 { 3291 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3292 sizeof(kmp_team_list_item_t)); 3293 *item = *l; 3294 l->entry = team; 3295 l->next = item; 3296 } 3297 } 3298 3299 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3300 3301 ) { 3302 __kmp_printf("%s", title); 3303 if (team != NULL) { 3304 __kmp_printf("%2x %p\n", team->t.t_id, team); 3305 } else { 3306 __kmp_printf(" - (nil)\n"); 3307 } 3308 } 3309 3310 static void __kmp_print_structure_thread(char const *title, 3311 kmp_info_p const *thread) { 3312 __kmp_printf("%s", title); 3313 if (thread != NULL) { 3314 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3315 } else { 3316 __kmp_printf(" - (nil)\n"); 3317 } 3318 } 3319 3320 void __kmp_print_structure(void) { 3321 3322 kmp_team_list_t list; 3323 3324 // Initialize list of teams. 3325 list = 3326 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3327 list->entry = NULL; 3328 list->next = NULL; 3329 3330 __kmp_printf("\n------------------------------\nGlobal Thread " 3331 "Table\n------------------------------\n"); 3332 { 3333 int gtid; 3334 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3335 __kmp_printf("%2d", gtid); 3336 if (__kmp_threads != NULL) { 3337 __kmp_printf(" %p", __kmp_threads[gtid]); 3338 } 3339 if (__kmp_root != NULL) { 3340 __kmp_printf(" %p", __kmp_root[gtid]); 3341 } 3342 __kmp_printf("\n"); 3343 } 3344 } 3345 3346 // Print out __kmp_threads array. 
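  // While printing each thread below, its team and serial team are also
  // accumulated into "list" via __kmp_print_structure_team_accum(): a singly
  // linked list kept sorted by team id and terminated by a sentinel item whose
  // entry and next are both NULL, so the Teams section further down prints
  // every reachable team exactly once.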
3347 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3348 "----------\n"); 3349 if (__kmp_threads != NULL) { 3350 int gtid; 3351 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3352 kmp_info_t const *thread = __kmp_threads[gtid]; 3353 if (thread != NULL) { 3354 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3355 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3356 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3357 __kmp_print_structure_team(" Serial Team: ", 3358 thread->th.th_serial_team); 3359 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3360 __kmp_print_structure_thread(" Primary: ", 3361 thread->th.th_team_master); 3362 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3363 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3364 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3365 __kmp_print_structure_thread(" Next in pool: ", 3366 thread->th.th_next_pool); 3367 __kmp_printf("\n"); 3368 __kmp_print_structure_team_accum(list, thread->th.th_team); 3369 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3370 } 3371 } 3372 } else { 3373 __kmp_printf("Threads array is not allocated.\n"); 3374 } 3375 3376 // Print out __kmp_root array. 3377 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3378 "--------\n"); 3379 if (__kmp_root != NULL) { 3380 int gtid; 3381 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3382 kmp_root_t const *root = __kmp_root[gtid]; 3383 if (root != NULL) { 3384 __kmp_printf("GTID %2d %p:\n", gtid, root); 3385 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3386 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3387 __kmp_print_structure_thread(" Uber Thread: ", 3388 root->r.r_uber_thread); 3389 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3390 __kmp_printf(" In Parallel: %2d\n", 3391 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3392 __kmp_printf("\n"); 3393 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3394 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3395 } 3396 } 3397 } else { 3398 __kmp_printf("Ubers array is not allocated.\n"); 3399 } 3400 3401 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3402 "--------\n"); 3403 while (list->next != NULL) { 3404 kmp_team_p const *team = list->entry; 3405 int i; 3406 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3407 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3408 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3409 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3410 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3411 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3412 for (i = 0; i < team->t.t_nproc; ++i) { 3413 __kmp_printf(" Thread %2d: ", i); 3414 __kmp_print_structure_thread("", team->t.t_threads[i]); 3415 } 3416 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3417 __kmp_printf("\n"); 3418 list = list->next; 3419 } 3420 3421 // Print out __kmp_thread_pool and __kmp_team_pool. 3422 __kmp_printf("\n------------------------------\nPools\n----------------------" 3423 "--------\n"); 3424 __kmp_print_structure_thread("Thread pool: ", 3425 CCAST(kmp_info_t *, __kmp_thread_pool)); 3426 __kmp_print_structure_team("Team pool: ", 3427 CCAST(kmp_team_t *, __kmp_team_pool)); 3428 __kmp_printf("\n"); 3429 3430 // Free team list. 
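  // (This also releases the sentinel item allocated at the top of this
  // routine, since the loop runs until "list" itself becomes NULL.)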
3431 while (list != NULL) { 3432 kmp_team_list_item_t *item = list; 3433 list = list->next; 3434 KMP_INTERNAL_FREE(item); 3435 } 3436 } 3437 3438 #endif 3439 3440 //--------------------------------------------------------------------------- 3441 // Stuff for per-thread fast random number generator 3442 // Table of primes 3443 static const unsigned __kmp_primes[] = { 3444 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3445 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3446 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3447 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3448 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3449 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3450 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3451 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3452 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3453 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3454 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3455 3456 //--------------------------------------------------------------------------- 3457 // __kmp_get_random: Get a random number using a linear congruential method. 3458 unsigned short __kmp_get_random(kmp_info_t *thread) { 3459 unsigned x = thread->th.th_x; 3460 unsigned short r = (unsigned short)(x >> 16); 3461 3462 thread->th.th_x = x * thread->th.th_a + 1; 3463 3464 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3465 thread->th.th_info.ds.ds_tid, r)); 3466 3467 return r; 3468 } 3469 //-------------------------------------------------------- 3470 // __kmp_init_random: Initialize a random number generator 3471 void __kmp_init_random(kmp_info_t *thread) { 3472 unsigned seed = thread->th.th_info.ds.ds_tid; 3473 3474 thread->th.th_a = 3475 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3476 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3477 KA_TRACE(30, 3478 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3479 } 3480 3481 #if KMP_OS_WINDOWS 3482 /* reclaim array entries for root threads that are already dead, returns number 3483 * reclaimed */ 3484 static int __kmp_reclaim_dead_roots(void) { 3485 int i, r = 0; 3486 3487 for (i = 0; i < __kmp_threads_capacity; ++i) { 3488 if (KMP_UBER_GTID(i) && 3489 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3490 !__kmp_root[i] 3491 ->r.r_active) { // AC: reclaim only roots died in non-active state 3492 r += __kmp_unregister_root_other_thread(i); 3493 } 3494 } 3495 return r; 3496 } 3497 #endif 3498 3499 /* This function attempts to create free entries in __kmp_threads and 3500 __kmp_root, and returns the number of free entries generated. 3501 3502 For Windows* OS static library, the first mechanism used is to reclaim array 3503 entries for root threads that are already dead. 3504 3505 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3506 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3507 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3508 threadprivate cache array has been created. Synchronization with 3509 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
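   As a purely hypothetical illustration: with __kmp_threads_capacity == 6,
   __kmp_sys_max_nth == 64 and nNeed == 10, the doubling loop below grows the
   candidate capacity 6 -> 12 -> 24, the first value covering the required
   minimum of 6 + 10 = 16, so 18 new free entries are produced.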
3510 3511 After any dead root reclamation, if the clipping value allows array expansion 3512 to result in the generation of a total of nNeed free slots, the function does 3513 that expansion. If not, nothing is done beyond the possible initial root 3514 thread reclamation. 3515 3516 If any argument is negative, the behavior is undefined. */ 3517 static int __kmp_expand_threads(int nNeed) { 3518 int added = 0; 3519 int minimumRequiredCapacity; 3520 int newCapacity; 3521 kmp_info_t **newThreads; 3522 kmp_root_t **newRoot; 3523 3524 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3525 // resizing __kmp_threads does not need additional protection if foreign 3526 // threads are present 3527 3528 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3529 /* only for Windows static library */ 3530 /* reclaim array entries for root threads that are already dead */ 3531 added = __kmp_reclaim_dead_roots(); 3532 3533 if (nNeed) { 3534 nNeed -= added; 3535 if (nNeed < 0) 3536 nNeed = 0; 3537 } 3538 #endif 3539 if (nNeed <= 0) 3540 return added; 3541 3542 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3543 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3544 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3545 // > __kmp_max_nth in one of two ways: 3546 // 3547 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3548 // may not be reused by another thread, so we may need to increase 3549 // __kmp_threads_capacity to __kmp_max_nth + 1. 3550 // 3551 // 2) New foreign root(s) are encountered. We always register new foreign 3552 // roots. This may cause a smaller # of threads to be allocated at 3553 // subsequent parallel regions, but the worker threads hang around (and 3554 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3555 // 3556 // Anyway, that is the reason for moving the check to see if 3557 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3558 // instead of having it performed here. -BB 3559 3560 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3561 3562 /* compute expansion headroom to check if we can expand */ 3563 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3564 /* possible expansion too small -- give up */ 3565 return added; 3566 } 3567 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3568 3569 newCapacity = __kmp_threads_capacity; 3570 do { 3571 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3572 : __kmp_sys_max_nth; 3573 } while (newCapacity < minimumRequiredCapacity); 3574 newThreads = (kmp_info_t **)__kmp_allocate( 3575 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3576 newRoot = 3577 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3578 KMP_MEMCPY(newThreads, __kmp_threads, 3579 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3580 KMP_MEMCPY(newRoot, __kmp_root, 3581 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3582 3583 kmp_info_t **temp_threads = __kmp_threads; 3584 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3585 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3586 __kmp_free(temp_threads); 3587 added += newCapacity - __kmp_threads_capacity; 3588 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3589 3590 if (newCapacity > __kmp_tp_capacity) { 3591 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3592 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3593 __kmp_threadprivate_resize_cache(newCapacity); 3594 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3595 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3596 } 3597 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3598 } 3599 3600 return added; 3601 } 3602 3603 /* Register the current thread as a root thread and obtain our gtid. We must 3604 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3605 thread that calls from __kmp_do_serial_initialize() */ 3606 int __kmp_register_root(int initial_thread) { 3607 kmp_info_t *root_thread; 3608 kmp_root_t *root; 3609 int gtid; 3610 int capacity; 3611 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3612 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3613 KMP_MB(); 3614 3615 /* 2007-03-02: 3616 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3617 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3618 work as expected -- it may return false (that means there is at least one 3619 empty slot in __kmp_threads array), but it is possible the only free slot 3620 is #0, which is reserved for initial thread and so cannot be used for this 3621 one. Following code workarounds this bug. 3622 3623 However, right solution seems to be not reserving slot #0 for initial 3624 thread because: 3625 (1) there is no magic in slot #0, 3626 (2) we cannot detect initial thread reliably (the first thread which does 3627 serial initialization may be not a real initial thread). 3628 */ 3629 capacity = __kmp_threads_capacity; 3630 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3631 --capacity; 3632 } 3633 3634 // If it is not for initializing the hidden helper team, we need to take 3635 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3636 // in __kmp_threads_capacity. 
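  // For illustration (hypothetical numbers): with __kmp_threads_capacity == 64,
  // 8 hidden helper threads, and slot 0 still reserved for a not-yet-registered
  // initial thread, a foreign root sees 64 - 1 - 8 = 55 usable slots; only once
  // __kmp_all_nth reaches that value is __kmp_expand_threads(1) attempted below.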
3637 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3638 capacity -= __kmp_hidden_helper_threads_num; 3639 } 3640 3641 /* see if there are too many threads */ 3642 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3643 if (__kmp_tp_cached) { 3644 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3645 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3646 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3647 } else { 3648 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3649 __kmp_msg_null); 3650 } 3651 } 3652 3653 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3654 // 0: initial thread, also a regular OpenMP thread. 3655 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3656 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3657 // regular OpenMP threads. 3658 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3659 // Find an available thread slot for hidden helper thread. Slots for hidden 3660 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3661 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3662 gtid <= __kmp_hidden_helper_threads_num; 3663 gtid++) 3664 ; 3665 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3666 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3667 "hidden helper thread: T#%d\n", 3668 gtid)); 3669 } else { 3670 /* find an available thread slot */ 3671 // Don't reassign the zero slot since we need that to only be used by 3672 // initial thread. Slots for hidden helper threads should also be skipped. 3673 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3674 gtid = 0; 3675 } else { 3676 for (gtid = __kmp_hidden_helper_threads_num + 1; 3677 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3678 ; 3679 } 3680 KA_TRACE( 3681 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3682 KMP_ASSERT(gtid < __kmp_threads_capacity); 3683 } 3684 3685 /* update global accounting */ 3686 __kmp_all_nth++; 3687 TCW_4(__kmp_nth, __kmp_nth + 1); 3688 3689 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3690 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3691 if (__kmp_adjust_gtid_mode) { 3692 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3693 if (TCR_4(__kmp_gtid_mode) != 2) { 3694 TCW_4(__kmp_gtid_mode, 2); 3695 } 3696 } else { 3697 if (TCR_4(__kmp_gtid_mode) != 1) { 3698 TCW_4(__kmp_gtid_mode, 1); 3699 } 3700 } 3701 } 3702 3703 #ifdef KMP_ADJUST_BLOCKTIME 3704 /* Adjust blocktime to zero if necessary */ 3705 /* Middle initialization might not have occurred yet */ 3706 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3707 if (__kmp_nth > __kmp_avail_proc) { 3708 __kmp_zero_bt = TRUE; 3709 } 3710 } 3711 #endif /* KMP_ADJUST_BLOCKTIME */ 3712 3713 /* setup this new hierarchy */ 3714 if (!(root = __kmp_root[gtid])) { 3715 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3716 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3717 } 3718 3719 #if KMP_STATS_ENABLED 3720 // Initialize stats as soon as possible (right after gtid assignment). 
3721 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3722 __kmp_stats_thread_ptr->startLife(); 3723 KMP_SET_THREAD_STATE(SERIAL_REGION); 3724 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3725 #endif 3726 __kmp_initialize_root(root); 3727 3728 /* setup new root thread structure */ 3729 if (root->r.r_uber_thread) { 3730 root_thread = root->r.r_uber_thread; 3731 } else { 3732 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3733 if (__kmp_storage_map) { 3734 __kmp_print_thread_storage_map(root_thread, gtid); 3735 } 3736 root_thread->th.th_info.ds.ds_gtid = gtid; 3737 #if OMPT_SUPPORT 3738 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3739 #endif 3740 root_thread->th.th_root = root; 3741 if (__kmp_env_consistency_check) { 3742 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3743 } 3744 #if USE_FAST_MEMORY 3745 __kmp_initialize_fast_memory(root_thread); 3746 #endif /* USE_FAST_MEMORY */ 3747 3748 #if KMP_USE_BGET 3749 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3750 __kmp_initialize_bget(root_thread); 3751 #endif 3752 __kmp_init_random(root_thread); // Initialize random number generator 3753 } 3754 3755 /* setup the serial team held in reserve by the root thread */ 3756 if (!root_thread->th.th_serial_team) { 3757 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3758 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3759 root_thread->th.th_serial_team = __kmp_allocate_team( 3760 root, 1, 1, 3761 #if OMPT_SUPPORT 3762 ompt_data_none, // root parallel id 3763 #endif 3764 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3765 } 3766 KMP_ASSERT(root_thread->th.th_serial_team); 3767 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3768 root_thread->th.th_serial_team)); 3769 3770 /* drop root_thread into place */ 3771 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3772 3773 root->r.r_root_team->t.t_threads[0] = root_thread; 3774 root->r.r_hot_team->t.t_threads[0] = root_thread; 3775 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3776 // AC: the team created in reserve, not for execution (it is unused for now). 3777 root_thread->th.th_serial_team->t.t_serialized = 0; 3778 root->r.r_uber_thread = root_thread; 3779 3780 /* initialize the thread, get it ready to go */ 3781 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3782 TCW_4(__kmp_init_gtid, TRUE); 3783 3784 /* prepare the primary thread for get_gtid() */ 3785 __kmp_gtid_set_specific(gtid); 3786 3787 #if USE_ITT_BUILD 3788 __kmp_itt_thread_name(gtid); 3789 #endif /* USE_ITT_BUILD */ 3790 3791 #ifdef KMP_TDATA_GTID 3792 __kmp_gtid = gtid; 3793 #endif 3794 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3795 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3796 3797 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3798 "plain=%u\n", 3799 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3800 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3801 KMP_INIT_BARRIER_STATE)); 3802 { // Initialize barrier data. 
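    // Each thread carries one kmp_balign_t per barrier kind (bs_last_barrier of
    // them); seeding b_arrived with KMP_INIT_BARRIER_STATE gives the first
    // fork/join barrier a known baseline to compare against.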
3803 int b; 3804 for (b = 0; b < bs_last_barrier; ++b) { 3805 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3806 #if USE_DEBUGGER 3807 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3808 #endif 3809 } 3810 } 3811 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3812 KMP_INIT_BARRIER_STATE); 3813 3814 #if KMP_AFFINITY_SUPPORTED 3815 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3816 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3817 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3818 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3819 if (TCR_4(__kmp_init_middle)) { 3820 __kmp_affinity_set_init_mask(gtid, TRUE); 3821 } 3822 #endif /* KMP_AFFINITY_SUPPORTED */ 3823 root_thread->th.th_def_allocator = __kmp_def_allocator; 3824 root_thread->th.th_prev_level = 0; 3825 root_thread->th.th_prev_num_threads = 1; 3826 3827 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3828 tmp->cg_root = root_thread; 3829 tmp->cg_thread_limit = __kmp_cg_max_nth; 3830 tmp->cg_nthreads = 1; 3831 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3832 " cg_nthreads init to 1\n", 3833 root_thread, tmp)); 3834 tmp->up = NULL; 3835 root_thread->th.th_cg_roots = tmp; 3836 3837 __kmp_root_counter++; 3838 3839 #if OMPT_SUPPORT 3840 if (!initial_thread && ompt_enabled.enabled) { 3841 3842 kmp_info_t *root_thread = ompt_get_thread(); 3843 3844 ompt_set_thread_state(root_thread, ompt_state_overhead); 3845 3846 if (ompt_enabled.ompt_callback_thread_begin) { 3847 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3848 ompt_thread_initial, __ompt_get_thread_data_internal()); 3849 } 3850 ompt_data_t *task_data; 3851 ompt_data_t *parallel_data; 3852 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3853 NULL); 3854 if (ompt_enabled.ompt_callback_implicit_task) { 3855 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3856 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3857 } 3858 3859 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3860 } 3861 #endif 3862 #if OMPD_SUPPORT 3863 if (ompd_state & OMPD_ENABLE_BP) 3864 ompd_bp_thread_begin(); 3865 #endif 3866 3867 KMP_MB(); 3868 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3869 3870 return gtid; 3871 } 3872 3873 #if KMP_NESTED_HOT_TEAMS 3874 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3875 const int max_level) { 3876 int i, n, nth; 3877 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3878 if (!hot_teams || !hot_teams[level].hot_team) { 3879 return 0; 3880 } 3881 KMP_DEBUG_ASSERT(level < max_level); 3882 kmp_team_t *team = hot_teams[level].hot_team; 3883 nth = hot_teams[level].hot_team_nth; 3884 n = nth - 1; // primary thread is not freed 3885 if (level < max_level - 1) { 3886 for (i = 0; i < nth; ++i) { 3887 kmp_info_t *th = team->t.t_threads[i]; 3888 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3889 if (i > 0 && th->th.th_hot_teams) { 3890 __kmp_free(th->th.th_hot_teams); 3891 th->th.th_hot_teams = NULL; 3892 } 3893 } 3894 } 3895 __kmp_free_team(root, team, NULL); 3896 return n; 3897 } 3898 #endif 3899 3900 // Resets a root thread and clear its root and hot teams. 3901 // Returns the number of __kmp_threads entries directly and indirectly freed. 
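// Teardown order below: free the root team and the hot team (plus any nested
// hot teams and their workers), wait for task teams to stop referencing the
// threads, on Windows close the duplicated handle of the uber thread, release
// its contention-group root, and finally reap the uber thread itself.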
3902 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3903 kmp_team_t *root_team = root->r.r_root_team; 3904 kmp_team_t *hot_team = root->r.r_hot_team; 3905 int n = hot_team->t.t_nproc; 3906 int i; 3907 3908 KMP_DEBUG_ASSERT(!root->r.r_active); 3909 3910 root->r.r_root_team = NULL; 3911 root->r.r_hot_team = NULL; 3912 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3913 // before call to __kmp_free_team(). 3914 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3915 #if KMP_NESTED_HOT_TEAMS 3916 if (__kmp_hot_teams_max_level > 3917 0) { // need to free nested hot teams and their threads if any 3918 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3919 kmp_info_t *th = hot_team->t.t_threads[i]; 3920 if (__kmp_hot_teams_max_level > 1) { 3921 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3922 } 3923 if (th->th.th_hot_teams) { 3924 __kmp_free(th->th.th_hot_teams); 3925 th->th.th_hot_teams = NULL; 3926 } 3927 } 3928 } 3929 #endif 3930 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3931 3932 // Before we can reap the thread, we need to make certain that all other 3933 // threads in the teams that had this root as ancestor have stopped trying to 3934 // steal tasks. 3935 if (__kmp_tasking_mode != tskm_immediate_exec) { 3936 __kmp_wait_to_unref_task_teams(); 3937 } 3938 3939 #if KMP_OS_WINDOWS 3940 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3941 KA_TRACE( 3942 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3943 "\n", 3944 (LPVOID) & (root->r.r_uber_thread->th), 3945 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3946 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3947 #endif /* KMP_OS_WINDOWS */ 3948 3949 #if OMPD_SUPPORT 3950 if (ompd_state & OMPD_ENABLE_BP) 3951 ompd_bp_thread_end(); 3952 #endif 3953 3954 #if OMPT_SUPPORT 3955 ompt_data_t *task_data; 3956 ompt_data_t *parallel_data; 3957 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3958 NULL); 3959 if (ompt_enabled.ompt_callback_implicit_task) { 3960 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3961 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3962 } 3963 if (ompt_enabled.ompt_callback_thread_end) { 3964 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3965 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3966 } 3967 #endif 3968 3969 TCW_4(__kmp_nth, 3970 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3971 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3972 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3973 " to %d\n", 3974 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3975 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3976 if (i == 1) { 3977 // need to free contention group structure 3978 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3979 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3980 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3981 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3982 root->r.r_uber_thread->th.th_cg_roots = NULL; 3983 } 3984 __kmp_reap_thread(root->r.r_uber_thread, 1); 3985 3986 // We canot put root thread to __kmp_thread_pool, so we have to reap it 3987 // instead of freeing. 
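  // (The uber thread's underlying OS thread was created by the application,
  // not by the runtime, so it cannot be recycled as a worker.)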
3988 root->r.r_uber_thread = NULL; 3989 /* mark root as no longer in use */ 3990 root->r.r_begin = FALSE; 3991 3992 return n; 3993 } 3994 3995 void __kmp_unregister_root_current_thread(int gtid) { 3996 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3997 /* this lock should be ok, since unregister_root_current_thread is never 3998 called during an abort, only during a normal close. furthermore, if you 3999 have the forkjoin lock, you should never try to get the initz lock */ 4000 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4001 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4002 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4003 "exiting T#%d\n", 4004 gtid)); 4005 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4006 return; 4007 } 4008 kmp_root_t *root = __kmp_root[gtid]; 4009 4010 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4011 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4012 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4013 KMP_ASSERT(root->r.r_active == FALSE); 4014 4015 KMP_MB(); 4016 4017 kmp_info_t *thread = __kmp_threads[gtid]; 4018 kmp_team_t *team = thread->th.th_team; 4019 kmp_task_team_t *task_team = thread->th.th_task_team; 4020 4021 // we need to wait for the proxy tasks before finishing the thread 4022 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4023 #if OMPT_SUPPORT 4024 // the runtime is shutting down so we won't report any events 4025 thread->th.ompt_thread_info.state = ompt_state_undefined; 4026 #endif 4027 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4028 } 4029 4030 __kmp_reset_root(gtid, root); 4031 4032 KMP_MB(); 4033 KC_TRACE(10, 4034 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4035 4036 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4037 } 4038 4039 #if KMP_OS_WINDOWS 4040 /* __kmp_forkjoin_lock must be already held 4041 Unregisters a root thread that is not the current thread. Returns the number 4042 of __kmp_threads entries freed as a result. 
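   It is called from __kmp_reclaim_dead_roots() above, which reclaims entries
   of roots that have already died in a non-active state.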
*/ 4043 static int __kmp_unregister_root_other_thread(int gtid) { 4044 kmp_root_t *root = __kmp_root[gtid]; 4045 int r; 4046 4047 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4048 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4049 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4050 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4051 KMP_ASSERT(root->r.r_active == FALSE); 4052 4053 r = __kmp_reset_root(gtid, root); 4054 KC_TRACE(10, 4055 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4056 return r; 4057 } 4058 #endif 4059 4060 #if KMP_DEBUG 4061 void __kmp_task_info() { 4062 4063 kmp_int32 gtid = __kmp_entry_gtid(); 4064 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4065 kmp_info_t *this_thr = __kmp_threads[gtid]; 4066 kmp_team_t *steam = this_thr->th.th_serial_team; 4067 kmp_team_t *team = this_thr->th.th_team; 4068 4069 __kmp_printf( 4070 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4071 "ptask=%p\n", 4072 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4073 team->t.t_implicit_task_taskdata[tid].td_parent); 4074 } 4075 #endif // KMP_DEBUG 4076 4077 /* TODO optimize with one big memclr, take out what isn't needed, split 4078 responsibility to workers as much as possible, and delay initialization of 4079 features as much as possible */ 4080 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4081 int tid, int gtid) { 4082 /* this_thr->th.th_info.ds.ds_gtid is setup in 4083 kmp_allocate_thread/create_worker. 4084 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4085 KMP_DEBUG_ASSERT(this_thr != NULL); 4086 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4087 KMP_DEBUG_ASSERT(team); 4088 KMP_DEBUG_ASSERT(team->t.t_threads); 4089 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4090 kmp_info_t *master = team->t.t_threads[0]; 4091 KMP_DEBUG_ASSERT(master); 4092 KMP_DEBUG_ASSERT(master->th.th_root); 4093 4094 KMP_MB(); 4095 4096 TCW_SYNC_PTR(this_thr->th.th_team, team); 4097 4098 this_thr->th.th_info.ds.ds_tid = tid; 4099 this_thr->th.th_set_nproc = 0; 4100 if (__kmp_tasking_mode != tskm_immediate_exec) 4101 // When tasking is possible, threads are not safe to reap until they are 4102 // done tasking; this will be set when tasking code is exited in wait 4103 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4104 else // no tasking --> always safe to reap 4105 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4106 this_thr->th.th_set_proc_bind = proc_bind_default; 4107 #if KMP_AFFINITY_SUPPORTED 4108 this_thr->th.th_new_place = this_thr->th.th_current_place; 4109 #endif 4110 this_thr->th.th_root = master->th.th_root; 4111 4112 /* setup the thread's cache of the team structure */ 4113 this_thr->th.th_team_nproc = team->t.t_nproc; 4114 this_thr->th.th_team_master = master; 4115 this_thr->th.th_team_serialized = team->t.t_serialized; 4116 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4117 4118 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4119 4120 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4121 tid, gtid, this_thr, this_thr->th.th_current_task)); 4122 4123 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4124 team, tid, TRUE); 4125 4126 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4127 tid, gtid, this_thr, this_thr->th.th_current_task)); 4128 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4129 // __kmp_initialize_team()? 
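  // Note on the dynamic-dispatch setup further below: the per-thread buffer is
  // sized for __kmp_dispatch_num_buffers dispatch_private_info_t entries (just
  // one when the team's max_nproc is 1) so that a thread racing ahead to a
  // later nowait worksharing construct does not clobber the buffer that slower
  // teammates are still reading.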
4130 4131 /* TODO no worksharing in speculative threads */ 4132 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4133 4134 this_thr->th.th_local.this_construct = 0; 4135 4136 if (!this_thr->th.th_pri_common) { 4137 this_thr->th.th_pri_common = 4138 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4139 if (__kmp_storage_map) { 4140 __kmp_print_storage_map_gtid( 4141 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4142 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4143 } 4144 this_thr->th.th_pri_head = NULL; 4145 } 4146 4147 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4148 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4149 // Make new thread's CG root same as primary thread's 4150 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4151 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4152 if (tmp) { 4153 // worker changes CG, need to check if old CG should be freed 4154 int i = tmp->cg_nthreads--; 4155 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4156 " on node %p of thread %p to %d\n", 4157 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4158 if (i == 1) { 4159 __kmp_free(tmp); // last thread left CG --> free it 4160 } 4161 } 4162 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4163 // Increment new thread's CG root's counter to add the new thread 4164 this_thr->th.th_cg_roots->cg_nthreads++; 4165 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4166 " node %p of thread %p to %d\n", 4167 this_thr, this_thr->th.th_cg_roots, 4168 this_thr->th.th_cg_roots->cg_root, 4169 this_thr->th.th_cg_roots->cg_nthreads)); 4170 this_thr->th.th_current_task->td_icvs.thread_limit = 4171 this_thr->th.th_cg_roots->cg_thread_limit; 4172 } 4173 4174 /* Initialize dynamic dispatch */ 4175 { 4176 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4177 // Use team max_nproc since this will never change for the team. 4178 size_t disp_size = 4179 sizeof(dispatch_private_info_t) * 4180 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4181 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4182 team->t.t_max_nproc)); 4183 KMP_ASSERT(dispatch); 4184 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4185 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4186 4187 dispatch->th_disp_index = 0; 4188 dispatch->th_doacross_buf_idx = 0; 4189 if (!dispatch->th_disp_buffer) { 4190 dispatch->th_disp_buffer = 4191 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4192 4193 if (__kmp_storage_map) { 4194 __kmp_print_storage_map_gtid( 4195 gtid, &dispatch->th_disp_buffer[0], 4196 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4197 ? 
1 4198 : __kmp_dispatch_num_buffers], 4199 disp_size, 4200 "th_%d.th_dispatch.th_disp_buffer " 4201 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4202 gtid, team->t.t_id, gtid); 4203 } 4204 } else { 4205 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4206 } 4207 4208 dispatch->th_dispatch_pr_current = 0; 4209 dispatch->th_dispatch_sh_current = 0; 4210 4211 dispatch->th_deo_fcn = 0; /* ORDERED */ 4212 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4213 } 4214 4215 this_thr->th.th_next_pool = NULL; 4216 4217 if (!this_thr->th.th_task_state_memo_stack) { 4218 size_t i; 4219 this_thr->th.th_task_state_memo_stack = 4220 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4221 this_thr->th.th_task_state_top = 0; 4222 this_thr->th.th_task_state_stack_sz = 4; 4223 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4224 ++i) // zero init the stack 4225 this_thr->th.th_task_state_memo_stack[i] = 0; 4226 } 4227 4228 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4229 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4230 4231 KMP_MB(); 4232 } 4233 4234 /* allocate a new thread for the requesting team. this is only called from 4235 within a forkjoin critical section. we will first try to get an available 4236 thread from the thread pool. if none is available, we will fork a new one 4237 assuming we are able to create a new one. this should be assured, as the 4238 caller should check on this first. */ 4239 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4240 int new_tid) { 4241 kmp_team_t *serial_team; 4242 kmp_info_t *new_thr; 4243 int new_gtid; 4244 4245 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4246 KMP_DEBUG_ASSERT(root && team); 4247 #if !KMP_NESTED_HOT_TEAMS 4248 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4249 #endif 4250 KMP_MB(); 4251 4252 /* first, try to get one from the thread pool */ 4253 if (__kmp_thread_pool) { 4254 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4255 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4256 if (new_thr == __kmp_thread_pool_insert_pt) { 4257 __kmp_thread_pool_insert_pt = NULL; 4258 } 4259 TCW_4(new_thr->th.th_in_pool, FALSE); 4260 __kmp_suspend_initialize_thread(new_thr); 4261 __kmp_lock_suspend_mx(new_thr); 4262 if (new_thr->th.th_active_in_pool == TRUE) { 4263 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4264 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4265 new_thr->th.th_active_in_pool = FALSE; 4266 } 4267 __kmp_unlock_suspend_mx(new_thr); 4268 4269 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4270 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4271 KMP_ASSERT(!new_thr->th.th_team); 4272 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4273 4274 /* setup the thread structure */ 4275 __kmp_initialize_info(new_thr, team, new_tid, 4276 new_thr->th.th_info.ds.ds_gtid); 4277 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4278 4279 TCW_4(__kmp_nth, __kmp_nth + 1); 4280 4281 new_thr->th.th_task_state = 0; 4282 new_thr->th.th_task_state_top = 0; 4283 new_thr->th.th_task_state_stack_sz = 4; 4284 4285 #ifdef KMP_ADJUST_BLOCKTIME 4286 /* Adjust blocktime back to zero if necessary */ 4287 /* Middle initialization might not have occurred yet */ 4288 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4289 if (__kmp_nth > __kmp_avail_proc) { 4290 __kmp_zero_bt = TRUE; 4291 } 4292 } 4293 #endif /* KMP_ADJUST_BLOCKTIME */ 4294 4295 #if KMP_DEBUG 4296 // If thread entered pool via __kmp_free_thread, wait_flag should != 4297 // KMP_BARRIER_PARENT_FLAG. 
4298 int b; 4299 kmp_balign_t *balign = new_thr->th.th_bar; 4300 for (b = 0; b < bs_last_barrier; ++b) 4301 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4302 #endif 4303 4304 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4305 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4306 4307 KMP_MB(); 4308 return new_thr; 4309 } 4310 4311 /* no, well fork a new one */ 4312 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4313 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4314 4315 #if KMP_USE_MONITOR 4316 // If this is the first worker thread the RTL is creating, then also 4317 // launch the monitor thread. We try to do this as early as possible. 4318 if (!TCR_4(__kmp_init_monitor)) { 4319 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4320 if (!TCR_4(__kmp_init_monitor)) { 4321 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4322 TCW_4(__kmp_init_monitor, 1); 4323 __kmp_create_monitor(&__kmp_monitor); 4324 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4325 #if KMP_OS_WINDOWS 4326 // AC: wait until monitor has started. This is a fix for CQ232808. 4327 // The reason is that if the library is loaded/unloaded in a loop with 4328 // small (parallel) work in between, then there is high probability that 4329 // monitor thread started after the library shutdown. At shutdown it is 4330 // too late to cope with the problem, because when the primary thread is 4331 // in DllMain (process detach) the monitor has no chances to start (it is 4332 // blocked), and primary thread has no means to inform the monitor that 4333 // the library has gone, because all the memory which the monitor can 4334 // access is going to be released/reset. 4335 while (TCR_4(__kmp_init_monitor) < 2) { 4336 KMP_YIELD(TRUE); 4337 } 4338 KF_TRACE(10, ("after monitor thread has started\n")); 4339 #endif 4340 } 4341 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4342 } 4343 #endif 4344 4345 KMP_MB(); 4346 4347 { 4348 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4349 ? 1 4350 : __kmp_hidden_helper_threads_num + 1; 4351 4352 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4353 ++new_gtid) { 4354 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4355 } 4356 4357 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4358 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4359 } 4360 } 4361 4362 /* allocate space for it. 
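   The gtid chosen by the search above is the first free slot past the
   hidden-helper range, or a slot within [1, __kmp_hidden_helper_threads_num]
   while the hidden helper team itself is being brought up.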
*/ 4363 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4364 4365 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4366 4367 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4368 // suppress race conditions detection on synchronization flags in debug mode 4369 // this helps to analyze library internals eliminating false positives 4370 __itt_suppress_mark_range( 4371 __itt_suppress_range, __itt_suppress_threading_errors, 4372 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4373 __itt_suppress_mark_range( 4374 __itt_suppress_range, __itt_suppress_threading_errors, 4375 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4376 #if KMP_OS_WINDOWS 4377 __itt_suppress_mark_range( 4378 __itt_suppress_range, __itt_suppress_threading_errors, 4379 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4380 #else 4381 __itt_suppress_mark_range(__itt_suppress_range, 4382 __itt_suppress_threading_errors, 4383 &new_thr->th.th_suspend_init_count, 4384 sizeof(new_thr->th.th_suspend_init_count)); 4385 #endif 4386 // TODO: check if we need to also suppress b_arrived flags 4387 __itt_suppress_mark_range(__itt_suppress_range, 4388 __itt_suppress_threading_errors, 4389 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4390 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4391 __itt_suppress_mark_range(__itt_suppress_range, 4392 __itt_suppress_threading_errors, 4393 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4394 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4395 __itt_suppress_mark_range(__itt_suppress_range, 4396 __itt_suppress_threading_errors, 4397 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4398 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4399 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4400 if (__kmp_storage_map) { 4401 __kmp_print_thread_storage_map(new_thr, new_gtid); 4402 } 4403 4404 // add the reserve serialized team, initialized from the team's primary thread 4405 { 4406 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4407 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4408 new_thr->th.th_serial_team = serial_team = 4409 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4410 #if OMPT_SUPPORT 4411 ompt_data_none, // root parallel id 4412 #endif 4413 proc_bind_default, &r_icvs, 4414 0 USE_NESTED_HOT_ARG(NULL)); 4415 } 4416 KMP_ASSERT(serial_team); 4417 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4418 // execution (it is unused for now). 
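  // (The reserve serial team only comes into play once this thread later
  // executes a serialized parallel region.)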
4419 serial_team->t.t_threads[0] = new_thr; 4420 KF_TRACE(10, 4421 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4422 new_thr)); 4423 4424 /* setup the thread structures */ 4425 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4426 4427 #if USE_FAST_MEMORY 4428 __kmp_initialize_fast_memory(new_thr); 4429 #endif /* USE_FAST_MEMORY */ 4430 4431 #if KMP_USE_BGET 4432 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4433 __kmp_initialize_bget(new_thr); 4434 #endif 4435 4436 __kmp_init_random(new_thr); // Initialize random number generator 4437 4438 /* Initialize these only once when thread is grabbed for a team allocation */ 4439 KA_TRACE(20, 4440 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4441 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4442 4443 int b; 4444 kmp_balign_t *balign = new_thr->th.th_bar; 4445 for (b = 0; b < bs_last_barrier; ++b) { 4446 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4447 balign[b].bb.team = NULL; 4448 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4449 balign[b].bb.use_oncore_barrier = 0; 4450 } 4451 4452 new_thr->th.th_spin_here = FALSE; 4453 new_thr->th.th_next_waiting = 0; 4454 #if KMP_OS_UNIX 4455 new_thr->th.th_blocking = false; 4456 #endif 4457 4458 #if KMP_AFFINITY_SUPPORTED 4459 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4460 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4461 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4462 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4463 #endif 4464 new_thr->th.th_def_allocator = __kmp_def_allocator; 4465 new_thr->th.th_prev_level = 0; 4466 new_thr->th.th_prev_num_threads = 1; 4467 4468 TCW_4(new_thr->th.th_in_pool, FALSE); 4469 new_thr->th.th_active_in_pool = FALSE; 4470 TCW_4(new_thr->th.th_active, TRUE); 4471 4472 /* adjust the global counters */ 4473 __kmp_all_nth++; 4474 __kmp_nth++; 4475 4476 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4477 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4478 if (__kmp_adjust_gtid_mode) { 4479 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4480 if (TCR_4(__kmp_gtid_mode) != 2) { 4481 TCW_4(__kmp_gtid_mode, 2); 4482 } 4483 } else { 4484 if (TCR_4(__kmp_gtid_mode) != 1) { 4485 TCW_4(__kmp_gtid_mode, 1); 4486 } 4487 } 4488 } 4489 4490 #ifdef KMP_ADJUST_BLOCKTIME 4491 /* Adjust blocktime back to zero if necessary */ 4492 /* Middle initialization might not have occurred yet */ 4493 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4494 if (__kmp_nth > __kmp_avail_proc) { 4495 __kmp_zero_bt = TRUE; 4496 } 4497 } 4498 #endif /* KMP_ADJUST_BLOCKTIME */ 4499 4500 /* actually fork it and create the new worker thread */ 4501 KF_TRACE( 4502 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4503 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4504 KF_TRACE(10, 4505 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4506 4507 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4508 new_gtid)); 4509 KMP_MB(); 4510 return new_thr; 4511 } 4512 4513 /* Reinitialize team for reuse. 4514 The hot team code calls this case at every fork barrier, so EPCC barrier 4515 test are extremely sensitive to changes in it, esp. writes to the team 4516 struct, which cause a cache invalidation in all threads. 4517 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
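   That is also why the body below prefers KMP_CHECK_UPDATE() over plain
   assignment: the store is skipped when a field already holds the new value,
   so unchanged fields do not invalidate the team's cache lines in other
   threads.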
*/ 4518 static void __kmp_reinitialize_team(kmp_team_t *team, 4519 kmp_internal_control_t *new_icvs, 4520 ident_t *loc) { 4521 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4522 team->t.t_threads[0], team)); 4523 KMP_DEBUG_ASSERT(team && new_icvs); 4524 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4525 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4526 4527 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4528 // Copy ICVs to the primary thread's implicit taskdata 4529 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4530 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4531 4532 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4533 team->t.t_threads[0], team)); 4534 } 4535 4536 /* Initialize the team data structure. 4537 This assumes the t_threads and t_max_nproc are already set. 4538 Also, we don't touch the arguments */ 4539 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4540 kmp_internal_control_t *new_icvs, 4541 ident_t *loc) { 4542 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4543 4544 /* verify */ 4545 KMP_DEBUG_ASSERT(team); 4546 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4547 KMP_DEBUG_ASSERT(team->t.t_threads); 4548 KMP_MB(); 4549 4550 team->t.t_master_tid = 0; /* not needed */ 4551 /* team->t.t_master_bar; not needed */ 4552 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4553 team->t.t_nproc = new_nproc; 4554 4555 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4556 team->t.t_next_pool = NULL; 4557 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4558 * up hot team */ 4559 4560 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4561 team->t.t_invoke = NULL; /* not needed */ 4562 4563 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4564 team->t.t_sched.sched = new_icvs->sched.sched; 4565 4566 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4567 team->t.t_fp_control_saved = FALSE; /* not needed */ 4568 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4569 team->t.t_mxcsr = 0; /* not needed */ 4570 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4571 4572 team->t.t_construct = 0; 4573 4574 team->t.t_ordered.dt.t_value = 0; 4575 team->t.t_master_active = FALSE; 4576 4577 #ifdef KMP_DEBUG 4578 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4579 #endif 4580 #if KMP_OS_WINDOWS 4581 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4582 #endif 4583 4584 team->t.t_control_stack_top = NULL; 4585 4586 __kmp_reinitialize_team(team, new_icvs, loc); 4587 4588 KMP_MB(); 4589 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4590 } 4591 4592 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4593 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4594 static void 4595 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4596 if (KMP_AFFINITY_CAPABLE()) { 4597 int status; 4598 if (old_mask != NULL) { 4599 status = __kmp_get_system_affinity(old_mask, TRUE); 4600 int error = errno; 4601 if (status != 0) { 4602 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4603 __kmp_msg_null); 4604 } 4605 } 4606 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4607 } 4608 } 4609 #endif 4610 4611 #if KMP_AFFINITY_SUPPORTED 4612 4613 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
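// (A "place" here corresponds to one entry in the runtime's table of affinity
// masks; __kmp_affinity_num_masks is the total number of places.)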
4614 // It calculates the worker + primary thread's partition based upon the parent 4615 // thread's partition, and binds each worker to a thread in their partition. 4616 // The primary thread's partition should already include its current binding. 4617 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4618 // Do not partition places for the hidden helper team 4619 if (KMP_HIDDEN_HELPER_TEAM(team)) 4620 return; 4621 // Copy the primary thread's place partition to the team struct 4622 kmp_info_t *master_th = team->t.t_threads[0]; 4623 KMP_DEBUG_ASSERT(master_th != NULL); 4624 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4625 int first_place = master_th->th.th_first_place; 4626 int last_place = master_th->th.th_last_place; 4627 int masters_place = master_th->th.th_current_place; 4628 team->t.t_first_place = first_place; 4629 team->t.t_last_place = last_place; 4630 4631 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4632 "bound to place %d partition = [%d,%d]\n", 4633 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4634 team->t.t_id, masters_place, first_place, last_place)); 4635 4636 switch (proc_bind) { 4637 4638 case proc_bind_default: 4639 // Serial teams might have the proc_bind policy set to proc_bind_default. 4640 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4641 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4642 break; 4643 4644 case proc_bind_primary: { 4645 int f; 4646 int n_th = team->t.t_nproc; 4647 for (f = 1; f < n_th; f++) { 4648 kmp_info_t *th = team->t.t_threads[f]; 4649 KMP_DEBUG_ASSERT(th != NULL); 4650 th->th.th_first_place = first_place; 4651 th->th.th_last_place = last_place; 4652 th->th.th_new_place = masters_place; 4653 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4654 team->t.t_display_affinity != 1) { 4655 team->t.t_display_affinity = 1; 4656 } 4657 4658 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4659 "partition = [%d,%d]\n", 4660 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4661 f, masters_place, first_place, last_place)); 4662 } 4663 } break; 4664 4665 case proc_bind_close: { 4666 int f; 4667 int n_th = team->t.t_nproc; 4668 int n_places; 4669 if (first_place <= last_place) { 4670 n_places = last_place - first_place + 1; 4671 } else { 4672 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4673 } 4674 if (n_th <= n_places) { 4675 int place = masters_place; 4676 for (f = 1; f < n_th; f++) { 4677 kmp_info_t *th = team->t.t_threads[f]; 4678 KMP_DEBUG_ASSERT(th != NULL); 4679 4680 if (place == last_place) { 4681 place = first_place; 4682 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4683 place = 0; 4684 } else { 4685 place++; 4686 } 4687 th->th.th_first_place = first_place; 4688 th->th.th_last_place = last_place; 4689 th->th.th_new_place = place; 4690 if (__kmp_display_affinity && place != th->th.th_current_place && 4691 team->t.t_display_affinity != 1) { 4692 team->t.t_display_affinity = 1; 4693 } 4694 4695 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4696 "partition = [%d,%d]\n", 4697 __kmp_gtid_from_thread(team->t.t_threads[f]), 4698 team->t.t_id, f, place, first_place, last_place)); 4699 } 4700 } else { 4701 int S, rem, gap, s_count; 4702 S = n_th / n_places; 4703 s_count = 0; 4704 rem = n_th - (S * n_places); 4705 gap = rem > 0 ? 
n_places / rem : n_places; 4706 int place = masters_place; 4707 int gap_ct = gap; 4708 for (f = 0; f < n_th; f++) { 4709 kmp_info_t *th = team->t.t_threads[f]; 4710 KMP_DEBUG_ASSERT(th != NULL); 4711 4712 th->th.th_first_place = first_place; 4713 th->th.th_last_place = last_place; 4714 th->th.th_new_place = place; 4715 if (__kmp_display_affinity && place != th->th.th_current_place && 4716 team->t.t_display_affinity != 1) { 4717 team->t.t_display_affinity = 1; 4718 } 4719 s_count++; 4720 4721 if ((s_count == S) && rem && (gap_ct == gap)) { 4722 // do nothing, add an extra thread to place on next iteration 4723 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4724 // we added an extra thread to this place; move to next place 4725 if (place == last_place) { 4726 place = first_place; 4727 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4728 place = 0; 4729 } else { 4730 place++; 4731 } 4732 s_count = 0; 4733 gap_ct = 1; 4734 rem--; 4735 } else if (s_count == S) { // place full; don't add extra 4736 if (place == last_place) { 4737 place = first_place; 4738 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4739 place = 0; 4740 } else { 4741 place++; 4742 } 4743 gap_ct++; 4744 s_count = 0; 4745 } 4746 4747 KA_TRACE(100, 4748 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4749 "partition = [%d,%d]\n", 4750 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4751 th->th.th_new_place, first_place, last_place)); 4752 } 4753 KMP_DEBUG_ASSERT(place == masters_place); 4754 } 4755 } break; 4756 4757 case proc_bind_spread: { 4758 int f; 4759 int n_th = team->t.t_nproc; 4760 int n_places; 4761 int thidx; 4762 if (first_place <= last_place) { 4763 n_places = last_place - first_place + 1; 4764 } else { 4765 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4766 } 4767 if (n_th <= n_places) { 4768 int place = -1; 4769 4770 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4771 int S = n_places / n_th; 4772 int s_count, rem, gap, gap_ct; 4773 4774 place = masters_place; 4775 rem = n_places - n_th * S; 4776 gap = rem ? 
n_th / rem : 1; 4777 gap_ct = gap; 4778 thidx = n_th; 4779 if (update_master_only == 1) 4780 thidx = 1; 4781 for (f = 0; f < thidx; f++) { 4782 kmp_info_t *th = team->t.t_threads[f]; 4783 KMP_DEBUG_ASSERT(th != NULL); 4784 4785 th->th.th_first_place = place; 4786 th->th.th_new_place = place; 4787 if (__kmp_display_affinity && place != th->th.th_current_place && 4788 team->t.t_display_affinity != 1) { 4789 team->t.t_display_affinity = 1; 4790 } 4791 s_count = 1; 4792 while (s_count < S) { 4793 if (place == last_place) { 4794 place = first_place; 4795 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4796 place = 0; 4797 } else { 4798 place++; 4799 } 4800 s_count++; 4801 } 4802 if (rem && (gap_ct == gap)) { 4803 if (place == last_place) { 4804 place = first_place; 4805 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4806 place = 0; 4807 } else { 4808 place++; 4809 } 4810 rem--; 4811 gap_ct = 0; 4812 } 4813 th->th.th_last_place = place; 4814 gap_ct++; 4815 4816 if (place == last_place) { 4817 place = first_place; 4818 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4819 place = 0; 4820 } else { 4821 place++; 4822 } 4823 4824 KA_TRACE(100, 4825 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4826 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4827 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4828 f, th->th.th_new_place, th->th.th_first_place, 4829 th->th.th_last_place, __kmp_affinity_num_masks)); 4830 } 4831 } else { 4832 /* Having uniform space of available computation places I can create 4833 T partitions of round(P/T) size and put threads into the first 4834 place of each partition. */ 4835 double current = static_cast<double>(masters_place); 4836 double spacing = 4837 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4838 int first, last; 4839 kmp_info_t *th; 4840 4841 thidx = n_th + 1; 4842 if (update_master_only == 1) 4843 thidx = 1; 4844 for (f = 0; f < thidx; f++) { 4845 first = static_cast<int>(current); 4846 last = static_cast<int>(current + spacing) - 1; 4847 KMP_DEBUG_ASSERT(last >= first); 4848 if (first >= n_places) { 4849 if (masters_place) { 4850 first -= n_places; 4851 last -= n_places; 4852 if (first == (masters_place + 1)) { 4853 KMP_DEBUG_ASSERT(f == n_th); 4854 first--; 4855 } 4856 if (last == masters_place) { 4857 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4858 last--; 4859 } 4860 } else { 4861 KMP_DEBUG_ASSERT(f == n_th); 4862 first = 0; 4863 last = 0; 4864 } 4865 } 4866 if (last >= n_places) { 4867 last = (n_places - 1); 4868 } 4869 place = first; 4870 current += spacing; 4871 if (f < n_th) { 4872 KMP_DEBUG_ASSERT(0 <= first); 4873 KMP_DEBUG_ASSERT(n_places > first); 4874 KMP_DEBUG_ASSERT(0 <= last); 4875 KMP_DEBUG_ASSERT(n_places > last); 4876 KMP_DEBUG_ASSERT(last_place >= first_place); 4877 th = team->t.t_threads[f]; 4878 KMP_DEBUG_ASSERT(th); 4879 th->th.th_first_place = first; 4880 th->th.th_new_place = place; 4881 th->th.th_last_place = last; 4882 if (__kmp_display_affinity && place != th->th.th_current_place && 4883 team->t.t_display_affinity != 1) { 4884 team->t.t_display_affinity = 1; 4885 } 4886 KA_TRACE(100, 4887 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4888 "partition = [%d,%d], spacing = %.4f\n", 4889 __kmp_gtid_from_thread(team->t.t_threads[f]), 4890 team->t.t_id, f, th->th.th_new_place, 4891 th->th.th_first_place, th->th.th_last_place, spacing)); 4892 } 4893 } 4894 } 4895 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4896 } else { 4897 int S, rem, gap, 
s_count; 4898 S = n_th / n_places; 4899 s_count = 0; 4900 rem = n_th - (S * n_places); 4901 gap = rem > 0 ? n_places / rem : n_places; 4902 int place = masters_place; 4903 int gap_ct = gap; 4904 thidx = n_th; 4905 if (update_master_only == 1) 4906 thidx = 1; 4907 for (f = 0; f < thidx; f++) { 4908 kmp_info_t *th = team->t.t_threads[f]; 4909 KMP_DEBUG_ASSERT(th != NULL); 4910 4911 th->th.th_first_place = place; 4912 th->th.th_last_place = place; 4913 th->th.th_new_place = place; 4914 if (__kmp_display_affinity && place != th->th.th_current_place && 4915 team->t.t_display_affinity != 1) { 4916 team->t.t_display_affinity = 1; 4917 } 4918 s_count++; 4919 4920 if ((s_count == S) && rem && (gap_ct == gap)) { 4921 // do nothing, add an extra thread to place on next iteration 4922 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4923 // we added an extra thread to this place; move on to next place 4924 if (place == last_place) { 4925 place = first_place; 4926 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4927 place = 0; 4928 } else { 4929 place++; 4930 } 4931 s_count = 0; 4932 gap_ct = 1; 4933 rem--; 4934 } else if (s_count == S) { // place is full; don't add extra thread 4935 if (place == last_place) { 4936 place = first_place; 4937 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4938 place = 0; 4939 } else { 4940 place++; 4941 } 4942 gap_ct++; 4943 s_count = 0; 4944 } 4945 4946 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4947 "partition = [%d,%d]\n", 4948 __kmp_gtid_from_thread(team->t.t_threads[f]), 4949 team->t.t_id, f, th->th.th_new_place, 4950 th->th.th_first_place, th->th.th_last_place)); 4951 } 4952 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4953 } 4954 } break; 4955 4956 default: 4957 break; 4958 } 4959 4960 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4961 } 4962 4963 #endif // KMP_AFFINITY_SUPPORTED 4964 4965 /* allocate a new team data structure to use. take one off of the free pool if 4966 available */ 4967 kmp_team_t * 4968 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4969 #if OMPT_SUPPORT 4970 ompt_data_t ompt_parallel_data, 4971 #endif 4972 kmp_proc_bind_t new_proc_bind, 4973 kmp_internal_control_t *new_icvs, 4974 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4975 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4976 int f; 4977 kmp_team_t *team; 4978 int use_hot_team = !root->r.r_active; 4979 int level = 0; 4980 4981 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4982 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4983 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4984 KMP_MB(); 4985 4986 #if KMP_NESTED_HOT_TEAMS 4987 kmp_hot_team_ptr_t *hot_teams; 4988 if (master) { 4989 team = master->th.th_team; 4990 level = team->t.t_active_level; 4991 if (master->th.th_teams_microtask) { // in teams construct? 
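      // 'level' indexes into master->th.th_hot_teams below; the nested check
      // bumps it by one when this fork is either the inner fork of a teams
      // construct with #teams > 1 or a parallel nested inside such a teams
      // region, so those forks get their own hot-team slot.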
4992 if (master->th.th_teams_size.nteams > 1 && 4993 ( // #teams > 1 4994 team->t.t_pkfn == 4995 (microtask_t)__kmp_teams_master || // inner fork of the teams 4996 master->th.th_teams_level < 4997 team->t.t_level)) { // or nested parallel inside the teams 4998 ++level; // not increment if #teams==1, or for outer fork of the teams; 4999 // increment otherwise 5000 } 5001 } 5002 hot_teams = master->th.th_hot_teams; 5003 if (level < __kmp_hot_teams_max_level && hot_teams && 5004 hot_teams[level].hot_team) { 5005 // hot team has already been allocated for given level 5006 use_hot_team = 1; 5007 } else { 5008 use_hot_team = 0; 5009 } 5010 } else { 5011 // check we won't access uninitialized hot_teams, just in case 5012 KMP_DEBUG_ASSERT(new_nproc == 1); 5013 } 5014 #endif 5015 // Optimization to use a "hot" team 5016 if (use_hot_team && new_nproc > 1) { 5017 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5018 #if KMP_NESTED_HOT_TEAMS 5019 team = hot_teams[level].hot_team; 5020 #else 5021 team = root->r.r_hot_team; 5022 #endif 5023 #if KMP_DEBUG 5024 if (__kmp_tasking_mode != tskm_immediate_exec) { 5025 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5026 "task_team[1] = %p before reinit\n", 5027 team->t.t_task_team[0], team->t.t_task_team[1])); 5028 } 5029 #endif 5030 5031 // Has the number of threads changed? 5032 /* Let's assume the most common case is that the number of threads is 5033 unchanged, and put that case first. */ 5034 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5035 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5036 // This case can mean that omp_set_num_threads() was called and the hot 5037 // team size was already reduced, so we check the special flag 5038 if (team->t.t_size_changed == -1) { 5039 team->t.t_size_changed = 1; 5040 } else { 5041 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5042 } 5043 5044 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5045 kmp_r_sched_t new_sched = new_icvs->sched; 5046 // set primary thread's schedule as new run-time schedule 5047 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5048 5049 __kmp_reinitialize_team(team, new_icvs, 5050 root->r.r_uber_thread->th.th_ident); 5051 5052 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5053 team->t.t_threads[0], team)); 5054 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5055 5056 #if KMP_AFFINITY_SUPPORTED 5057 if ((team->t.t_size_changed == 0) && 5058 (team->t.t_proc_bind == new_proc_bind)) { 5059 if (new_proc_bind == proc_bind_spread) { 5060 __kmp_partition_places( 5061 team, 1); // add flag to update only master for spread 5062 } 5063 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5064 "proc_bind = %d, partition = [%d,%d]\n", 5065 team->t.t_id, new_proc_bind, team->t.t_first_place, 5066 team->t.t_last_place)); 5067 } else { 5068 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5069 __kmp_partition_places(team); 5070 } 5071 #else 5072 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5073 #endif /* KMP_AFFINITY_SUPPORTED */ 5074 } else if (team->t.t_nproc > new_nproc) { 5075 KA_TRACE(20, 5076 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5077 new_nproc)); 5078 5079 team->t.t_size_changed = 1; 5080 #if KMP_NESTED_HOT_TEAMS 5081 if (__kmp_hot_teams_mode == 0) { 5082 // AC: saved number of threads should correspond to team's value in this 5083 // mode, can be bigger in mode 1, when hot team has threads in reserve 5084 
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5085 hot_teams[level].hot_team_nth = new_nproc; 5086 #endif // KMP_NESTED_HOT_TEAMS 5087 /* release the extra threads we don't need any more */ 5088 for (f = new_nproc; f < team->t.t_nproc; f++) { 5089 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5090 if (__kmp_tasking_mode != tskm_immediate_exec) { 5091 // When decreasing team size, threads no longer in the team should 5092 // unref task team. 5093 team->t.t_threads[f]->th.th_task_team = NULL; 5094 } 5095 __kmp_free_thread(team->t.t_threads[f]); 5096 team->t.t_threads[f] = NULL; 5097 } 5098 #if KMP_NESTED_HOT_TEAMS 5099 } // (__kmp_hot_teams_mode == 0) 5100 else { 5101 // When keeping extra threads in team, switch threads to wait on own 5102 // b_go flag 5103 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5104 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5105 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5106 for (int b = 0; b < bs_last_barrier; ++b) { 5107 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5108 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5109 } 5110 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5111 } 5112 } 5113 } 5114 #endif // KMP_NESTED_HOT_TEAMS 5115 team->t.t_nproc = new_nproc; 5116 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5117 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5118 __kmp_reinitialize_team(team, new_icvs, 5119 root->r.r_uber_thread->th.th_ident); 5120 5121 // Update remaining threads 5122 for (f = 0; f < new_nproc; ++f) { 5123 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5124 } 5125 5126 // restore the current task state of the primary thread: should be the 5127 // implicit task 5128 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5129 team->t.t_threads[0], team)); 5130 5131 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5132 5133 #ifdef KMP_DEBUG 5134 for (f = 0; f < team->t.t_nproc; f++) { 5135 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5136 team->t.t_threads[f]->th.th_team_nproc == 5137 team->t.t_nproc); 5138 } 5139 #endif 5140 5141 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5142 #if KMP_AFFINITY_SUPPORTED 5143 __kmp_partition_places(team); 5144 #endif 5145 } else { // team->t.t_nproc < new_nproc 5146 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5147 kmp_affin_mask_t *old_mask; 5148 if (KMP_AFFINITY_CAPABLE()) { 5149 KMP_CPU_ALLOC(old_mask); 5150 } 5151 #endif 5152 5153 KA_TRACE(20, 5154 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5155 new_nproc)); 5156 5157 team->t.t_size_changed = 1; 5158 5159 #if KMP_NESTED_HOT_TEAMS 5160 int avail_threads = hot_teams[level].hot_team_nth; 5161 if (new_nproc < avail_threads) 5162 avail_threads = new_nproc; 5163 kmp_info_t **other_threads = team->t.t_threads; 5164 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5165 // Adjust barrier data of reserved threads (if any) of the team 5166 // Other data will be set in __kmp_initialize_info() below. 
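        // (Reserved threads exist only in hot-teams mode 1, where shrinking
        //  the team parks the extra workers instead of freeing them; a team
        //  that shrank from, say, 8 to 4 threads and now grows back simply
        //  re-engages workers 4..7 here instead of allocating new ones.)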
5167 int b; 5168 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5169 for (b = 0; b < bs_last_barrier; ++b) { 5170 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5171 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5172 #if USE_DEBUGGER 5173 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5174 #endif 5175 } 5176 } 5177 if (hot_teams[level].hot_team_nth >= new_nproc) { 5178 // we have all needed threads in reserve, no need to allocate any 5179 // this only possible in mode 1, cannot have reserved threads in mode 0 5180 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5181 team->t.t_nproc = new_nproc; // just get reserved threads involved 5182 } else { 5183 // we may have some threads in reserve, but not enough 5184 team->t.t_nproc = 5185 hot_teams[level] 5186 .hot_team_nth; // get reserved threads involved if any 5187 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5188 #endif // KMP_NESTED_HOT_TEAMS 5189 if (team->t.t_max_nproc < new_nproc) { 5190 /* reallocate larger arrays */ 5191 __kmp_reallocate_team_arrays(team, new_nproc); 5192 __kmp_reinitialize_team(team, new_icvs, NULL); 5193 } 5194 5195 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5196 /* Temporarily set full mask for primary thread before creation of 5197 workers. The reason is that workers inherit the affinity from the 5198 primary thread, so if a lot of workers are created on the single 5199 core quickly, they don't get a chance to set their own affinity for 5200 a long time. */ 5201 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5202 #endif 5203 5204 /* allocate new threads for the hot team */ 5205 for (f = team->t.t_nproc; f < new_nproc; f++) { 5206 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5207 KMP_DEBUG_ASSERT(new_worker); 5208 team->t.t_threads[f] = new_worker; 5209 5210 KA_TRACE(20, 5211 ("__kmp_allocate_team: team %d init T#%d arrived: " 5212 "join=%llu, plain=%llu\n", 5213 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5214 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5215 team->t.t_bar[bs_plain_barrier].b_arrived)); 5216 5217 { // Initialize barrier data for new threads. 5218 int b; 5219 kmp_balign_t *balign = new_worker->th.th_bar; 5220 for (b = 0; b < bs_last_barrier; ++b) { 5221 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5222 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5223 KMP_BARRIER_PARENT_FLAG); 5224 #if USE_DEBUGGER 5225 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5226 #endif 5227 } 5228 } 5229 } 5230 5231 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5232 if (KMP_AFFINITY_CAPABLE()) { 5233 /* Restore initial primary thread's affinity mask */ 5234 __kmp_set_system_affinity(old_mask, TRUE); 5235 KMP_CPU_FREE(old_mask); 5236 } 5237 #endif 5238 #if KMP_NESTED_HOT_TEAMS 5239 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5240 #endif // KMP_NESTED_HOT_TEAMS 5241 /* make sure everyone is syncronized */ 5242 int old_nproc = team->t.t_nproc; // save old value and use to update only 5243 // new threads below 5244 __kmp_initialize_team(team, new_nproc, new_icvs, 5245 root->r.r_uber_thread->th.th_ident); 5246 5247 /* reinitialize the threads */ 5248 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5249 for (f = 0; f < team->t.t_nproc; ++f) 5250 __kmp_initialize_info(team->t.t_threads[f], team, f, 5251 __kmp_gtid_from_tid(f, team)); 5252 5253 if (level) { // set th_task_state for new threads in nested hot team 5254 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5255 // only need to set the th_task_state for the new threads. th_task_state 5256 // for primary thread will not be accurate until after this in 5257 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5258 // get the correct value. 5259 for (f = old_nproc; f < team->t.t_nproc; ++f) 5260 team->t.t_threads[f]->th.th_task_state = 5261 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5262 } else { // set th_task_state for new threads in non-nested hot team 5263 // copy primary thread's state 5264 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5265 for (f = old_nproc; f < team->t.t_nproc; ++f) 5266 team->t.t_threads[f]->th.th_task_state = old_state; 5267 } 5268 5269 #ifdef KMP_DEBUG 5270 for (f = 0; f < team->t.t_nproc; ++f) { 5271 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5272 team->t.t_threads[f]->th.th_team_nproc == 5273 team->t.t_nproc); 5274 } 5275 #endif 5276 5277 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5278 #if KMP_AFFINITY_SUPPORTED 5279 __kmp_partition_places(team); 5280 #endif 5281 } // Check changes in number of threads 5282 5283 kmp_info_t *master = team->t.t_threads[0]; 5284 if (master->th.th_teams_microtask) { 5285 for (f = 1; f < new_nproc; ++f) { 5286 // propagate teams construct specific info to workers 5287 kmp_info_t *thr = team->t.t_threads[f]; 5288 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5289 thr->th.th_teams_level = master->th.th_teams_level; 5290 thr->th.th_teams_size = master->th.th_teams_size; 5291 } 5292 } 5293 #if KMP_NESTED_HOT_TEAMS 5294 if (level) { 5295 // Sync barrier state for nested hot teams, not needed for outermost hot 5296 // team. 5297 for (f = 1; f < new_nproc; ++f) { 5298 kmp_info_t *thr = team->t.t_threads[f]; 5299 int b; 5300 kmp_balign_t *balign = thr->th.th_bar; 5301 for (b = 0; b < bs_last_barrier; ++b) { 5302 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5303 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5304 #if USE_DEBUGGER 5305 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5306 #endif 5307 } 5308 } 5309 } 5310 #endif // KMP_NESTED_HOT_TEAMS 5311 5312 /* reallocate space for arguments if necessary */ 5313 __kmp_alloc_argv_entries(argc, team, TRUE); 5314 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5315 // The hot team re-uses the previous task team, 5316 // if untouched during the previous release->gather phase. 
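    // By this point the hot team has been brought to new_nproc threads by one
    // of the three paths above (size unchanged, shrunk, or grown), its ICVs
    // and, where affinity is supported, its place partition have been
    // refreshed, and its argv space resized for this parallel region.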
5317 5318 KF_TRACE(10, (" hot_team = %p\n", team)); 5319 5320 #if KMP_DEBUG 5321 if (__kmp_tasking_mode != tskm_immediate_exec) { 5322 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5323 "task_team[1] = %p after reinit\n", 5324 team->t.t_task_team[0], team->t.t_task_team[1])); 5325 } 5326 #endif 5327 5328 #if OMPT_SUPPORT 5329 __ompt_team_assign_id(team, ompt_parallel_data); 5330 #endif 5331 5332 KMP_MB(); 5333 5334 return team; 5335 } 5336 5337 /* next, let's try to take one from the team pool */ 5338 KMP_MB(); 5339 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5340 /* TODO: consider resizing undersized teams instead of reaping them, now 5341 that we have a resizing mechanism */ 5342 if (team->t.t_max_nproc >= max_nproc) { 5343 /* take this team from the team pool */ 5344 __kmp_team_pool = team->t.t_next_pool; 5345 5346 /* setup the team for fresh use */ 5347 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5348 5349 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5350 "task_team[1] %p to NULL\n", 5351 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5352 team->t.t_task_team[0] = NULL; 5353 team->t.t_task_team[1] = NULL; 5354 5355 /* reallocate space for arguments if necessary */ 5356 __kmp_alloc_argv_entries(argc, team, TRUE); 5357 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5358 5359 KA_TRACE( 5360 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5361 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5362 { // Initialize barrier data. 5363 int b; 5364 for (b = 0; b < bs_last_barrier; ++b) { 5365 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5366 #if USE_DEBUGGER 5367 team->t.t_bar[b].b_master_arrived = 0; 5368 team->t.t_bar[b].b_team_arrived = 0; 5369 #endif 5370 } 5371 } 5372 5373 team->t.t_proc_bind = new_proc_bind; 5374 5375 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5376 team->t.t_id)); 5377 5378 #if OMPT_SUPPORT 5379 __ompt_team_assign_id(team, ompt_parallel_data); 5380 #endif 5381 5382 KMP_MB(); 5383 5384 return team; 5385 } 5386 5387 /* reap team if it is too small, then loop back and check the next one */ 5388 // not sure if this is wise, but, will be redone during the hot-teams 5389 // rewrite. 5390 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5391 team = __kmp_reap_team(team); 5392 __kmp_team_pool = team; 5393 } 5394 5395 /* nothing available in the pool, no matter, make a new team! 
*/ 5396 KMP_MB(); 5397 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5398 5399 /* and set it up */ 5400 team->t.t_max_nproc = max_nproc; 5401 /* NOTE well, for some reason allocating one big buffer and dividing it up 5402 seems to really hurt performance a lot on the P4, so, let's not use this */ 5403 __kmp_allocate_team_arrays(team, max_nproc); 5404 5405 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5406 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5407 5408 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5409 "%p to NULL\n", 5410 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5411 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5412 // memory, no need to duplicate 5413 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5414 // memory, no need to duplicate 5415 5416 if (__kmp_storage_map) { 5417 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5418 } 5419 5420 /* allocate space for arguments */ 5421 __kmp_alloc_argv_entries(argc, team, FALSE); 5422 team->t.t_argc = argc; 5423 5424 KA_TRACE(20, 5425 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5426 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5427 { // Initialize barrier data. 5428 int b; 5429 for (b = 0; b < bs_last_barrier; ++b) { 5430 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5431 #if USE_DEBUGGER 5432 team->t.t_bar[b].b_master_arrived = 0; 5433 team->t.t_bar[b].b_team_arrived = 0; 5434 #endif 5435 } 5436 } 5437 5438 team->t.t_proc_bind = new_proc_bind; 5439 5440 #if OMPT_SUPPORT 5441 __ompt_team_assign_id(team, ompt_parallel_data); 5442 team->t.ompt_serialized_team_info = NULL; 5443 #endif 5444 5445 KMP_MB(); 5446 5447 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5448 team->t.t_id)); 5449 5450 return team; 5451 } 5452 5453 /* TODO implement hot-teams at all levels */ 5454 /* TODO implement lazy thread release on demand (disband request) */ 5455 5456 /* free the team. return it to the team pool. release all the threads 5457 * associated with it */ 5458 void __kmp_free_team(kmp_root_t *root, 5459 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5460 int f; 5461 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5462 team->t.t_id)); 5463 5464 /* verify state */ 5465 KMP_DEBUG_ASSERT(root); 5466 KMP_DEBUG_ASSERT(team); 5467 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5468 KMP_DEBUG_ASSERT(team->t.t_threads); 5469 5470 int use_hot_team = team == root->r.r_hot_team; 5471 #if KMP_NESTED_HOT_TEAMS 5472 int level; 5473 kmp_hot_team_ptr_t *hot_teams; 5474 if (master) { 5475 level = team->t.t_active_level - 1; 5476 if (master->th.th_teams_microtask) { // in teams construct? 
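      // Adjust the nesting level for the teams construct, mirroring the logic
      // in __kmp_allocate_team, so the team is matched against the
      // hot_teams[] entry it was taken from (see the KMP_DEBUG_ASSERT below).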
5477 if (master->th.th_teams_size.nteams > 1) { 5478 ++level; // level was not increased in teams construct for 5479 // team_of_masters 5480 } 5481 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5482 master->th.th_teams_level == team->t.t_level) { 5483 ++level; // level was not increased in teams construct for 5484 // team_of_workers before the parallel 5485 } // team->t.t_level will be increased inside parallel 5486 } 5487 hot_teams = master->th.th_hot_teams; 5488 if (level < __kmp_hot_teams_max_level) { 5489 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5490 use_hot_team = 1; 5491 } 5492 } 5493 #endif // KMP_NESTED_HOT_TEAMS 5494 5495 /* team is done working */ 5496 TCW_SYNC_PTR(team->t.t_pkfn, 5497 NULL); // Important for Debugging Support Library. 5498 #if KMP_OS_WINDOWS 5499 team->t.t_copyin_counter = 0; // init counter for possible reuse 5500 #endif 5501 // Do not reset pointer to parent team to NULL for hot teams. 5502 5503 /* if we are non-hot team, release our threads */ 5504 if (!use_hot_team) { 5505 if (__kmp_tasking_mode != tskm_immediate_exec) { 5506 // Wait for threads to reach reapable state 5507 for (f = 1; f < team->t.t_nproc; ++f) { 5508 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5509 kmp_info_t *th = team->t.t_threads[f]; 5510 volatile kmp_uint32 *state = &th->th.th_reap_state; 5511 while (*state != KMP_SAFE_TO_REAP) { 5512 #if KMP_OS_WINDOWS 5513 // On Windows a thread can be killed at any time, check this 5514 DWORD ecode; 5515 if (!__kmp_is_thread_alive(th, &ecode)) { 5516 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5517 break; 5518 } 5519 #endif 5520 // first check if thread is sleeping 5521 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5522 if (fl.is_sleeping()) 5523 fl.resume(__kmp_gtid_from_thread(th)); 5524 KMP_CPU_PAUSE(); 5525 } 5526 } 5527 5528 // Delete task teams 5529 int tt_idx; 5530 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5531 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5532 if (task_team != NULL) { 5533 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5534 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5535 team->t.t_threads[f]->th.th_task_team = NULL; 5536 } 5537 KA_TRACE( 5538 20, 5539 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5540 __kmp_get_gtid(), task_team, team->t.t_id)); 5541 #if KMP_NESTED_HOT_TEAMS 5542 __kmp_free_task_team(master, task_team); 5543 #endif 5544 team->t.t_task_team[tt_idx] = NULL; 5545 } 5546 } 5547 } 5548 5549 // Reset pointer to parent team only for non-hot teams. 
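    // With any task teams released, detach this team from its parent, return
    // its workers to the thread pool, and then link the team itself onto
    // __kmp_team_pool for reuse.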
5550 team->t.t_parent = NULL; 5551 team->t.t_level = 0; 5552 team->t.t_active_level = 0; 5553 5554 /* free the worker threads */ 5555 for (f = 1; f < team->t.t_nproc; ++f) { 5556 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5557 __kmp_free_thread(team->t.t_threads[f]); 5558 team->t.t_threads[f] = NULL; 5559 } 5560 5561 /* put the team back in the team pool */ 5562 /* TODO limit size of team pool, call reap_team if pool too large */ 5563 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5564 __kmp_team_pool = (volatile kmp_team_t *)team; 5565 } else { // Check if team was created for primary threads in teams construct 5566 // See if first worker is a CG root 5567 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5568 team->t.t_threads[1]->th.th_cg_roots); 5569 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5570 // Clean up the CG root nodes on workers so that this team can be re-used 5571 for (f = 1; f < team->t.t_nproc; ++f) { 5572 kmp_info_t *thr = team->t.t_threads[f]; 5573 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5574 thr->th.th_cg_roots->cg_root == thr); 5575 // Pop current CG root off list 5576 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5577 thr->th.th_cg_roots = tmp->up; 5578 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5579 " up to node %p. cg_nthreads was %d\n", 5580 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5581 int i = tmp->cg_nthreads--; 5582 if (i == 1) { 5583 __kmp_free(tmp); // free CG if we are the last thread in it 5584 } 5585 // Restore current task's thread_limit from CG root 5586 if (thr->th.th_cg_roots) 5587 thr->th.th_current_task->td_icvs.thread_limit = 5588 thr->th.th_cg_roots->cg_thread_limit; 5589 } 5590 } 5591 } 5592 5593 KMP_MB(); 5594 } 5595 5596 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5597 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5598 kmp_team_t *next_pool = team->t.t_next_pool; 5599 5600 KMP_DEBUG_ASSERT(team); 5601 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5602 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5603 KMP_DEBUG_ASSERT(team->t.t_threads); 5604 KMP_DEBUG_ASSERT(team->t.t_argv); 5605 5606 /* TODO clean the threads that are a part of this? */ 5607 5608 /* free stuff */ 5609 __kmp_free_team_arrays(team); 5610 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5611 __kmp_free((void *)team->t.t_argv); 5612 __kmp_free(team); 5613 5614 KMP_MB(); 5615 return next_pool; 5616 } 5617 5618 // Free the thread. Don't reap it, just place it on the pool of available 5619 // threads. 5620 // 5621 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5622 // binding for the affinity mechanism to be useful. 5623 // 5624 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5625 // However, we want to avoid a potential performance problem by always 5626 // scanning through the list to find the correct point at which to insert 5627 // the thread (potential N**2 behavior). To do this we keep track of the 5628 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5629 // With single-level parallelism, threads will always be added to the tail 5630 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5631 // parallelism, all bets are off and we may need to scan through the entire 5632 // free list. 5633 // 5634 // This change also has a potentially large performance benefit, for some 5635 // applications. 
Previously, as threads were freed from the hot team, they 5636 // would be placed back on the free list in inverse order. If the hot team 5637 // grew back to it's original size, then the freed thread would be placed 5638 // back on the hot team in reverse order. This could cause bad cache 5639 // locality problems on programs where the size of the hot team regularly 5640 // grew and shrunk. 5641 // 5642 // Now, for single-level parallelism, the OMP tid is always == gtid. 5643 void __kmp_free_thread(kmp_info_t *this_th) { 5644 int gtid; 5645 kmp_info_t **scan; 5646 5647 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5648 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5649 5650 KMP_DEBUG_ASSERT(this_th); 5651 5652 // When moving thread to pool, switch thread to wait on own b_go flag, and 5653 // uninitialized (NULL team). 5654 int b; 5655 kmp_balign_t *balign = this_th->th.th_bar; 5656 for (b = 0; b < bs_last_barrier; ++b) { 5657 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5658 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5659 balign[b].bb.team = NULL; 5660 balign[b].bb.leaf_kids = 0; 5661 } 5662 this_th->th.th_task_state = 0; 5663 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5664 5665 /* put thread back on the free pool */ 5666 TCW_PTR(this_th->th.th_team, NULL); 5667 TCW_PTR(this_th->th.th_root, NULL); 5668 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5669 5670 while (this_th->th.th_cg_roots) { 5671 this_th->th.th_cg_roots->cg_nthreads--; 5672 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5673 " %p of thread %p to %d\n", 5674 this_th, this_th->th.th_cg_roots, 5675 this_th->th.th_cg_roots->cg_root, 5676 this_th->th.th_cg_roots->cg_nthreads)); 5677 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5678 if (tmp->cg_root == this_th) { // Thread is a cg_root 5679 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5680 KA_TRACE( 5681 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5682 this_th->th.th_cg_roots = tmp->up; 5683 __kmp_free(tmp); 5684 } else { // Worker thread 5685 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5686 __kmp_free(tmp); 5687 } 5688 this_th->th.th_cg_roots = NULL; 5689 break; 5690 } 5691 } 5692 5693 /* If the implicit task assigned to this thread can be used by other threads 5694 * -> multiple threads can share the data and try to free the task at 5695 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5696 * with higher probability when hot team is disabled but can occurs even when 5697 * the hot team is enabled */ 5698 __kmp_free_implicit_task(this_th); 5699 this_th->th.th_current_task = NULL; 5700 5701 // If the __kmp_thread_pool_insert_pt is already past the new insert 5702 // point, then we need to re-scan the entire list. 5703 gtid = this_th->th.th_info.ds.ds_gtid; 5704 if (__kmp_thread_pool_insert_pt != NULL) { 5705 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5706 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5707 __kmp_thread_pool_insert_pt = NULL; 5708 } 5709 } 5710 5711 // Scan down the list to find the place to insert the thread. 5712 // scan is the address of a link in the list, possibly the address of 5713 // __kmp_thread_pool itself. 5714 // 5715 // In the absence of nested parallelism, the for loop will have 0 iterations. 
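  // Example (hypothetical gtids): if the pool holds threads with gtids
  // {2, 3, 5} and __kmp_thread_pool_insert_pt points at gtid 3, then freeing
  // gtid 4 starts the scan at 3's successor and links 4 in ahead of 5;
  // freeing gtid 1 instead resets the insert point above, so the scan
  // restarts from the head of __kmp_thread_pool.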
5716 if (__kmp_thread_pool_insert_pt != NULL) { 5717 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5718 } else { 5719 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5720 } 5721 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5722 scan = &((*scan)->th.th_next_pool)) 5723 ; 5724 5725 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5726 // to its address. 5727 TCW_PTR(this_th->th.th_next_pool, *scan); 5728 __kmp_thread_pool_insert_pt = *scan = this_th; 5729 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5730 (this_th->th.th_info.ds.ds_gtid < 5731 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5732 TCW_4(this_th->th.th_in_pool, TRUE); 5733 __kmp_suspend_initialize_thread(this_th); 5734 __kmp_lock_suspend_mx(this_th); 5735 if (this_th->th.th_active == TRUE) { 5736 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5737 this_th->th.th_active_in_pool = TRUE; 5738 } 5739 #if KMP_DEBUG 5740 else { 5741 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5742 } 5743 #endif 5744 __kmp_unlock_suspend_mx(this_th); 5745 5746 TCW_4(__kmp_nth, __kmp_nth - 1); 5747 5748 #ifdef KMP_ADJUST_BLOCKTIME 5749 /* Adjust blocktime back to user setting or default if necessary */ 5750 /* Middle initialization might never have occurred */ 5751 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5752 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5753 if (__kmp_nth <= __kmp_avail_proc) { 5754 __kmp_zero_bt = FALSE; 5755 } 5756 } 5757 #endif /* KMP_ADJUST_BLOCKTIME */ 5758 5759 KMP_MB(); 5760 } 5761 5762 /* ------------------------------------------------------------------------ */ 5763 5764 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5765 #if OMP_PROFILING_SUPPORT 5766 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5767 // TODO: add a configuration option for time granularity 5768 if (ProfileTraceFile) 5769 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5770 #endif 5771 5772 int gtid = this_thr->th.th_info.ds.ds_gtid; 5773 /* void *stack_data;*/ 5774 kmp_team_t **volatile pteam; 5775 5776 KMP_MB(); 5777 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5778 5779 if (__kmp_env_consistency_check) { 5780 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5781 } 5782 5783 #if OMPD_SUPPORT 5784 if (ompd_state & OMPD_ENABLE_BP) 5785 ompd_bp_thread_begin(); 5786 #endif 5787 5788 #if OMPT_SUPPORT 5789 ompt_data_t *thread_data = nullptr; 5790 if (ompt_enabled.enabled) { 5791 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5792 *thread_data = ompt_data_none; 5793 5794 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5795 this_thr->th.ompt_thread_info.wait_id = 0; 5796 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5797 this_thr->th.ompt_thread_info.parallel_flags = 0; 5798 if (ompt_enabled.ompt_callback_thread_begin) { 5799 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5800 ompt_thread_worker, thread_data); 5801 } 5802 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5803 } 5804 #endif 5805 5806 /* This is the place where threads wait for work */ 5807 while (!TCR_4(__kmp_global.g.g_done)) { 5808 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5809 KMP_MB(); 5810 5811 /* wait for work to do */ 5812 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5813 5814 /* No tid yet since not part of a team */ 5815 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5816 5817 #if OMPT_SUPPORT 5818 if (ompt_enabled.enabled) { 5819 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5820 } 5821 #endif 5822 5823 pteam = &this_thr->th.th_team; 5824 5825 /* have we been allocated? */ 5826 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5827 /* we were just woken up, so run our new task */ 5828 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5829 int rc; 5830 KA_TRACE(20, 5831 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5832 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5833 (*pteam)->t.t_pkfn)); 5834 5835 updateHWFPControl(*pteam); 5836 5837 #if OMPT_SUPPORT 5838 if (ompt_enabled.enabled) { 5839 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5840 } 5841 #endif 5842 5843 rc = (*pteam)->t.t_invoke(gtid); 5844 KMP_ASSERT(rc); 5845 5846 KMP_MB(); 5847 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5848 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5849 (*pteam)->t.t_pkfn)); 5850 } 5851 #if OMPT_SUPPORT 5852 if (ompt_enabled.enabled) { 5853 /* no frame set while outside task */ 5854 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5855 5856 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5857 } 5858 #endif 5859 /* join barrier after parallel region */ 5860 __kmp_join_barrier(gtid); 5861 } 5862 } 5863 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5864 5865 #if OMPD_SUPPORT 5866 if (ompd_state & OMPD_ENABLE_BP) 5867 ompd_bp_thread_end(); 5868 #endif 5869 5870 #if OMPT_SUPPORT 5871 if (ompt_enabled.ompt_callback_thread_end) { 5872 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5873 } 5874 #endif 5875 5876 this_thr->th.th_task_team = NULL; 5877 /* run the destructors for the threadprivate data for this thread */ 5878 __kmp_common_destroy_gtid(gtid); 5879 5880 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5881 KMP_MB(); 5882 5883 #if OMP_PROFILING_SUPPORT 5884 llvm::timeTraceProfilerFinishThread(); 5885 #endif 5886 return this_thr; 5887 } 5888 5889 /* ------------------------------------------------------------------------ */ 5890 5891 void __kmp_internal_end_dest(void *specific_gtid) { 5892 // Make sure no significant bits are lost 5893 int gtid; 5894 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5895 5896 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 5897 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5898 * this is because 0 is reserved for the nothing-stored case */ 5899 5900 __kmp_internal_end_thread(gtid); 5901 } 5902 5903 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5904 5905 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5906 __kmp_internal_end_atexit(); 5907 } 5908 5909 #endif 5910 5911 /* [Windows] josh: when the atexit handler is called, there may still be more 5912 than one thread alive */ 5913 void __kmp_internal_end_atexit(void) { 5914 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5915 /* [Windows] 5916 josh: ideally, we want to completely shutdown the library in this atexit 5917 handler, but stat code that depends on thread specific data for gtid fails 5918 because that data becomes unavailable at some point during the shutdown, so 5919 we call __kmp_internal_end_thread instead. We should eventually remove the 5920 dependency on __kmp_get_specific_gtid in the stat code and use 5921 __kmp_internal_end_library to cleanly shutdown the library. 5922 5923 // TODO: Can some of this comment about GVS be removed? 5924 I suspect that the offending stat code is executed when the calling thread 5925 tries to clean up a dead root thread's data structures, resulting in GVS 5926 code trying to close the GVS structures for that thread, but since the stat 5927 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5928 the calling thread is cleaning up itself instead of another thread, it get 5929 confused. This happens because allowing a thread to unregister and cleanup 5930 another thread is a recent modification for addressing an issue. 5931 Based on the current design (20050722), a thread may end up 5932 trying to unregister another thread only if thread death does not trigger 5933 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5934 thread specific data destructor function to detect thread death. For 5935 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5936 is nothing. Thus, the workaround is applicable only for Windows static 5937 stat library. */ 5938 __kmp_internal_end_library(-1); 5939 #if KMP_OS_WINDOWS 5940 __kmp_close_console(); 5941 #endif 5942 } 5943 5944 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5945 // It is assumed __kmp_forkjoin_lock is acquired. 5946 5947 int gtid; 5948 5949 KMP_DEBUG_ASSERT(thread != NULL); 5950 5951 gtid = thread->th.th_info.ds.ds_gtid; 5952 5953 if (!is_root) { 5954 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5955 /* Assume the threads are at the fork barrier here */ 5956 KA_TRACE( 5957 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5958 gtid)); 5959 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5960 * (GEH) */ 5961 ANNOTATE_HAPPENS_BEFORE(thread); 5962 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5963 thread); 5964 __kmp_release_64(&flag); 5965 } 5966 5967 // Terminate OS thread. 5968 __kmp_reap_worker(thread); 5969 5970 // The thread was killed asynchronously. If it was actively 5971 // spinning in the thread pool, decrement the global count. 5972 // 5973 // There is a small timing hole here - if the worker thread was just waking 5974 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5975 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5976 // the global counter might not get updated. 
5977 // 5978 // Currently, this can only happen as the library is unloaded, 5979 // so there are no harmful side effects. 5980 if (thread->th.th_active_in_pool) { 5981 thread->th.th_active_in_pool = FALSE; 5982 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5983 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5984 } 5985 } 5986 5987 __kmp_free_implicit_task(thread); 5988 5989 // Free the fast memory for tasking 5990 #if USE_FAST_MEMORY 5991 __kmp_free_fast_memory(thread); 5992 #endif /* USE_FAST_MEMORY */ 5993 5994 __kmp_suspend_uninitialize_thread(thread); 5995 5996 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5997 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5998 5999 --__kmp_all_nth; 6000 // __kmp_nth was decremented when thread is added to the pool. 6001 6002 #ifdef KMP_ADJUST_BLOCKTIME 6003 /* Adjust blocktime back to user setting or default if necessary */ 6004 /* Middle initialization might never have occurred */ 6005 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6006 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6007 if (__kmp_nth <= __kmp_avail_proc) { 6008 __kmp_zero_bt = FALSE; 6009 } 6010 } 6011 #endif /* KMP_ADJUST_BLOCKTIME */ 6012 6013 /* free the memory being used */ 6014 if (__kmp_env_consistency_check) { 6015 if (thread->th.th_cons) { 6016 __kmp_free_cons_stack(thread->th.th_cons); 6017 thread->th.th_cons = NULL; 6018 } 6019 } 6020 6021 if (thread->th.th_pri_common != NULL) { 6022 __kmp_free(thread->th.th_pri_common); 6023 thread->th.th_pri_common = NULL; 6024 } 6025 6026 if (thread->th.th_task_state_memo_stack != NULL) { 6027 __kmp_free(thread->th.th_task_state_memo_stack); 6028 thread->th.th_task_state_memo_stack = NULL; 6029 } 6030 6031 #if KMP_USE_BGET 6032 if (thread->th.th_local.bget_data != NULL) { 6033 __kmp_finalize_bget(thread); 6034 } 6035 #endif 6036 6037 #if KMP_AFFINITY_SUPPORTED 6038 if (thread->th.th_affin_mask != NULL) { 6039 KMP_CPU_FREE(thread->th.th_affin_mask); 6040 thread->th.th_affin_mask = NULL; 6041 } 6042 #endif /* KMP_AFFINITY_SUPPORTED */ 6043 6044 #if KMP_USE_HIER_SCHED 6045 if (thread->th.th_hier_bar_data != NULL) { 6046 __kmp_free(thread->th.th_hier_bar_data); 6047 thread->th.th_hier_bar_data = NULL; 6048 } 6049 #endif 6050 6051 __kmp_reap_team(thread->th.th_serial_team); 6052 thread->th.th_serial_team = NULL; 6053 __kmp_free(thread); 6054 6055 KMP_MB(); 6056 6057 } // __kmp_reap_thread 6058 6059 static void __kmp_internal_end(void) { 6060 int i; 6061 6062 /* First, unregister the library */ 6063 __kmp_unregister_library(); 6064 6065 #if KMP_OS_WINDOWS 6066 /* In Win static library, we can't tell when a root actually dies, so we 6067 reclaim the data structures for any root threads that have died but not 6068 unregistered themselves, in order to shut down cleanly. 6069 In Win dynamic library we also can't tell when a thread dies. */ 6070 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6071 // dead roots 6072 #endif 6073 6074 for (i = 0; i < __kmp_threads_capacity; i++) 6075 if (__kmp_root[i]) 6076 if (__kmp_root[i]->r.r_active) 6077 break; 6078 KMP_MB(); /* Flush all pending memory write invalidates. */ 6079 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6080 6081 if (i < __kmp_threads_capacity) { 6082 #if KMP_USE_MONITOR 6083 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6084 KMP_MB(); /* Flush all pending memory write invalidates. */ 6085 6086 // Need to check that monitor was initialized before reaping it. 
If we are 6087 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6088 // __kmp_monitor will appear to contain valid data, but it is only valid in 6089 // the parent process, not the child. 6090 // New behavior (201008): instead of keying off of the flag 6091 // __kmp_init_parallel, the monitor thread creation is keyed off 6092 // of the new flag __kmp_init_monitor. 6093 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6094 if (TCR_4(__kmp_init_monitor)) { 6095 __kmp_reap_monitor(&__kmp_monitor); 6096 TCW_4(__kmp_init_monitor, 0); 6097 } 6098 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6099 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6100 #endif // KMP_USE_MONITOR 6101 } else { 6102 /* TODO move this to cleanup code */ 6103 #ifdef KMP_DEBUG 6104 /* make sure that everything has properly ended */ 6105 for (i = 0; i < __kmp_threads_capacity; i++) { 6106 if (__kmp_root[i]) { 6107 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6108 // there can be uber threads alive here 6109 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6110 } 6111 } 6112 #endif 6113 6114 KMP_MB(); 6115 6116 // Reap the worker threads. 6117 // This is valid for now, but be careful if threads are reaped sooner. 6118 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6119 // Get the next thread from the pool. 6120 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6121 __kmp_thread_pool = thread->th.th_next_pool; 6122 // Reap it. 6123 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6124 thread->th.th_next_pool = NULL; 6125 thread->th.th_in_pool = FALSE; 6126 __kmp_reap_thread(thread, 0); 6127 } 6128 __kmp_thread_pool_insert_pt = NULL; 6129 6130 // Reap teams. 6131 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6132 // Get the next team from the pool. 6133 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6134 __kmp_team_pool = team->t.t_next_pool; 6135 // Reap it. 6136 team->t.t_next_pool = NULL; 6137 __kmp_reap_team(team); 6138 } 6139 6140 __kmp_reap_task_teams(); 6141 6142 #if KMP_OS_UNIX 6143 // Threads that are not reaped should not access any resources since they 6144 // are going to be deallocated soon, so the shutdown sequence should wait 6145 // until all threads either exit the final spin-waiting loop or begin 6146 // sleeping after the given blocktime. 6147 for (i = 0; i < __kmp_threads_capacity; i++) { 6148 kmp_info_t *thr = __kmp_threads[i]; 6149 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6150 KMP_CPU_PAUSE(); 6151 } 6152 #endif 6153 6154 for (i = 0; i < __kmp_threads_capacity; ++i) { 6155 // TBD: Add some checking... 6156 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6157 } 6158 6159 /* Make sure all threadprivate destructors get run by joining with all 6160 worker threads before resetting this flag */ 6161 TCW_SYNC_4(__kmp_init_common, FALSE); 6162 6163 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6164 KMP_MB(); 6165 6166 #if KMP_USE_MONITOR 6167 // See note above: One of the possible fixes for CQ138434 / CQ140126 6168 // 6169 // FIXME: push both code fragments down and CSE them? 6170 // push them into __kmp_cleanup() ? 
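    // As in the branch above, reap the monitor only if __kmp_init_monitor
    // shows it was actually created, and do so under __kmp_monitor_lock.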
6171 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6172 if (TCR_4(__kmp_init_monitor)) { 6173 __kmp_reap_monitor(&__kmp_monitor); 6174 TCW_4(__kmp_init_monitor, 0); 6175 } 6176 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6177 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6178 #endif 6179 } /* else !__kmp_global.t_active */ 6180 TCW_4(__kmp_init_gtid, FALSE); 6181 KMP_MB(); /* Flush all pending memory write invalidates. */ 6182 6183 __kmp_cleanup(); 6184 #if OMPT_SUPPORT 6185 ompt_fini(); 6186 #endif 6187 } 6188 6189 void __kmp_internal_end_library(int gtid_req) { 6190 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6191 /* this shouldn't be a race condition because __kmp_internal_end() is the 6192 only place to clear __kmp_serial_init */ 6193 /* we'll check this later too, after we get the lock */ 6194 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6195 // redundant, because the next check will work in any case. 6196 if (__kmp_global.g.g_abort) { 6197 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6198 /* TODO abort? */ 6199 return; 6200 } 6201 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6202 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6203 return; 6204 } 6205 6206 KMP_MB(); /* Flush all pending memory write invalidates. */ 6207 /* find out who we are and what we should do */ 6208 { 6209 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6210 KA_TRACE( 6211 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6212 if (gtid == KMP_GTID_SHUTDOWN) { 6213 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6214 "already shutdown\n")); 6215 return; 6216 } else if (gtid == KMP_GTID_MONITOR) { 6217 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6218 "registered, or system shutdown\n")); 6219 return; 6220 } else if (gtid == KMP_GTID_DNE) { 6221 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6222 "shutdown\n")); 6223 /* we don't know who we are, but we may still shutdown the library */ 6224 } else if (KMP_UBER_GTID(gtid)) { 6225 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6226 if (__kmp_root[gtid]->r.r_active) { 6227 __kmp_global.g.g_abort = -1; 6228 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6229 __kmp_unregister_library(); 6230 KA_TRACE(10, 6231 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6232 gtid)); 6233 return; 6234 } else { 6235 KA_TRACE( 6236 10, 6237 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6238 __kmp_unregister_root_current_thread(gtid); 6239 } 6240 } else { 6241 /* worker threads may call this function through the atexit handler, if they 6242 * call exit() */ 6243 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6244 TODO: do a thorough shutdown instead */ 6245 #ifdef DUMP_DEBUG_ON_EXIT 6246 if (__kmp_debug_buf) 6247 __kmp_dump_debug_buffer(); 6248 #endif 6249 // added unregister library call here when we switch to shm linux 6250 // if we don't, it will leave lots of files in /dev/shm 6251 // cleanup shared memory file before exiting. 
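      // Only the registration is dropped here; the thorough shutdown noted in
      // the TODO above is deliberately skipped for worker threads.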
6252 __kmp_unregister_library(); 6253 return; 6254 } 6255 } 6256 /* synchronize the termination process */ 6257 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6258 6259 /* have we already finished */ 6260 if (__kmp_global.g.g_abort) { 6261 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6262 /* TODO abort? */ 6263 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6264 return; 6265 } 6266 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6267 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6268 return; 6269 } 6270 6271 /* We need this lock to enforce mutex between this reading of 6272 __kmp_threads_capacity and the writing by __kmp_register_root. 6273 Alternatively, we can use a counter of roots that is atomically updated by 6274 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6275 __kmp_internal_end_*. */ 6276 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6277 6278 /* now we can safely conduct the actual termination */ 6279 __kmp_internal_end(); 6280 6281 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6282 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6283 6284 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6285 6286 #ifdef DUMP_DEBUG_ON_EXIT 6287 if (__kmp_debug_buf) 6288 __kmp_dump_debug_buffer(); 6289 #endif 6290 6291 #if KMP_OS_WINDOWS 6292 __kmp_close_console(); 6293 #endif 6294 6295 __kmp_fini_allocator(); 6296 6297 } // __kmp_internal_end_library 6298 6299 void __kmp_internal_end_thread(int gtid_req) { 6300 int i; 6301 6302 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6303 /* this shouldn't be a race condition because __kmp_internal_end() is the 6304 * only place to clear __kmp_serial_init */ 6305 /* we'll check this later too, after we get the lock */ 6306 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6307 // redundant, because the next check will work in any case. 6308 if (__kmp_global.g.g_abort) { 6309 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6310 /* TODO abort? */ 6311 return; 6312 } 6313 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6314 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6315 return; 6316 } 6317 6318 // If hidden helper team has been initialized, we need to deinit it 6319 if (TCR_4(__kmp_init_hidden_helper)) { 6320 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6321 // First release the main thread to let it continue its work 6322 __kmp_hidden_helper_main_thread_release(); 6323 // Wait until the hidden helper team has been destroyed 6324 __kmp_hidden_helper_threads_deinitz_wait(); 6325 } 6326 6327 KMP_MB(); /* Flush all pending memory write invalidates. */ 6328 6329 /* find out who we are and what we should do */ 6330 { 6331 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6332 KA_TRACE(10, 6333 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6334 if (gtid == KMP_GTID_SHUTDOWN) { 6335 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6336 "already shutdown\n")); 6337 return; 6338 } else if (gtid == KMP_GTID_MONITOR) { 6339 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6340 "registered, or system shutdown\n")); 6341 return; 6342 } else if (gtid == KMP_GTID_DNE) { 6343 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6344 "shutdown\n")); 6345 return; 6346 /* we don't know who we are */ 6347 } else if (KMP_UBER_GTID(gtid)) { 6348 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6349 if (__kmp_root[gtid]->r.r_active) { 6350 __kmp_global.g.g_abort = -1; 6351 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6352 KA_TRACE(10, 6353 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6354 gtid)); 6355 return; 6356 } else { 6357 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6358 gtid)); 6359 __kmp_unregister_root_current_thread(gtid); 6360 } 6361 } else { 6362 /* just a worker thread, let's leave */ 6363 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6364 6365 if (gtid >= 0) { 6366 __kmp_threads[gtid]->th.th_task_team = NULL; 6367 } 6368 6369 KA_TRACE(10, 6370 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6371 gtid)); 6372 return; 6373 } 6374 } 6375 #if KMP_DYNAMIC_LIB 6376 if (__kmp_pause_status != kmp_hard_paused) 6377 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6378 // because we will better shutdown later in the library destructor. 6379 { 6380 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6381 return; 6382 } 6383 #endif 6384 /* synchronize the termination process */ 6385 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6386 6387 /* have we already finished */ 6388 if (__kmp_global.g.g_abort) { 6389 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6390 /* TODO abort? */ 6391 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6392 return; 6393 } 6394 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6395 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6396 return; 6397 } 6398 6399 /* We need this lock to enforce mutex between this reading of 6400 __kmp_threads_capacity and the writing by __kmp_register_root. 6401 Alternatively, we can use a counter of roots that is atomically updated by 6402 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6403 __kmp_internal_end_*. */ 6404 6405 /* should we finish the run-time? are all siblings done? */ 6406 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6407 6408 for (i = 0; i < __kmp_threads_capacity; ++i) { 6409 if (KMP_UBER_GTID(i)) { 6410 KA_TRACE( 6411 10, 6412 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6413 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6414 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6415 return; 6416 } 6417 } 6418 6419 /* now we can safely conduct the actual termination */ 6420 6421 __kmp_internal_end(); 6422 6423 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6424 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6425 6426 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6427 6428 #ifdef DUMP_DEBUG_ON_EXIT 6429 if (__kmp_debug_buf) 6430 __kmp_dump_debug_buffer(); 6431 #endif 6432 } // __kmp_internal_end_thread 6433 6434 // ----------------------------------------------------------------------------- 6435 // Library registration stuff. 6436 6437 static long __kmp_registration_flag = 0; 6438 // Random value used to indicate library initialization. 6439 static char *__kmp_registration_str = NULL; 6440 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6441 6442 static inline char *__kmp_reg_status_name() { 6443 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6444 each thread. If registration and unregistration go in different threads 6445 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6446 env var can not be found, because the name will contain different pid. 
*/
6447 // macOS* complains about name being too long with additional getuid()
6448 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6449   return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6450                           (int)getuid());
6451 #else
6452   return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6453 #endif
6454 } // __kmp_reg_status_name
6455
6456 void __kmp_register_library_startup(void) {
6457
6458   char *name = __kmp_reg_status_name(); // Name of the environment variable.
6459   int done = 0;
6460   union {
6461     double dtime;
6462     long ltime;
6463   } time;
6464 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6465   __kmp_initialize_system_tick();
6466 #endif
6467   __kmp_read_system_time(&time.dtime);
6468   __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6469   __kmp_registration_str =
6470       __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6471                        __kmp_registration_flag, KMP_LIBRARY_FILE);
6472
6473   KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6474                 __kmp_registration_str));
6475
6476   while (!done) {
6477
6478     char *value = NULL; // Actual value of the environment variable.
6479
6480 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6481     char *shm_name = __kmp_str_format("/%s", name);
6482     int shm_preexist = 0;
6483     char *data1;
6484     int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6485     if ((fd1 == -1) && (errno == EEXIST)) {
6486       // The file didn't open because it already exists.
6487       // Try opening the existing file.
6488       fd1 = shm_open(shm_name, O_RDWR, 0666);
6489       if (fd1 == -1) { // file still didn't open
6490         // error out here
6491         __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6492                     __kmp_msg_null);
6493       } else {
6494         // able to open existing file
6495         shm_preexist = 1;
6496       }
6497     } else if (fd1 == -1) { // SHM didn't open due to an error other than
6498       // "already exists".
6499       // error out here.
6500       __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6501                   __kmp_msg_null);
6502     }
6503     if (shm_preexist == 0) {
6504       // we created the SHM; now set its size
6505       if (ftruncate(fd1, SHM_SIZE) == -1) {
6506         // error occurred setting size
6507         __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6508                     KMP_ERR(errno), __kmp_msg_null);
6509       }
6510     }
6511     data1 =
6512         (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6513     if (data1 == MAP_FAILED) {
6514       // failed to map shared memory
6515       __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6516                   __kmp_msg_null);
6517     }
6518     if (shm_preexist == 0) { // set data to SHM, set value
6519       KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6520     }
6521     // Read value from either what we just wrote or the existing file.
6522     value = __kmp_str_format("%s", data1); // read value from SHM
6523     munmap(data1, SHM_SIZE);
6524     close(fd1);
6525 #else // Windows and unix with static library
6526     // Set the environment variable, but do not overwrite it if it already exists.
6527     __kmp_env_set(name, __kmp_registration_str, 0);
6528     // read the value back to see if it got set
6529     value = __kmp_env_get(name);
6530 #endif
6531
6532     if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6533       done = 1; // Ok, environment variable set successfully, exit the loop.
6534     } else {
6535       // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6536       // Check whether it is alive or dead.
6537       int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
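      // The value parsed below was produced by the
      // __kmp_str_format("%p-%lx-%s", ...) call above, so it has the shape
      //   "<flag address>-<flag value>-<library file>"
      // e.g. (illustrative only) "0x7f51e4a0b0c0-cafe1a2b-libomp.so".
      // The '-' splits recover those three fields, and the flag address is
      // then probed to decide whether the other copy of the runtime is still
      // alive.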
6538 char *tail = value; 6539 char *flag_addr_str = NULL; 6540 char *flag_val_str = NULL; 6541 char const *file_name = NULL; 6542 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6543 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6544 file_name = tail; 6545 if (tail != NULL) { 6546 long *flag_addr = 0; 6547 unsigned long flag_val = 0; 6548 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6549 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6550 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6551 // First, check whether environment-encoded address is mapped into 6552 // addr space. 6553 // If so, dereference it to see if it still has the right value. 6554 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6555 neighbor = 1; 6556 } else { 6557 // If not, then we know the other copy of the library is no longer 6558 // running. 6559 neighbor = 2; 6560 } 6561 } 6562 } 6563 switch (neighbor) { 6564 case 0: // Cannot parse environment variable -- neighbor status unknown. 6565 // Assume it is the incompatible format of future version of the 6566 // library. Assume the other library is alive. 6567 // WARN( ... ); // TODO: Issue a warning. 6568 file_name = "unknown library"; 6569 KMP_FALLTHROUGH(); 6570 // Attention! Falling to the next case. That's intentional. 6571 case 1: { // Neighbor is alive. 6572 // Check it is allowed. 6573 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6574 if (!__kmp_str_match_true(duplicate_ok)) { 6575 // That's not allowed. Issue fatal error. 6576 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6577 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6578 } 6579 KMP_INTERNAL_FREE(duplicate_ok); 6580 __kmp_duplicate_library_ok = 1; 6581 done = 1; // Exit the loop. 6582 } break; 6583 case 2: { // Neighbor is dead. 6584 6585 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6586 // close shared memory. 6587 shm_unlink(shm_name); // this removes file in /dev/shm 6588 #else 6589 // Clear the variable and try to register library again. 6590 __kmp_env_unset(name); 6591 #endif 6592 } break; 6593 default: { 6594 KMP_DEBUG_ASSERT(0); 6595 } break; 6596 } 6597 } 6598 KMP_INTERNAL_FREE((void *)value); 6599 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6600 KMP_INTERNAL_FREE((void *)shm_name); 6601 #endif 6602 } // while 6603 KMP_INTERNAL_FREE((void *)name); 6604 6605 } // func __kmp_register_library_startup 6606 6607 void __kmp_unregister_library(void) { 6608 6609 char *name = __kmp_reg_status_name(); 6610 char *value = NULL; 6611 6612 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6613 char *shm_name = __kmp_str_format("/%s", name); 6614 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6615 if (fd1 == -1) { 6616 // file did not open. return. 6617 return; 6618 } 6619 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6620 if (data1 != MAP_FAILED) { 6621 value = __kmp_str_format("%s", data1); // read value from SHM 6622 munmap(data1, SHM_SIZE); 6623 } 6624 close(fd1); 6625 #else 6626 value = __kmp_env_get(name); 6627 #endif 6628 6629 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6630 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6631 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6632 // Ok, this is our variable. Delete it. 
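    // On Unix with a dynamic library the registration lives in a POSIX shared
    // memory object (removed via shm_unlink below); in the other
    // configurations it is a plain environment variable.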
6633 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6634 shm_unlink(shm_name); // this removes file in /dev/shm 6635 #else 6636 __kmp_env_unset(name); 6637 #endif 6638 } 6639 6640 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6641 KMP_INTERNAL_FREE(shm_name); 6642 #endif 6643 6644 KMP_INTERNAL_FREE(__kmp_registration_str); 6645 KMP_INTERNAL_FREE(value); 6646 KMP_INTERNAL_FREE(name); 6647 6648 __kmp_registration_flag = 0; 6649 __kmp_registration_str = NULL; 6650 6651 } // __kmp_unregister_library 6652 6653 // End of Library registration stuff. 6654 // ----------------------------------------------------------------------------- 6655 6656 #if KMP_MIC_SUPPORTED 6657 6658 static void __kmp_check_mic_type() { 6659 kmp_cpuid_t cpuid_state = {0}; 6660 kmp_cpuid_t *cs_p = &cpuid_state; 6661 __kmp_x86_cpuid(1, 0, cs_p); 6662 // We don't support mic1 at the moment 6663 if ((cs_p->eax & 0xff0) == 0xB10) { 6664 __kmp_mic_type = mic2; 6665 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6666 __kmp_mic_type = mic3; 6667 } else { 6668 __kmp_mic_type = non_mic; 6669 } 6670 } 6671 6672 #endif /* KMP_MIC_SUPPORTED */ 6673 6674 #if KMP_HAVE_UMWAIT 6675 static void __kmp_user_level_mwait_init() { 6676 struct kmp_cpuid buf; 6677 __kmp_x86_cpuid(7, 0, &buf); 6678 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6679 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6680 __kmp_umwait_enabled)); 6681 } 6682 #elif KMP_HAVE_MWAIT 6683 #ifndef AT_INTELPHIUSERMWAIT 6684 // Spurious, non-existent value that should always fail to return anything. 6685 // Will be replaced with the correct value when we know that. 6686 #define AT_INTELPHIUSERMWAIT 10000 6687 #endif 6688 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6689 // earlier OS is used to build the RTL, we'll use the following internal 6690 // function when the entry is not found. 6691 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6692 unsigned long getauxval(unsigned long) { return 0; } 6693 6694 static void __kmp_user_level_mwait_init() { 6695 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6696 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6697 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6698 // KMP_USER_LEVEL_MWAIT was set to TRUE. 
6699 if (__kmp_mic_type == mic3) { 6700 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6701 if ((res & 0x1) || __kmp_user_level_mwait) { 6702 __kmp_mwait_enabled = TRUE; 6703 if (__kmp_user_level_mwait) { 6704 KMP_INFORM(EnvMwaitWarn); 6705 } 6706 } else { 6707 __kmp_mwait_enabled = FALSE; 6708 } 6709 } 6710 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6711 "__kmp_mwait_enabled = %d\n", 6712 __kmp_mic_type, __kmp_mwait_enabled)); 6713 } 6714 #endif /* KMP_HAVE_UMWAIT */ 6715 6716 static void __kmp_do_serial_initialize(void) { 6717 int i, gtid; 6718 size_t size; 6719 6720 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6721 6722 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6723 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6724 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6725 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6726 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6727 6728 #if OMPT_SUPPORT 6729 ompt_pre_init(); 6730 #endif 6731 #if OMPD_SUPPORT 6732 __kmp_env_dump(); 6733 ompd_init(); 6734 #endif 6735 6736 __kmp_validate_locks(); 6737 6738 /* Initialize internal memory allocator */ 6739 __kmp_init_allocator(); 6740 6741 /* Register the library startup via an environment variable and check to see 6742 whether another copy of the library is already registered. */ 6743 6744 __kmp_register_library_startup(); 6745 6746 /* TODO reinitialization of library */ 6747 if (TCR_4(__kmp_global.g.g_done)) { 6748 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6749 } 6750 6751 __kmp_global.g.g_abort = 0; 6752 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6753 6754 /* initialize the locks */ 6755 #if KMP_USE_ADAPTIVE_LOCKS 6756 #if KMP_DEBUG_ADAPTIVE_LOCKS 6757 __kmp_init_speculative_stats(); 6758 #endif 6759 #endif 6760 #if KMP_STATS_ENABLED 6761 __kmp_stats_init(); 6762 #endif 6763 __kmp_init_lock(&__kmp_global_lock); 6764 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6765 __kmp_init_lock(&__kmp_debug_lock); 6766 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6767 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6768 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6769 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6770 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6771 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6772 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6773 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6774 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6775 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6776 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6777 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6778 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6779 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6780 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6781 #if KMP_USE_MONITOR 6782 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6783 #endif 6784 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6785 6786 /* conduct initialization and initial setup of configuration */ 6787 6788 __kmp_runtime_initialize(); 6789 6790 #if KMP_MIC_SUPPORTED 6791 __kmp_check_mic_type(); 6792 #endif 6793 6794 // Some global variable initialization moved here from kmp_env_initialize() 6795 #ifdef KMP_DEBUG 6796 kmp_diag = 0; 6797 #endif 6798 __kmp_abort_delay = 0; 6799 6800 // From __kmp_init_dflt_team_nth() 6801 /* assume the entire machine will be used */ 6802 __kmp_dflt_team_nth_ub = __kmp_xproc; 6803 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6804 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6805 } 6806 if (__kmp_dflt_team_nth_ub > 
__kmp_sys_max_nth) { 6807 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6808 } 6809 __kmp_max_nth = __kmp_sys_max_nth; 6810 __kmp_cg_max_nth = __kmp_sys_max_nth; 6811 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6812 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6813 __kmp_teams_max_nth = __kmp_sys_max_nth; 6814 } 6815 6816 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6817 // part 6818 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6819 #if KMP_USE_MONITOR 6820 __kmp_monitor_wakeups = 6821 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6822 __kmp_bt_intervals = 6823 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6824 #endif 6825 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6826 __kmp_library = library_throughput; 6827 // From KMP_SCHEDULE initialization 6828 __kmp_static = kmp_sch_static_balanced; 6829 // AC: do not use analytical here, because it is non-monotonous 6830 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6831 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6832 // need to repeat assignment 6833 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6834 // bit control and barrier method control parts 6835 #if KMP_FAST_REDUCTION_BARRIER 6836 #define kmp_reduction_barrier_gather_bb ((int)1) 6837 #define kmp_reduction_barrier_release_bb ((int)1) 6838 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6839 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6840 #endif // KMP_FAST_REDUCTION_BARRIER 6841 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6842 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6843 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6844 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6845 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6846 #if KMP_FAST_REDUCTION_BARRIER 6847 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6848 // lin_64 ): hyper,1 6849 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6850 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6851 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6852 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6853 } 6854 #endif // KMP_FAST_REDUCTION_BARRIER 6855 } 6856 #if KMP_FAST_REDUCTION_BARRIER 6857 #undef kmp_reduction_barrier_release_pat 6858 #undef kmp_reduction_barrier_gather_pat 6859 #undef kmp_reduction_barrier_release_bb 6860 #undef kmp_reduction_barrier_gather_bb 6861 #endif // KMP_FAST_REDUCTION_BARRIER 6862 #if KMP_MIC_SUPPORTED 6863 if (__kmp_mic_type == mic2) { // KNC 6864 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6865 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6866 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6867 1; // forkjoin release 6868 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6869 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6870 } 6871 #if KMP_FAST_REDUCTION_BARRIER 6872 if (__kmp_mic_type == mic2) { // KNC 6873 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6874 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6875 } 6876 #endif // KMP_FAST_REDUCTION_BARRIER 6877 #endif // KMP_MIC_SUPPORTED 6878 6879 // From KMP_CHECKS initialization 6880 
#ifdef KMP_DEBUG 6881 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6882 #else 6883 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6884 #endif 6885 6886 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6887 __kmp_foreign_tp = TRUE; 6888 6889 __kmp_global.g.g_dynamic = FALSE; 6890 __kmp_global.g.g_dynamic_mode = dynamic_default; 6891 6892 __kmp_init_nesting_mode(); 6893 6894 __kmp_env_initialize(NULL); 6895 6896 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6897 __kmp_user_level_mwait_init(); 6898 #endif 6899 // Print all messages in message catalog for testing purposes. 6900 #ifdef KMP_DEBUG 6901 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6902 if (__kmp_str_match_true(val)) { 6903 kmp_str_buf_t buffer; 6904 __kmp_str_buf_init(&buffer); 6905 __kmp_i18n_dump_catalog(&buffer); 6906 __kmp_printf("%s", buffer.str); 6907 __kmp_str_buf_free(&buffer); 6908 } 6909 __kmp_env_free(&val); 6910 #endif 6911 6912 __kmp_threads_capacity = 6913 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6914 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6915 __kmp_tp_capacity = __kmp_default_tp_capacity( 6916 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6917 6918 // If the library is shut down properly, both pools must be NULL. Just in 6919 // case, set them to NULL -- some memory may leak, but subsequent code will 6920 // work even if pools are not freed. 6921 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6922 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6923 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6924 __kmp_thread_pool = NULL; 6925 __kmp_thread_pool_insert_pt = NULL; 6926 __kmp_team_pool = NULL; 6927 6928 /* Allocate all of the variable sized records */ 6929 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6930 * expandable */ 6931 /* Since allocation is cache-aligned, just add extra padding at the end */ 6932 size = 6933 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6934 CACHE_LINE; 6935 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6936 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6937 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6938 6939 /* init thread counts */ 6940 KMP_DEBUG_ASSERT(__kmp_all_nth == 6941 0); // Asserts fail if the library is reinitializing and 6942 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6943 __kmp_all_nth = 0; 6944 __kmp_nth = 0; 6945 6946 /* setup the uber master thread and hierarchy */ 6947 gtid = __kmp_register_root(TRUE); 6948 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6949 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6950 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6951 6952 KMP_MB(); /* Flush all pending memory write invalidates. */ 6953 6954 __kmp_common_initialize(); 6955 6956 #if KMP_OS_UNIX 6957 /* invoke the child fork handler */ 6958 __kmp_register_atfork(); 6959 #endif 6960 6961 #if !KMP_DYNAMIC_LIB 6962 { 6963 /* Invoke the exit handler when the program finishes, only for static 6964 library. For dynamic library, we already have _fini and DllMain. */ 6965 int rc = atexit(__kmp_internal_end_atexit); 6966 if (rc != 0) { 6967 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6968 __kmp_msg_null); 6969 } 6970 } 6971 #endif 6972 6973 #if KMP_HANDLE_SIGNALS 6974 #if KMP_OS_UNIX 6975 /* NOTE: make sure that this is called before the user installs their own 6976 signal handlers so that the user handlers are called first. 
this way they 6977 can return false, not call our handler, avoid terminating the library, and 6978 continue execution where they left off. */ 6979 __kmp_install_signals(FALSE); 6980 #endif /* KMP_OS_UNIX */ 6981 #if KMP_OS_WINDOWS 6982 __kmp_install_signals(TRUE); 6983 #endif /* KMP_OS_WINDOWS */ 6984 #endif 6985 6986 /* we have finished the serial initialization */ 6987 __kmp_init_counter++; 6988 6989 __kmp_init_serial = TRUE; 6990 6991 if (__kmp_settings) { 6992 __kmp_env_print(); 6993 } 6994 6995 if (__kmp_display_env || __kmp_display_env_verbose) { 6996 __kmp_env_print_2(); 6997 } 6998 6999 #if OMPT_SUPPORT 7000 ompt_post_init(); 7001 #endif 7002 7003 KMP_MB(); 7004 7005 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7006 } 7007 7008 void __kmp_serial_initialize(void) { 7009 if (__kmp_init_serial) { 7010 return; 7011 } 7012 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7013 if (__kmp_init_serial) { 7014 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7015 return; 7016 } 7017 __kmp_do_serial_initialize(); 7018 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7019 } 7020 7021 static void __kmp_do_middle_initialize(void) { 7022 int i, j; 7023 int prev_dflt_team_nth; 7024 7025 if (!__kmp_init_serial) { 7026 __kmp_do_serial_initialize(); 7027 } 7028 7029 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7030 7031 // Save the previous value for the __kmp_dflt_team_nth so that 7032 // we can avoid some reinitialization if it hasn't changed. 7033 prev_dflt_team_nth = __kmp_dflt_team_nth; 7034 7035 #if KMP_AFFINITY_SUPPORTED 7036 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7037 // number of cores on the machine. 7038 __kmp_affinity_initialize(); 7039 7040 // Run through the __kmp_threads array and set the affinity mask 7041 // for each root thread that is currently registered with the RTL. 7042 for (i = 0; i < __kmp_threads_capacity; i++) { 7043 if (TCR_PTR(__kmp_threads[i]) != NULL) { 7044 __kmp_affinity_set_init_mask(i, TRUE); 7045 } 7046 } 7047 #endif /* KMP_AFFINITY_SUPPORTED */ 7048 7049 KMP_ASSERT(__kmp_xproc > 0); 7050 if (__kmp_avail_proc == 0) { 7051 __kmp_avail_proc = __kmp_xproc; 7052 } 7053 7054 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7055 // correct them now 7056 j = 0; 7057 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7058 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7059 __kmp_avail_proc; 7060 j++; 7061 } 7062 7063 if (__kmp_dflt_team_nth == 0) { 7064 #ifdef KMP_DFLT_NTH_CORES 7065 // Default #threads = #cores 7066 __kmp_dflt_team_nth = __kmp_ncores; 7067 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7068 "__kmp_ncores (%d)\n", 7069 __kmp_dflt_team_nth)); 7070 #else 7071 // Default #threads = #available OS procs 7072 __kmp_dflt_team_nth = __kmp_avail_proc; 7073 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7074 "__kmp_avail_proc(%d)\n", 7075 __kmp_dflt_team_nth)); 7076 #endif /* KMP_DFLT_NTH_CORES */ 7077 } 7078 7079 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7080 __kmp_dflt_team_nth = KMP_MIN_NTH; 7081 } 7082 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7083 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7084 } 7085 7086 if (__kmp_nesting_mode > 0) 7087 __kmp_set_nesting_mode_threads(); 7088 7089 // There's no harm in continuing if the following check fails, 7090 // but it indicates an error in the previous logic. 
7091 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7092 7093 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7094 // Run through the __kmp_threads array and set the num threads icv for each 7095 // root thread that is currently registered with the RTL (which has not 7096 // already explicitly set its nthreads-var with a call to 7097 // omp_set_num_threads()). 7098 for (i = 0; i < __kmp_threads_capacity; i++) { 7099 kmp_info_t *thread = __kmp_threads[i]; 7100 if (thread == NULL) 7101 continue; 7102 if (thread->th.th_current_task->td_icvs.nproc != 0) 7103 continue; 7104 7105 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7106 } 7107 } 7108 KA_TRACE( 7109 20, 7110 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7111 __kmp_dflt_team_nth)); 7112 7113 #ifdef KMP_ADJUST_BLOCKTIME 7114 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7115 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7116 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7117 if (__kmp_nth > __kmp_avail_proc) { 7118 __kmp_zero_bt = TRUE; 7119 } 7120 } 7121 #endif /* KMP_ADJUST_BLOCKTIME */ 7122 7123 /* we have finished middle initialization */ 7124 TCW_SYNC_4(__kmp_init_middle, TRUE); 7125 7126 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7127 } 7128 7129 void __kmp_middle_initialize(void) { 7130 if (__kmp_init_middle) { 7131 return; 7132 } 7133 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7134 if (__kmp_init_middle) { 7135 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7136 return; 7137 } 7138 __kmp_do_middle_initialize(); 7139 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7140 } 7141 7142 void __kmp_parallel_initialize(void) { 7143 int gtid = __kmp_entry_gtid(); // this might be a new root 7144 7145 /* synchronize parallel initialization (for sibling) */ 7146 if (TCR_4(__kmp_init_parallel)) 7147 return; 7148 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7149 if (TCR_4(__kmp_init_parallel)) { 7150 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7151 return; 7152 } 7153 7154 /* TODO reinitialization after we have already shut down */ 7155 if (TCR_4(__kmp_global.g.g_done)) { 7156 KA_TRACE( 7157 10, 7158 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7159 __kmp_infinite_loop(); 7160 } 7161 7162 /* jc: The lock __kmp_initz_lock is already held, so calling 7163 __kmp_serial_initialize would cause a deadlock. So we call 7164 __kmp_do_serial_initialize directly. */ 7165 if (!__kmp_init_middle) { 7166 __kmp_do_middle_initialize(); 7167 } 7168 __kmp_resume_if_hard_paused(); 7169 7170 /* begin initialization */ 7171 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7172 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7173 7174 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7175 // Save the FP control regs. 7176 // Worker threads will set theirs to these values at thread startup. 
7177 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7178 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7179 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7180 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7181 7182 #if KMP_OS_UNIX 7183 #if KMP_HANDLE_SIGNALS 7184 /* must be after __kmp_serial_initialize */ 7185 __kmp_install_signals(TRUE); 7186 #endif 7187 #endif 7188 7189 __kmp_suspend_initialize(); 7190 7191 #if defined(USE_LOAD_BALANCE) 7192 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7193 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7194 } 7195 #else 7196 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7197 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7198 } 7199 #endif 7200 7201 if (__kmp_version) { 7202 __kmp_print_version_2(); 7203 } 7204 7205 /* we have finished parallel initialization */ 7206 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7207 7208 KMP_MB(); 7209 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7210 7211 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7212 } 7213 7214 void __kmp_hidden_helper_initialize() { 7215 if (TCR_4(__kmp_init_hidden_helper)) 7216 return; 7217 7218 // __kmp_parallel_initialize is required before we initialize hidden helper 7219 if (!TCR_4(__kmp_init_parallel)) 7220 __kmp_parallel_initialize(); 7221 7222 // Double check. Note that this double check should not be placed before 7223 // __kmp_parallel_initialize as it will cause dead lock. 7224 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7225 if (TCR_4(__kmp_init_hidden_helper)) { 7226 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7227 return; 7228 } 7229 7230 // Set the count of hidden helper tasks to be executed to zero 7231 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7232 7233 // Set the global variable indicating that we're initializing hidden helper 7234 // team/threads 7235 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7236 7237 // Platform independent initialization 7238 __kmp_do_initialize_hidden_helper_threads(); 7239 7240 // Wait here for the finish of initialization of hidden helper teams 7241 __kmp_hidden_helper_threads_initz_wait(); 7242 7243 // We have finished hidden helper initialization 7244 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7245 7246 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7247 } 7248 7249 /* ------------------------------------------------------------------------ */ 7250 7251 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7252 kmp_team_t *team) { 7253 kmp_disp_t *dispatch; 7254 7255 KMP_MB(); 7256 7257 /* none of the threads have encountered any constructs, yet. */ 7258 this_thr->th.th_local.this_construct = 0; 7259 #if KMP_CACHE_MANAGE 7260 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7261 #endif /* KMP_CACHE_MANAGE */ 7262 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7263 KMP_DEBUG_ASSERT(dispatch); 7264 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7265 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7266 // this_thr->th.th_info.ds.ds_tid ] ); 7267 7268 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7269 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7270 if (__kmp_env_consistency_check) 7271 __kmp_push_parallel(gtid, team->t.t_ident); 7272 7273 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7274 } 7275 7276 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7277 kmp_team_t *team) { 7278 if (__kmp_env_consistency_check) 7279 __kmp_pop_parallel(gtid, team->t.t_ident); 7280 7281 __kmp_finish_implicit_task(this_thr); 7282 } 7283 7284 int __kmp_invoke_task_func(int gtid) { 7285 int rc; 7286 int tid = __kmp_tid_from_gtid(gtid); 7287 kmp_info_t *this_thr = __kmp_threads[gtid]; 7288 kmp_team_t *team = this_thr->th.th_team; 7289 7290 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7291 #if USE_ITT_BUILD 7292 if (__itt_stack_caller_create_ptr) { 7293 // inform ittnotify about entering user's code 7294 if (team->t.t_stack_id != NULL) { 7295 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7296 } else { 7297 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7298 __kmp_itt_stack_callee_enter( 7299 (__itt_caller)team->t.t_parent->t.t_stack_id); 7300 } 7301 } 7302 #endif /* USE_ITT_BUILD */ 7303 #if INCLUDE_SSC_MARKS 7304 SSC_MARK_INVOKING(); 7305 #endif 7306 7307 #if OMPT_SUPPORT 7308 void *dummy; 7309 void **exit_frame_p; 7310 ompt_data_t *my_task_data; 7311 ompt_data_t *my_parallel_data; 7312 int ompt_team_size; 7313 7314 if (ompt_enabled.enabled) { 7315 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7316 .ompt_task_info.frame.exit_frame.ptr); 7317 } else { 7318 exit_frame_p = &dummy; 7319 } 7320 7321 my_task_data = 7322 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7323 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7324 if (ompt_enabled.ompt_callback_implicit_task) { 7325 ompt_team_size = team->t.t_nproc; 7326 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7327 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7328 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7329 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7330 } 7331 #endif 7332 7333 #if KMP_STATS_ENABLED 7334 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7335 if (previous_state == stats_state_e::TEAMS_REGION) { 7336 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7337 } else { 7338 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7339 } 7340 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7341 #endif 7342 7343 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7344 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7345 #if OMPT_SUPPORT 7346 , 7347 exit_frame_p 7348 #endif 7349 ); 7350 #if OMPT_SUPPORT 7351 *exit_frame_p = NULL; 7352 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7353 #endif 7354 7355 #if KMP_STATS_ENABLED 7356 if (previous_state == stats_state_e::TEAMS_REGION) { 7357 KMP_SET_THREAD_STATE(previous_state); 7358 } 7359 KMP_POP_PARTITIONED_TIMER(); 7360 #endif 7361 7362 #if USE_ITT_BUILD 7363 if (__itt_stack_caller_create_ptr) { 7364 // inform ittnotify about leaving user's code 7365 if (team->t.t_stack_id != NULL) { 7366 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7367 } else { 7368 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7369 __kmp_itt_stack_callee_leave( 7370 (__itt_caller)team->t.t_parent->t.t_stack_id); 7371 } 7372 } 7373 #endif /* USE_ITT_BUILD */ 7374 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7375 7376 return rc; 7377 } 7378 7379 void __kmp_teams_master(int gtid) { 7380 // This routine is called by all primary threads in teams construct 7381 kmp_info_t *thr = __kmp_threads[gtid]; 7382 kmp_team_t *team = thr->th.th_team; 7383 ident_t *loc = team->t.t_ident; 7384 
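  // Illustrative only: a user-level construct such as
  //   #pragma omp teams num_teams(4) thread_limit(8)
  // reaches this routine once per league primary thread; th_teams_size below
  // carries the nteams/nth geometry recorded by __kmp_push_num_teams() and
  // __kmp_push_thread_limit().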
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7385 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7386 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7387 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7388 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7389 7390 // This thread is a new CG root. Set up the proper variables. 7391 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7392 tmp->cg_root = thr; // Make thr the CG root 7393 // Init to thread limit stored when league primary threads were forked 7394 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7395 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7396 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7397 " cg_nthreads to 1\n", 7398 thr, tmp)); 7399 tmp->up = thr->th.th_cg_roots; 7400 thr->th.th_cg_roots = tmp; 7401 7402 // Launch league of teams now, but not let workers execute 7403 // (they hang on fork barrier until next parallel) 7404 #if INCLUDE_SSC_MARKS 7405 SSC_MARK_FORKING(); 7406 #endif 7407 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7408 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7409 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7410 #if INCLUDE_SSC_MARKS 7411 SSC_MARK_JOINING(); 7412 #endif 7413 // If the team size was reduced from the limit, set it to the new size 7414 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7415 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7416 // AC: last parameter "1" eliminates join barrier which won't work because 7417 // worker threads are in a fork barrier waiting for more parallel regions 7418 __kmp_join_call(loc, gtid 7419 #if OMPT_SUPPORT 7420 , 7421 fork_context_intel 7422 #endif 7423 , 7424 1); 7425 } 7426 7427 int __kmp_invoke_teams_master(int gtid) { 7428 kmp_info_t *this_thr = __kmp_threads[gtid]; 7429 kmp_team_t *team = this_thr->th.th_team; 7430 #if KMP_DEBUG 7431 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7432 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7433 (void *)__kmp_teams_master); 7434 #endif 7435 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7436 #if OMPT_SUPPORT 7437 int tid = __kmp_tid_from_gtid(gtid); 7438 ompt_data_t *task_data = 7439 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7440 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7441 if (ompt_enabled.ompt_callback_implicit_task) { 7442 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7443 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7444 ompt_task_initial); 7445 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7446 } 7447 #endif 7448 __kmp_teams_master(gtid); 7449 #if OMPT_SUPPORT 7450 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7451 #endif 7452 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7453 return 1; 7454 } 7455 7456 /* this sets the requested number of threads for the next parallel region 7457 encountered by this team. 
since this should be enclosed in the forkjoin 7458 critical section it should avoid race conditions with asymmetrical nested 7459 parallelism */ 7460 7461 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7462 kmp_info_t *thr = __kmp_threads[gtid]; 7463 7464 if (num_threads > 0) 7465 thr->th.th_set_nproc = num_threads; 7466 } 7467 7468 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7469 int num_threads) { 7470 KMP_DEBUG_ASSERT(thr); 7471 // Remember the number of threads for inner parallel regions 7472 if (!TCR_4(__kmp_init_middle)) 7473 __kmp_middle_initialize(); // get internal globals calculated 7474 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7475 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7476 7477 if (num_threads == 0) { 7478 if (__kmp_teams_thread_limit > 0) { 7479 num_threads = __kmp_teams_thread_limit; 7480 } else { 7481 num_threads = __kmp_avail_proc / num_teams; 7482 } 7483 // adjust num_threads w/o warning as it is not user setting 7484 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7485 // no thread_limit clause specified - do not change thread-limit-var ICV 7486 if (num_threads > __kmp_dflt_team_nth) { 7487 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7488 } 7489 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7490 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7491 } // prevent team size to exceed thread-limit-var 7492 if (num_teams * num_threads > __kmp_teams_max_nth) { 7493 num_threads = __kmp_teams_max_nth / num_teams; 7494 } 7495 if (num_threads == 0) { 7496 num_threads = 1; 7497 } 7498 } else { 7499 // This thread will be the primary thread of the league primary threads 7500 // Store new thread limit; old limit is saved in th_cg_roots list 7501 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7502 // num_threads = min(num_threads, nthreads-var) 7503 if (num_threads > __kmp_dflt_team_nth) { 7504 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7505 } 7506 if (num_teams * num_threads > __kmp_teams_max_nth) { 7507 int new_threads = __kmp_teams_max_nth / num_teams; 7508 if (new_threads == 0) { 7509 new_threads = 1; 7510 } 7511 if (new_threads != num_threads) { 7512 if (!__kmp_reserve_warn) { // user asked for too many threads 7513 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7514 __kmp_msg(kmp_ms_warning, 7515 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7516 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7517 } 7518 } 7519 num_threads = new_threads; 7520 } 7521 } 7522 thr->th.th_teams_size.nth = num_threads; 7523 } 7524 7525 /* this sets the requested number of teams for the teams region and/or 7526 the number of threads for the next parallel region encountered */ 7527 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7528 int num_threads) { 7529 kmp_info_t *thr = __kmp_threads[gtid]; 7530 KMP_DEBUG_ASSERT(num_teams >= 0); 7531 KMP_DEBUG_ASSERT(num_threads >= 0); 7532 7533 if (num_teams == 0) { 7534 if (__kmp_nteams > 0) { 7535 num_teams = __kmp_nteams; 7536 } else { 7537 num_teams = 1; // default number of teams is 1. 7538 } 7539 } 7540 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
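    // Too many teams requested: warn once and clamp. Illustrative only:
    // asking for 10000 teams on a system where __kmp_teams_max_nth is 256
    // produces a single CantFormThrTeam warning and then proceeds with 256
    // teams.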
7541 if (!__kmp_reserve_warn) { 7542 __kmp_reserve_warn = 1; 7543 __kmp_msg(kmp_ms_warning, 7544 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7545 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7546 } 7547 num_teams = __kmp_teams_max_nth; 7548 } 7549 // Set number of teams (number of threads in the outer "parallel" of the 7550 // teams) 7551 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7552 7553 __kmp_push_thread_limit(thr, num_teams, num_threads); 7554 } 7555 7556 /* This sets the requested number of teams for the teams region and/or 7557 the number of threads for the next parallel region encountered */ 7558 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7559 int num_teams_ub, int num_threads) { 7560 kmp_info_t *thr = __kmp_threads[gtid]; 7561 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7562 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7563 KMP_DEBUG_ASSERT(num_threads >= 0); 7564 7565 if (num_teams_lb > num_teams_ub) { 7566 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7567 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7568 } 7569 7570 int num_teams = 1; // defalt number of teams is 1. 7571 7572 if (num_teams_lb == 0 && num_teams_ub > 0) 7573 num_teams_lb = num_teams_ub; 7574 7575 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7576 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7577 if (num_teams > __kmp_teams_max_nth) { 7578 if (!__kmp_reserve_warn) { 7579 __kmp_reserve_warn = 1; 7580 __kmp_msg(kmp_ms_warning, 7581 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7582 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7583 } 7584 num_teams = __kmp_teams_max_nth; 7585 } 7586 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7587 num_teams = num_teams_ub; 7588 } else { // num_teams_lb <= num_teams <= num_teams_ub 7589 if (num_threads == 0) { 7590 if (num_teams_ub > __kmp_teams_max_nth) { 7591 num_teams = num_teams_lb; 7592 } else { 7593 num_teams = num_teams_ub; 7594 } 7595 } else { 7596 num_teams = (num_threads > __kmp_teams_max_nth) 7597 ? num_teams 7598 : __kmp_teams_max_nth / num_threads; 7599 if (num_teams < num_teams_lb) { 7600 num_teams = num_teams_lb; 7601 } else if (num_teams > num_teams_ub) { 7602 num_teams = num_teams_ub; 7603 } 7604 } 7605 } 7606 // Set number of teams (number of threads in the outer "parallel" of the 7607 // teams) 7608 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7609 7610 __kmp_push_thread_limit(thr, num_teams, num_threads); 7611 } 7612 7613 // Set the proc_bind var to use in the following parallel region. 7614 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7615 kmp_info_t *thr = __kmp_threads[gtid]; 7616 thr->th.th_set_proc_bind = proc_bind; 7617 } 7618 7619 /* Launch the worker threads into the microtask. */ 7620 7621 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7622 kmp_info_t *this_thr = __kmp_threads[gtid]; 7623 7624 #ifdef KMP_DEBUG 7625 int f; 7626 #endif /* KMP_DEBUG */ 7627 7628 KMP_DEBUG_ASSERT(team); 7629 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7630 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7631 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7632 7633 team->t.t_construct = 0; /* no single directives seen yet */ 7634 team->t.t_ordered.dt.t_value = 7635 0; /* thread 0 enters the ordered section first */ 7636 7637 /* Reset the identifiers on the dispatch buffer */ 7638 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7639 if (team->t.t_max_nproc > 1) { 7640 int i; 7641 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7642 team->t.t_disp_buffer[i].buffer_index = i; 7643 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7644 } 7645 } else { 7646 team->t.t_disp_buffer[0].buffer_index = 0; 7647 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7648 } 7649 7650 KMP_MB(); /* Flush all pending memory write invalidates. */ 7651 KMP_ASSERT(this_thr->th.th_team == team); 7652 7653 #ifdef KMP_DEBUG 7654 for (f = 0; f < team->t.t_nproc; f++) { 7655 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7656 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7657 } 7658 #endif /* KMP_DEBUG */ 7659 7660 /* release the worker threads so they may begin working */ 7661 __kmp_fork_barrier(gtid, 0); 7662 } 7663 7664 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7665 kmp_info_t *this_thr = __kmp_threads[gtid]; 7666 7667 KMP_DEBUG_ASSERT(team); 7668 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7669 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7670 KMP_MB(); /* Flush all pending memory write invalidates. */ 7671 7672 /* Join barrier after fork */ 7673 7674 #ifdef KMP_DEBUG 7675 if (__kmp_threads[gtid] && 7676 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7677 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7678 __kmp_threads[gtid]); 7679 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7680 "team->t.t_nproc=%d\n", 7681 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7682 team->t.t_nproc); 7683 __kmp_print_structure(); 7684 } 7685 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7686 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7687 #endif /* KMP_DEBUG */ 7688 7689 __kmp_join_barrier(gtid); /* wait for everyone */ 7690 #if OMPT_SUPPORT 7691 if (ompt_enabled.enabled && 7692 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7693 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7694 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7695 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7696 #if OMPT_OPTIONAL 7697 void *codeptr = NULL; 7698 if (KMP_MASTER_TID(ds_tid) && 7699 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7700 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7701 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7702 7703 if (ompt_enabled.ompt_callback_sync_region_wait) { 7704 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7705 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7706 codeptr); 7707 } 7708 if (ompt_enabled.ompt_callback_sync_region) { 7709 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7710 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7711 codeptr); 7712 } 7713 #endif 7714 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7715 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7716 ompt_scope_end, NULL, task_data, 0, ds_tid, 7717 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7718 } 7719 } 7720 #endif 7721 7722 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7723 KMP_ASSERT(this_thr->th.th_team == team); 7724 } 7725 7726 /* ------------------------------------------------------------------------ */ 7727 7728 #ifdef USE_LOAD_BALANCE 7729 7730 // Return the worker threads actively spinning in the hot team, if we 7731 // are at the outermost level of parallelism. Otherwise, return 0. 7732 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7733 int i; 7734 int retval; 7735 kmp_team_t *hot_team; 7736 7737 if (root->r.r_active) { 7738 return 0; 7739 } 7740 hot_team = root->r.r_hot_team; 7741 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7742 return hot_team->t.t_nproc - 1; // Don't count primary thread 7743 } 7744 7745 // Skip the primary thread - it is accounted for elsewhere. 7746 retval = 0; 7747 for (i = 1; i < hot_team->t.t_nproc; i++) { 7748 if (hot_team->t.t_threads[i]->th.th_active) { 7749 retval++; 7750 } 7751 } 7752 return retval; 7753 } 7754 7755 // Perform an automatic adjustment to the number of 7756 // threads used by the next parallel region. 7757 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7758 int retval; 7759 int pool_active; 7760 int hot_team_active; 7761 int team_curr_active; 7762 int system_active; 7763 7764 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7765 set_nproc)); 7766 KMP_DEBUG_ASSERT(root); 7767 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7768 ->th.th_current_task->td_icvs.dynamic == TRUE); 7769 KMP_DEBUG_ASSERT(set_nproc > 1); 7770 7771 if (set_nproc == 1) { 7772 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7773 return 1; 7774 } 7775 7776 // Threads that are active in the thread pool, active in the hot team for this 7777 // particular root (if we are at the outer par level), and the currently 7778 // executing thread (to become the primary thread) are available to add to the 7779 // new team, but are currently contributing to the system load, and must be 7780 // accounted for. 7781 pool_active = __kmp_thread_pool_active_nth; 7782 hot_team_active = __kmp_active_hot_team_nproc(root); 7783 team_curr_active = pool_active + hot_team_active + 1; 7784 7785 // Check the system load. 7786 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7787 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7788 "hot team active = %d\n", 7789 system_active, pool_active, hot_team_active)); 7790 7791 if (system_active < 0) { 7792 // There was an error reading the necessary info from /proc, so use the 7793 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7794 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7795 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7796 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7797 7798 // Make this call behave like the thread limit algorithm. 7799 retval = __kmp_avail_proc - __kmp_nth + 7800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7801 if (retval > set_nproc) { 7802 retval = set_nproc; 7803 } 7804 if (retval < KMP_MIN_NTH) { 7805 retval = KMP_MIN_NTH; 7806 } 7807 7808 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7809 retval)); 7810 return retval; 7811 } 7812 7813 // There is a slight delay in the load balance algorithm in detecting new 7814 // running procs. The real system load at this instant should be at least as 7815 // large as the #active omp thread that are available to add to the team. 
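  // Worked example with illustrative numbers: if __kmp_avail_proc == 8,
  // system_active == 5 and team_curr_active == 3, the computation below gives
  // retval = 8 - 5 + 3 = 6, which is then clipped to the
  // [KMP_MIN_NTH, set_nproc] range.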
7816 if (system_active < team_curr_active) { 7817 system_active = team_curr_active; 7818 } 7819 retval = __kmp_avail_proc - system_active + team_curr_active; 7820 if (retval > set_nproc) { 7821 retval = set_nproc; 7822 } 7823 if (retval < KMP_MIN_NTH) { 7824 retval = KMP_MIN_NTH; 7825 } 7826 7827 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7828 return retval; 7829 } // __kmp_load_balance_nproc() 7830 7831 #endif /* USE_LOAD_BALANCE */ 7832 7833 /* ------------------------------------------------------------------------ */ 7834 7835 /* NOTE: this is called with the __kmp_init_lock held */ 7836 void __kmp_cleanup(void) { 7837 int f; 7838 7839 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7840 7841 if (TCR_4(__kmp_init_parallel)) { 7842 #if KMP_HANDLE_SIGNALS 7843 __kmp_remove_signals(); 7844 #endif 7845 TCW_4(__kmp_init_parallel, FALSE); 7846 } 7847 7848 if (TCR_4(__kmp_init_middle)) { 7849 #if KMP_AFFINITY_SUPPORTED 7850 __kmp_affinity_uninitialize(); 7851 #endif /* KMP_AFFINITY_SUPPORTED */ 7852 __kmp_cleanup_hierarchy(); 7853 TCW_4(__kmp_init_middle, FALSE); 7854 } 7855 7856 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7857 7858 if (__kmp_init_serial) { 7859 __kmp_runtime_destroy(); 7860 __kmp_init_serial = FALSE; 7861 } 7862 7863 __kmp_cleanup_threadprivate_caches(); 7864 7865 for (f = 0; f < __kmp_threads_capacity; f++) { 7866 if (__kmp_root[f] != NULL) { 7867 __kmp_free(__kmp_root[f]); 7868 __kmp_root[f] = NULL; 7869 } 7870 } 7871 __kmp_free(__kmp_threads); 7872 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7873 // there is no need in freeing __kmp_root. 7874 __kmp_threads = NULL; 7875 __kmp_root = NULL; 7876 __kmp_threads_capacity = 0; 7877 7878 #if KMP_USE_DYNAMIC_LOCK 7879 __kmp_cleanup_indirect_user_locks(); 7880 #else 7881 __kmp_cleanup_user_locks(); 7882 #endif 7883 #if OMPD_SUPPORT 7884 if (ompd_state) { 7885 __kmp_free(ompd_env_block); 7886 ompd_env_block = NULL; 7887 ompd_env_block_size = 0; 7888 } 7889 #endif 7890 7891 #if KMP_AFFINITY_SUPPORTED 7892 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7893 __kmp_cpuinfo_file = NULL; 7894 #endif /* KMP_AFFINITY_SUPPORTED */ 7895 7896 #if KMP_USE_ADAPTIVE_LOCKS 7897 #if KMP_DEBUG_ADAPTIVE_LOCKS 7898 __kmp_print_speculative_stats(); 7899 #endif 7900 #endif 7901 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7902 __kmp_nested_nth.nth = NULL; 7903 __kmp_nested_nth.size = 0; 7904 __kmp_nested_nth.used = 0; 7905 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7906 __kmp_nested_proc_bind.bind_types = NULL; 7907 __kmp_nested_proc_bind.size = 0; 7908 __kmp_nested_proc_bind.used = 0; 7909 if (__kmp_affinity_format) { 7910 KMP_INTERNAL_FREE(__kmp_affinity_format); 7911 __kmp_affinity_format = NULL; 7912 } 7913 7914 __kmp_i18n_catclose(); 7915 7916 #if KMP_USE_HIER_SCHED 7917 __kmp_hier_scheds.deallocate(); 7918 #endif 7919 7920 #if KMP_STATS_ENABLED 7921 __kmp_stats_fini(); 7922 #endif 7923 7924 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7925 } 7926 7927 /* ------------------------------------------------------------------------ */ 7928 7929 int __kmp_ignore_mppbeg(void) { 7930 char *env; 7931 7932 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7933 if (__kmp_str_match_false(env)) 7934 return FALSE; 7935 } 7936 // By default __kmpc_begin() is no-op. 
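  // (TRUE here keeps __kmpc_begin() a no-op; an explicit false-style
  // KMP_IGNORE_MPPBEG setting is the only way this function returns FALSE.)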
7937 return TRUE; 7938 } 7939 7940 int __kmp_ignore_mppend(void) { 7941 char *env; 7942 7943 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7944 if (__kmp_str_match_false(env)) 7945 return FALSE; 7946 } 7947 // By default __kmpc_end() is no-op. 7948 return TRUE; 7949 } 7950 7951 void __kmp_internal_begin(void) { 7952 int gtid; 7953 kmp_root_t *root; 7954 7955 /* this is a very important step as it will register new sibling threads 7956 and assign these new uber threads a new gtid */ 7957 gtid = __kmp_entry_gtid(); 7958 root = __kmp_threads[gtid]->th.th_root; 7959 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7960 7961 if (root->r.r_begin) 7962 return; 7963 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7964 if (root->r.r_begin) { 7965 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7966 return; 7967 } 7968 7969 root->r.r_begin = TRUE; 7970 7971 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7972 } 7973 7974 /* ------------------------------------------------------------------------ */ 7975 7976 void __kmp_user_set_library(enum library_type arg) { 7977 int gtid; 7978 kmp_root_t *root; 7979 kmp_info_t *thread; 7980 7981 /* first, make sure we are initialized so we can get our gtid */ 7982 7983 gtid = __kmp_entry_gtid(); 7984 thread = __kmp_threads[gtid]; 7985 7986 root = thread->th.th_root; 7987 7988 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7989 library_serial)); 7990 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7991 thread */ 7992 KMP_WARNING(SetLibraryIncorrectCall); 7993 return; 7994 } 7995 7996 switch (arg) { 7997 case library_serial: 7998 thread->th.th_set_nproc = 0; 7999 set__nproc(thread, 1); 8000 break; 8001 case library_turnaround: 8002 thread->th.th_set_nproc = 0; 8003 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8004 : __kmp_dflt_team_nth_ub); 8005 break; 8006 case library_throughput: 8007 thread->th.th_set_nproc = 0; 8008 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8009 : __kmp_dflt_team_nth_ub); 8010 break; 8011 default: 8012 KMP_FATAL(UnknownLibraryType, arg); 8013 } 8014 8015 __kmp_aux_set_library(arg); 8016 } 8017 8018 void __kmp_aux_set_stacksize(size_t arg) { 8019 if (!__kmp_init_serial) 8020 __kmp_serial_initialize(); 8021 8022 #if KMP_OS_DARWIN 8023 if (arg & (0x1000 - 1)) { 8024 arg &= ~(0x1000 - 1); 8025 if (arg + 0x1000) /* check for overflow if we round up */ 8026 arg += 0x1000; 8027 } 8028 #endif 8029 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8030 8031 /* only change the default stacksize before the first parallel region */ 8032 if (!TCR_4(__kmp_init_parallel)) { 8033 size_t value = arg; /* argument is in bytes */ 8034 8035 if (value < __kmp_sys_min_stksize) 8036 value = __kmp_sys_min_stksize; 8037 else if (value > KMP_MAX_STKSIZE) 8038 value = KMP_MAX_STKSIZE; 8039 8040 __kmp_stksize = value; 8041 8042 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8043 } 8044 8045 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8046 } 8047 8048 /* set the behaviour of the runtime library */ 8049 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8050 void __kmp_aux_set_library(enum library_type arg) { 8051 __kmp_library = arg; 8052 8053 switch (__kmp_library) { 8054 case library_serial: { 8055 KMP_INFORM(LibraryIsSerial); 8056 } break; 8057 case library_turnaround: 8058 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8059 __kmp_use_yield = 2; // only yield when oversubscribed 8060 break; 8061 case library_throughput: 8062 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8063 __kmp_dflt_blocktime = 200; 8064 break; 8065 default: 8066 KMP_FATAL(UnknownLibraryType, arg); 8067 } 8068 } 8069 8070 /* Getting team information common for all team API */ 8071 // Returns NULL if not in teams construct 8072 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8073 kmp_info_t *thr = __kmp_entry_thread(); 8074 teams_serialized = 0; 8075 if (thr->th.th_teams_microtask) { 8076 kmp_team_t *team = thr->th.th_team; 8077 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8078 int ii = team->t.t_level; 8079 teams_serialized = team->t.t_serialized; 8080 int level = tlevel + 1; 8081 KMP_DEBUG_ASSERT(ii >= tlevel); 8082 while (ii > level) { 8083 for (teams_serialized = team->t.t_serialized; 8084 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8085 } 8086 if (team->t.t_serialized && (!teams_serialized)) { 8087 team = team->t.t_parent; 8088 continue; 8089 } 8090 if (ii > level) { 8091 team = team->t.t_parent; 8092 ii--; 8093 } 8094 } 8095 return team; 8096 } 8097 return NULL; 8098 } 8099 8100 int __kmp_aux_get_team_num() { 8101 int serialized; 8102 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8103 if (team) { 8104 if (serialized > 1) { 8105 return 0; // teams region is serialized ( 1 team of 1 thread ). 8106 } else { 8107 return team->t.t_master_tid; 8108 } 8109 } 8110 return 0; 8111 } 8112 8113 int __kmp_aux_get_num_teams() { 8114 int serialized; 8115 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8116 if (team) { 8117 if (serialized > 1) { 8118 return 1; 8119 } else { 8120 return team->t.t_parent->t.t_nproc; 8121 } 8122 } 8123 return 1; 8124 } 8125 8126 /* ------------------------------------------------------------------------ */ 8127 8128 /* 8129 * Affinity Format Parser 8130 * 8131 * Field is in form of: %[[[0].]size]type 8132 * % and type are required (%% means print a literal '%') 8133 * type is either single char or long name surrounded by {}, 8134 * e.g., N or {num_threads} 8135 * 0 => leading zeros 8136 * . => right justified when size is specified 8137 * by default output is left justified 8138 * size is the *minimum* field length 8139 * All other characters are printed as is 8140 * 8141 * Available field types: 8142 * L {thread_level} - omp_get_level() 8143 * n {thread_num} - omp_get_thread_num() 8144 * h {host} - name of host machine 8145 * P {process_id} - process id (integer) 8146 * T {thread_identifier} - native thread identifier (integer) 8147 * N {num_threads} - omp_get_num_threads() 8148 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8149 * a {thread_affinity} - comma separated list of integers or integer ranges 8150 * (values of affinity mask) 8151 * 8152 * Implementation-specific field types can be added 8153 * If a type is unknown, print "undefined" 8154 */ 8155 8156 // Structure holding the short name, long name, and corresponding data type 8157 // for snprintf. A table of these will represent the entire valid keyword 8158 // field types. 
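// Illustrative only: with an affinity format string such as
//   "host=%H pid=%P thread %0.3n of %N"
// (set, e.g., through OMP_AFFINITY_FORMAT), a thread might render
//   "host=node017 pid=12345 thread 002 of 8"
// where %H, %P, %n and %N are resolved via the table and parser below.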
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword
// field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec e.g., L -> thread level
  const char *long_name; // from spec thread_level -> thread level
  char field_format; // data type for snprintf (typically 'd' or 's'
  // for integer or string)
} kmp_affinity_format_field_t;

static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};

// Return the number of characters it takes to hold field
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0;

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
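/* Worked example: given the field specifier "%0.4{thread_num}", the routine
   above consumes '0' (pad_zeros), '.' (right_justify) and the width "4",
   matches "thread_num" in __kmp_affinity_format_table (short name 'n',
   field_format 'd'), and assembles the snprintf format "%04d"; printing
   thread number 7 then yields "0007". Without the '.', a '-' flag is emitted
   instead and the field comes out left-justified. */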
/*
 * Return number of characters needed to hold the affinity string
 * (not including the null byte character)
 * The resulting string is printed to buffer, which the caller can then
 * handle afterwards
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or zero-length string, then we use
  // affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Set whether blocktime has been set to "TRUE" */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
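/* Usage sketch: __kmp_aux_set_blocktime is the internal target of the
   blocktime controls. For example, a user can make idle workers sleep
   immediately after a parallel region instead of spin-waiting:

     // shell:  KMP_BLOCKTIME=0 ./app
     // or, assuming the usual kmp_* extension entry point is available:
     //   kmp_set_blocktime(0); // argument in milliseconds

   The value is clamped to [KMP_MIN_BLOCKTIME, KMP_MAX_BLOCKTIME] as shown
   above and applies to both the calling thread's current team and its serial
   team. */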
/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in current
  // PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to the OpenMP RTL to decide which method to select among
  // those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
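/* Example of what drives this selection: for a reduction such as

     double sum = 0.0;
     #pragma omp parallel for reduction(+ : sum)
     for (int i = 0; i < n; ++i)
       sum += a[i];

   a team of one gets empty_reduce_block, and larger teams choose between the
   atomic, tree-barrier, and critical-section methods based on team size,
   architecture, and OS as coded above. The choice can be overridden through
   __kmp_force_reduction_method, which kmp_settings.cpp wires to a
   KMP_FORCE_REDUCTION-style environment control (spelling may vary by
   release). */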
// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
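/* Usage sketch: __kmp_pause_resource backs the OpenMP 5.0 pause API reached
   via __kmpc_pause_resource. A host application might release runtime
   resources between phases like this:

     #include <omp.h>
     ...
     omp_pause_resource_all(omp_pause_soft); // workers sleep now, state kept
     ...                                     // long non-OpenMP phase
     #pragma omp parallel                    // next construct resumes normally
     { ... }

   A hard pause (omp_pause_hard) additionally tears the runtime down via
   __kmp_internal_end_thread(); it is re-initialized on the next OpenMP
   construct. A zero return means success; non-zero indicates an invalid
   request, e.g. pausing an already-paused runtime. */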
// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads, needed
  // in case a regular thread pushes a hidden helper task to a hidden helper
  // thread that has not yet been awakened since the main thread released the
  // helpers after creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
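/* Context sketch: the hidden helper team rooted above services "hidden helper
   tasks", most notably those generated for asynchronous offload, e.g.

     #pragma omp target nowait map(to : a[0 : n]) map(from : b[0 : n])
     { ... }
     #pragma omp taskwait

   The team is sized by __kmp_hidden_helper_threads_num (default 8 above;
   kmp_settings.cpp may expose a knob for it) and, per
   __kmp_enable_hidden_helper, is only enabled by default on Linux. */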
/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   option to this experimental feature, and may change or go away in the
   future.
*/

// Allocate space to store nesting levels
void __kmp_init_nesting_mode() {
  int levels = KMP_HW_LAST;
  __kmp_nesting_mode_nlevels = levels;
  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
  for (int i = 0; i < levels; ++i)
    __kmp_nesting_nth_level[i] = 0;
  if (__kmp_nested_nth.size < levels) {
    __kmp_nested_nth.nth =
        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
    __kmp_nested_nth.size = levels;
  }
}

// Set # threads for top levels of nesting; must be called after topology set
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(core_level);
      int upper_levels = 1;
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable estimate
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // if max levels was set, set nesting mode levels to same
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
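/* Usage sketch: with the topology-driven mode described above, a machine with,
   say, 2 sockets x 8 cores x 2 hardware threads (hypothetical numbers) could
   be run as

     // shell: one nesting level per distinct topology level
     //   KMP_NESTING_MODE=1 ./app

     #pragma omp parallel // outer level sized from the topology (sockets)
     #pragma omp parallel // next level (cores below each socket)
     { ... }

   Because KMP_NESTING_MODE=1 also raises max-active-levels (see the call to
   set__max_active_levels at the end of __kmp_set_nesting_mode_threads), the
   inner region is actually active; with KMP_NESTING_MODE=N for N>1 the user
   must still enable nesting explicitly, as noted in the comment above. */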