/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid.
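
   As an illustration only (a sketch, not code used by the runtime), the
   lookup order implemented below can be pictured as:

     int gtid_lookup_sketch(void) {            // hypothetical helper
       if (__kmp_gtid_mode >= 3)
         return __kmp_gtid;                    // native TLS variable
       if (__kmp_gtid_mode >= 2)
         return __kmp_gtid_get_specific();     // keyed (pthread-style) TLS
       return -1; // else: scan __kmp_threads[] by stack address (below)
     }

   The stack-address scan is the slow path; it also refines the recorded
   stack bounds of uber (root) threads.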
*/ 115 int __kmp_get_global_thread_id() { 116 int i; 117 kmp_info_t **other_threads; 118 size_t stack_data; 119 char *stack_addr; 120 size_t stack_size; 121 char *stack_base; 122 123 KA_TRACE( 124 1000, 125 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 126 __kmp_nth, __kmp_all_nth)); 127 128 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 129 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 130 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 131 __kmp_init_gtid for this to work. */ 132 133 if (!TCR_4(__kmp_init_gtid)) 134 return KMP_GTID_DNE; 135 136 #ifdef KMP_TDATA_GTID 137 if (TCR_4(__kmp_gtid_mode) >= 3) { 138 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 139 return __kmp_gtid; 140 } 141 #endif 142 if (TCR_4(__kmp_gtid_mode) >= 2) { 143 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 144 return __kmp_gtid_get_specific(); 145 } 146 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 147 148 stack_addr = (char *)&stack_data; 149 other_threads = __kmp_threads; 150 151 /* ATT: The code below is a source of potential bugs due to unsynchronized 152 access to __kmp_threads array. For example: 153 1. Current thread loads other_threads[i] to thr and checks it, it is 154 non-NULL. 155 2. Current thread is suspended by OS. 156 3. Another thread unregisters and finishes (debug versions of free() 157 may fill memory with something like 0xEF). 158 4. Current thread is resumed. 159 5. Current thread reads junk from *thr. 160 TODO: Fix it. --ln */ 161 162 for (i = 0; i < __kmp_threads_capacity; i++) { 163 164 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 165 if (!thr) 166 continue; 167 168 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 169 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 170 171 /* stack grows down -- search through all of the active threads */ 172 173 if (stack_addr <= stack_base) { 174 size_t stack_diff = stack_base - stack_addr; 175 176 if (stack_diff <= stack_size) { 177 /* The only way we can be closer than the allocated */ 178 /* stack size is if we are running on this thread. */ 179 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 180 return i; 181 } 182 } 183 } 184 185 /* get specific to try and determine our gtid */ 186 KA_TRACE(1000, 187 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 188 "thread, using TLS\n")); 189 i = __kmp_gtid_get_specific(); 190 191 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 192 193 /* if we havn't been assigned a gtid, then return code */ 194 if (i < 0) 195 return i; 196 197 /* dynamically updated stack window for uber threads to avoid get_specific 198 call */ 199 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 200 KMP_FATAL(StackOverflow, i); 201 } 202 203 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 204 if (stack_addr > stack_base) { 205 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 206 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 207 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 208 stack_base); 209 } else { 210 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 211 stack_base - stack_addr); 212 } 213 214 /* Reprint stack bounds for ubermaster since they have been refined */ 215 if (__kmp_storage_map) { 216 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 217 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 218 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 219 other_threads[i]->th.th_info.ds.ds_stacksize, 220 "th_%d stack (refinement)", i); 221 } 222 return i; 223 } 224 225 int __kmp_get_global_thread_id_reg() { 226 int gtid; 227 228 if (!__kmp_init_serial) { 229 gtid = KMP_GTID_DNE; 230 } else 231 #ifdef KMP_TDATA_GTID 232 if (TCR_4(__kmp_gtid_mode) >= 3) { 233 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 234 gtid = __kmp_gtid; 235 } else 236 #endif 237 if (TCR_4(__kmp_gtid_mode) >= 2) { 238 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 239 gtid = __kmp_gtid_get_specific(); 240 } else { 241 KA_TRACE(1000, 242 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 243 gtid = __kmp_get_global_thread_id(); 244 } 245 246 /* we must be a new uber master sibling thread */ 247 if (gtid == KMP_GTID_DNE) { 248 KA_TRACE(10, 249 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 250 "Registering a new gtid.\n")); 251 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 252 if (!__kmp_init_serial) { 253 __kmp_do_serial_initialize(); 254 gtid = __kmp_gtid_get_specific(); 255 } else { 256 gtid = __kmp_register_root(FALSE); 257 } 258 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 259 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 260 } 261 262 KMP_DEBUG_ASSERT(gtid >= 0); 263 264 return gtid; 265 } 266 267 /* caller must hold forkjoin_lock */ 268 void __kmp_check_stack_overlap(kmp_info_t *th) { 269 int f; 270 char *stack_beg = NULL; 271 char *stack_end = NULL; 272 int gtid; 273 274 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 275 if (__kmp_storage_map) { 276 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 277 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 278 279 gtid = __kmp_gtid_from_thread(th); 280 281 if (gtid == KMP_GTID_MONITOR) { 282 __kmp_print_storage_map_gtid( 283 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 284 "th_%s stack (%s)", "mon", 285 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 286 } else { 287 __kmp_print_storage_map_gtid( 288 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 289 "th_%d stack (%s)", gtid, 290 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 291 } 292 } 293 294 /* No point in checking ubermaster threads since they use refinement and 295 * cannot overlap */ 296 gtid = __kmp_gtid_from_thread(th); 297 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 298 KA_TRACE(10, 299 ("__kmp_check_stack_overlap: performing extensive checking\n")); 300 if (stack_beg == NULL) { 301 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 302 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 303 } 304 305 for (f = 0; f < __kmp_threads_capacity; f++) { 306 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 307 308 if (f_th && f_th != th) { 309 char *other_stack_end = 310 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 311 char *other_stack_beg = 312 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 313 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 314 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 315 316 /* Print the other stack values before the abort */ 317 if (__kmp_storage_map) 318 __kmp_print_storage_map_gtid( 319 -1, other_stack_beg, other_stack_end, 320 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 321 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 322 323 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 324 __kmp_msg_null); 325 } 326 } 327 } 328 } 329 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 330 } 331 332 /* ------------------------------------------------------------------------ */ 333 334 void __kmp_infinite_loop(void) { 335 static int done = FALSE; 336 337 while (!done) { 338 KMP_YIELD(TRUE); 339 } 340 } 341 342 #define MAX_MESSAGE 512 343 344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 345 char const *format, ...) { 346 char buffer[MAX_MESSAGE]; 347 va_list ap; 348 349 va_start(ap, format); 350 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 351 p2, (unsigned long)size, format); 352 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 353 __kmp_vprintf(kmp_err, buffer, ap); 354 #if KMP_PRINT_DATA_PLACEMENT 355 int node; 356 if (gtid >= 0) { 357 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 358 if (__kmp_storage_map_verbose) { 359 node = __kmp_get_host_node(p1); 360 if (node < 0) /* doesn't work, so don't try this next time */ 361 __kmp_storage_map_verbose = FALSE; 362 else { 363 char *last; 364 int lastNode; 365 int localProc = __kmp_get_cpu_from_gtid(gtid); 366 367 const int page_size = KMP_GET_PAGE_SIZE(); 368 369 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 370 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 371 if (localProc >= 0) 372 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 373 localProc >> 1); 374 else 375 __kmp_printf_no_lock(" GTID %d\n", gtid); 376 #if KMP_USE_PRCTL 377 /* The more elaborate format is disabled for now because of the prctl 378 * hanging bug. */ 379 do { 380 last = p1; 381 lastNode = node; 382 /* This loop collates adjacent pages with the same host node. 
*/ 383 do { 384 (char *)p1 += page_size; 385 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 386 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 387 lastNode); 388 } while (p1 <= p2); 389 #else 390 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 391 (char *)p1 + (page_size - 1), 392 __kmp_get_host_node(p1)); 393 if (p1 < p2) { 394 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 395 (char *)p2 + (page_size - 1), 396 __kmp_get_host_node(p2)); 397 } 398 #endif 399 } 400 } 401 } else 402 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 403 } 404 #endif /* KMP_PRINT_DATA_PLACEMENT */ 405 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 406 } 407 408 void __kmp_warn(char const *format, ...) { 409 char buffer[MAX_MESSAGE]; 410 va_list ap; 411 412 if (__kmp_generate_warnings == kmp_warnings_off) { 413 return; 414 } 415 416 va_start(ap, format); 417 418 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 419 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 420 __kmp_vprintf(kmp_err, buffer, ap); 421 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 422 423 va_end(ap); 424 } 425 426 void __kmp_abort_process() { 427 // Later threads may stall here, but that's ok because abort() will kill them. 428 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 429 430 if (__kmp_debug_buf) { 431 __kmp_dump_debug_buffer(); 432 } 433 434 if (KMP_OS_WINDOWS) { 435 // Let other threads know of abnormal termination and prevent deadlock 436 // if abort happened during library initialization or shutdown 437 __kmp_global.g.g_abort = SIGABRT; 438 439 /* On Windows* OS by default abort() causes pop-up error box, which stalls 440 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 441 boxes. _set_abort_behavior() works well, but this function is not 442 available in VS7 (this is not problem for DLL, but it is a problem for 443 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 444 help, at least in some versions of MS C RTL. 445 446 It seems following sequence is the only way to simulate abort() and 447 avoid pop-up error box. */ 448 raise(SIGABRT); 449 _exit(3); // Just in case, if signal ignored, exit anyway. 450 } else { 451 __kmp_unregister_library(); 452 abort(); 453 } 454 455 __kmp_infinite_loop(); 456 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 457 458 } // __kmp_abort_process 459 460 void __kmp_abort_thread(void) { 461 // TODO: Eliminate g_abort global variable and this function. 462 // In case of abort just call abort(), it will kill all the threads. 463 __kmp_infinite_loop(); 464 } // __kmp_abort_thread 465 466 /* Print out the storage map for the major kmp_info_t thread data structures 467 that are allocated together. 
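
   For illustration (all values hypothetical), a call such as

     __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t),
                                  "th_%d", gtid);

   emits one line on kmp_err of the form

     OMP storage map: <p1> <p2>  <size> th_0

   i.e. the begin/end addresses, the size, and the formatted name, following
   the "OMP storage map: %p %p%8lu %s" format used by
   __kmp_print_storage_map_gtid above.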
*/ 468 469 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 470 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 471 gtid); 472 473 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 474 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 475 476 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 477 sizeof(kmp_local_t), "th_%d.th_local", gtid); 478 479 __kmp_print_storage_map_gtid( 480 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 481 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 482 483 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 484 &thr->th.th_bar[bs_plain_barrier + 1], 485 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 486 gtid); 487 488 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 489 &thr->th.th_bar[bs_forkjoin_barrier + 1], 490 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 491 gtid); 492 493 #if KMP_FAST_REDUCTION_BARRIER 494 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 495 &thr->th.th_bar[bs_reduction_barrier + 1], 496 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 497 gtid); 498 #endif // KMP_FAST_REDUCTION_BARRIER 499 } 500 501 /* Print out the storage map for the major kmp_team_t team data structures 502 that are allocated together. */ 503 504 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 505 int team_id, int num_thr) { 506 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 507 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 508 header, team_id); 509 510 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 511 &team->t.t_bar[bs_last_barrier], 512 sizeof(kmp_balign_team_t) * bs_last_barrier, 513 "%s_%d.t_bar", header, team_id); 514 515 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 516 &team->t.t_bar[bs_plain_barrier + 1], 517 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 518 header, team_id); 519 520 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 521 &team->t.t_bar[bs_forkjoin_barrier + 1], 522 sizeof(kmp_balign_team_t), 523 "%s_%d.t_bar[forkjoin]", header, team_id); 524 525 #if KMP_FAST_REDUCTION_BARRIER 526 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 527 &team->t.t_bar[bs_reduction_barrier + 1], 528 sizeof(kmp_balign_team_t), 529 "%s_%d.t_bar[reduction]", header, team_id); 530 #endif // KMP_FAST_REDUCTION_BARRIER 531 532 __kmp_print_storage_map_gtid( 533 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 534 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 535 536 __kmp_print_storage_map_gtid( 537 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 538 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 539 540 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 541 &team->t.t_disp_buffer[num_disp_buff], 542 sizeof(dispatch_shared_info_t) * num_disp_buff, 543 "%s_%d.t_disp_buffer", header, team_id); 544 } 545 546 static void __kmp_init_allocator() { 547 __kmp_init_memkind(); 548 __kmp_init_target_mem(); 549 } 550 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 551 552 /* ------------------------------------------------------------------------ */ 553 554 #if KMP_DYNAMIC_LIB 555 #if KMP_OS_WINDOWS 556 557 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 558 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 559 560 
switch (fdwReason) { 561 562 case DLL_PROCESS_ATTACH: 563 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 564 565 return TRUE; 566 567 case DLL_PROCESS_DETACH: 568 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 569 570 // According to Windows* documentation for DllMain entry point: 571 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 572 // lpReserved == NULL when FreeLibrary() is called, 573 // lpReserved != NULL when the process is terminated. 574 // When FreeLibrary() is called, worker threads remain alive. So the 575 // runtime's state is consistent and executing proper shutdown is OK. 576 // When the process is terminated, worker threads have exited or been 577 // forcefully terminated by the OS and only the shutdown thread remains. 578 // This can leave the runtime in an inconsistent state. 579 // Hence, only attempt proper cleanup when FreeLibrary() is called. 580 // Otherwise, rely on OS to reclaim resources. 581 if (lpReserved == NULL) 582 __kmp_internal_end_library(__kmp_gtid_get_specific()); 583 584 return TRUE; 585 586 case DLL_THREAD_ATTACH: 587 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 588 589 /* if we want to register new siblings all the time here call 590 * __kmp_get_gtid(); */ 591 return TRUE; 592 593 case DLL_THREAD_DETACH: 594 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 595 596 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 597 return TRUE; 598 } 599 600 return TRUE; 601 } 602 603 #endif /* KMP_OS_WINDOWS */ 604 #endif /* KMP_DYNAMIC_LIB */ 605 606 /* __kmp_parallel_deo -- Wait until it's our turn. */ 607 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 608 int gtid = *gtid_ref; 609 #ifdef BUILD_PARALLEL_ORDERED 610 kmp_team_t *team = __kmp_team_from_gtid(gtid); 611 #endif /* BUILD_PARALLEL_ORDERED */ 612 613 if (__kmp_env_consistency_check) { 614 if (__kmp_threads[gtid]->th.th_root->r.r_active) 615 #if KMP_USE_DYNAMIC_LOCK 616 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 617 #else 618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 619 #endif 620 } 621 #ifdef BUILD_PARALLEL_ORDERED 622 if (!team->t.t_serialized) { 623 KMP_MB(); 624 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 625 NULL); 626 KMP_MB(); 627 } 628 #endif /* BUILD_PARALLEL_ORDERED */ 629 } 630 631 /* __kmp_parallel_dxo -- Signal the next task. */ 632 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 633 int gtid = *gtid_ref; 634 #ifdef BUILD_PARALLEL_ORDERED 635 int tid = __kmp_tid_from_gtid(gtid); 636 kmp_team_t *team = __kmp_team_from_gtid(gtid); 637 #endif /* BUILD_PARALLEL_ORDERED */ 638 639 if (__kmp_env_consistency_check) { 640 if (__kmp_threads[gtid]->th.th_root->r.r_active) 641 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 642 } 643 #ifdef BUILD_PARALLEL_ORDERED 644 if (!team->t.t_serialized) { 645 KMP_MB(); /* Flush all pending memory write invalidates. */ 646 647 /* use the tid of the next thread in this team */ 648 /* TODO replace with general release procedure */ 649 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 650 651 KMP_MB(); /* Flush all pending memory write invalidates. 
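
       Taken together, __kmp_parallel_deo and __kmp_parallel_dxo implement the
       ordered hand-off: a thread waits in deo until t_ordered.dt.t_value
       equals its own tid, executes the ordered body, and dxo then publishes
       (tid + 1) % t_nproc so the next thread in the team can proceed. At user
       level this serves, for example, a loop of the form

         #pragma omp for ordered
         for (int i = 0; i < n; i++) {
           #pragma omp ordered
           body(i); // body() and n are placeholders
         }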
*/ 652 } 653 #endif /* BUILD_PARALLEL_ORDERED */ 654 } 655 656 /* ------------------------------------------------------------------------ */ 657 /* The BARRIER for a SINGLE process section is always explicit */ 658 659 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 660 int status; 661 kmp_info_t *th; 662 kmp_team_t *team; 663 664 if (!TCR_4(__kmp_init_parallel)) 665 __kmp_parallel_initialize(); 666 __kmp_resume_if_soft_paused(); 667 668 th = __kmp_threads[gtid]; 669 team = th->th.th_team; 670 status = 0; 671 672 th->th.th_ident = id_ref; 673 674 if (team->t.t_serialized) { 675 status = 1; 676 } else { 677 kmp_int32 old_this = th->th.th_local.this_construct; 678 679 ++th->th.th_local.this_construct; 680 /* try to set team count to thread count--success means thread got the 681 single block */ 682 /* TODO: Should this be acquire or release? */ 683 if (team->t.t_construct == old_this) { 684 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 685 th->th.th_local.this_construct); 686 } 687 #if USE_ITT_BUILD 688 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 689 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 690 team->t.t_active_level == 1) { 691 // Only report metadata by primary thread of active team at level 1 692 __kmp_itt_metadata_single(id_ref); 693 } 694 #endif /* USE_ITT_BUILD */ 695 } 696 697 if (__kmp_env_consistency_check) { 698 if (status && push_ws) { 699 __kmp_push_workshare(gtid, ct_psingle, id_ref); 700 } else { 701 __kmp_check_workshare(gtid, ct_psingle, id_ref); 702 } 703 } 704 #if USE_ITT_BUILD 705 if (status) { 706 __kmp_itt_single_start(gtid); 707 } 708 #endif /* USE_ITT_BUILD */ 709 return status; 710 } 711 712 void __kmp_exit_single(int gtid) { 713 #if USE_ITT_BUILD 714 __kmp_itt_single_end(gtid); 715 #endif /* USE_ITT_BUILD */ 716 if (__kmp_env_consistency_check) 717 __kmp_pop_workshare(gtid, ct_psingle, NULL); 718 } 719 720 /* determine if we can go parallel or must use a serialized parallel region and 721 * how many threads we can use 722 * set_nproc is the number of threads requested for the team 723 * returns 0 if we should serialize or only use one thread, 724 * otherwise the number of threads to use 725 * The forkjoin lock is held by the caller. */ 726 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 727 int master_tid, int set_nthreads, 728 int enter_teams) { 729 int capacity; 730 int new_nthreads; 731 KMP_DEBUG_ASSERT(__kmp_init_serial); 732 KMP_DEBUG_ASSERT(root && parent_team); 733 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 734 735 // If dyn-var is set, dynamically adjust the number of desired threads, 736 // according to the method specified by dynamic_mode. 737 new_nthreads = set_nthreads; 738 if (!get__dynamic_2(parent_team, master_tid)) { 739 ; 740 } 741 #ifdef USE_LOAD_BALANCE 742 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 743 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 744 if (new_nthreads == 1) { 745 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 746 "reservation to 1 thread\n", 747 master_tid)); 748 return 1; 749 } 750 if (new_nthreads < set_nthreads) { 751 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 752 "reservation to %d threads\n", 753 master_tid, new_nthreads)); 754 } 755 } 756 #endif /* USE_LOAD_BALANCE */ 757 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 758 new_nthreads = __kmp_avail_proc - __kmp_nth + 759 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 760 if (new_nthreads <= 1) { 761 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 762 "reservation to 1 thread\n", 763 master_tid)); 764 return 1; 765 } 766 if (new_nthreads < set_nthreads) { 767 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 768 "reservation to %d threads\n", 769 master_tid, new_nthreads)); 770 } else { 771 new_nthreads = set_nthreads; 772 } 773 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 774 if (set_nthreads > 2) { 775 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 776 new_nthreads = (new_nthreads % set_nthreads) + 1; 777 if (new_nthreads == 1) { 778 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 779 "reservation to 1 thread\n", 780 master_tid)); 781 return 1; 782 } 783 if (new_nthreads < set_nthreads) { 784 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 785 "reservation to %d threads\n", 786 master_tid, new_nthreads)); 787 } 788 } 789 } else { 790 KMP_ASSERT(0); 791 } 792 793 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 794 if (__kmp_nth + new_nthreads - 795 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 796 __kmp_max_nth) { 797 int tl_nthreads = __kmp_max_nth - __kmp_nth + 798 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 799 if (tl_nthreads <= 0) { 800 tl_nthreads = 1; 801 } 802 803 // If dyn-var is false, emit a 1-time warning. 804 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 805 __kmp_reserve_warn = 1; 806 __kmp_msg(kmp_ms_warning, 807 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 808 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 809 } 810 if (tl_nthreads == 1) { 811 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 812 "reduced reservation to 1 thread\n", 813 master_tid)); 814 return 1; 815 } 816 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 817 "reservation to %d threads\n", 818 master_tid, tl_nthreads)); 819 new_nthreads = tl_nthreads; 820 } 821 822 // Respect OMP_THREAD_LIMIT 823 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 824 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 825 if (cg_nthreads + new_nthreads - 826 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 827 max_cg_threads) { 828 int tl_nthreads = max_cg_threads - cg_nthreads + 829 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 830 if (tl_nthreads <= 0) { 831 tl_nthreads = 1; 832 } 833 834 // If dyn-var is false, emit a 1-time warning. 835 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 836 __kmp_reserve_warn = 1; 837 __kmp_msg(kmp_ms_warning, 838 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 839 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 840 } 841 if (tl_nthreads == 1) { 842 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 843 "reduced reservation to 1 thread\n", 844 master_tid)); 845 return 1; 846 } 847 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 848 "reservation to %d threads\n", 849 master_tid, tl_nthreads)); 850 new_nthreads = tl_nthreads; 851 } 852 853 // Check if the threads array is large enough, or needs expanding. 854 // See comment in __kmp_register_root() about the adjustment if 855 // __kmp_threads[0] == NULL. 
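  // Illustrative, hypothetical numbers for the expansion below: with
  // __kmp_threads_capacity == 64 (ignoring the two capacity adjustments that
  // follow), __kmp_nth == 60, new_nthreads == 8, and 1 thread already counted
  // for the root's team, slotsRequired = 60 + 8 - 1 - 64 = 3, so
  // __kmp_expand_threads(3) is asked to grow the __kmp_threads array.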
856 capacity = __kmp_threads_capacity; 857 if (TCR_PTR(__kmp_threads[0]) == NULL) { 858 --capacity; 859 } 860 // If it is not for initializing the hidden helper team, we need to take 861 // __kmp_hidden_helper_threads_num out of the capacity because it is included 862 // in __kmp_threads_capacity. 863 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 864 capacity -= __kmp_hidden_helper_threads_num; 865 } 866 if (__kmp_nth + new_nthreads - 867 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 868 capacity) { 869 // Expand the threads array. 870 int slotsRequired = __kmp_nth + new_nthreads - 871 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 872 capacity; 873 int slotsAdded = __kmp_expand_threads(slotsRequired); 874 if (slotsAdded < slotsRequired) { 875 // The threads array was not expanded enough. 876 new_nthreads -= (slotsRequired - slotsAdded); 877 KMP_ASSERT(new_nthreads >= 1); 878 879 // If dyn-var is false, emit a 1-time warning. 880 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 881 __kmp_reserve_warn = 1; 882 if (__kmp_tp_cached) { 883 __kmp_msg(kmp_ms_warning, 884 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 885 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 886 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 887 } else { 888 __kmp_msg(kmp_ms_warning, 889 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 890 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 891 } 892 } 893 } 894 } 895 896 #ifdef KMP_DEBUG 897 if (new_nthreads == 1) { 898 KC_TRACE(10, 899 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 900 "dead roots and rechecking; requested %d threads\n", 901 __kmp_get_gtid(), set_nthreads)); 902 } else { 903 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 904 " %d threads\n", 905 __kmp_get_gtid(), new_nthreads, set_nthreads)); 906 } 907 #endif // KMP_DEBUG 908 return new_nthreads; 909 } 910 911 /* Allocate threads from the thread pool and assign them to the new team. We are 912 assured that there are enough threads available, because we checked on that 913 earlier within critical section forkjoin */ 914 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 915 kmp_info_t *master_th, int master_gtid) { 916 int i; 917 int use_hot_team; 918 919 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 920 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 921 KMP_MB(); 922 923 /* first, let's setup the primary thread */ 924 master_th->th.th_info.ds.ds_tid = 0; 925 master_th->th.th_team = team; 926 master_th->th.th_team_nproc = team->t.t_nproc; 927 master_th->th.th_team_master = master_th; 928 master_th->th.th_team_serialized = FALSE; 929 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 930 931 /* make sure we are not the optimized hot team */ 932 #if KMP_NESTED_HOT_TEAMS 933 use_hot_team = 0; 934 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 935 if (hot_teams) { // hot teams array is not allocated if 936 // KMP_HOT_TEAMS_MAX_LEVEL=0 937 int level = team->t.t_active_level - 1; // index in array of hot teams 938 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
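      // (Summary of the two checks below) t_active_level was not bumped for
      // the league of team masters, nor for the workers' team when this
      // parallel sits at the teams level, so each case adds 1 back before
      // hot_teams[] is indexed.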
939 if (master_th->th.th_teams_size.nteams > 1) { 940 ++level; // level was not increased in teams construct for 941 // team_of_masters 942 } 943 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 944 master_th->th.th_teams_level == team->t.t_level) { 945 ++level; // level was not increased in teams construct for 946 // team_of_workers before the parallel 947 } // team->t.t_level will be increased inside parallel 948 } 949 if (level < __kmp_hot_teams_max_level) { 950 if (hot_teams[level].hot_team) { 951 // hot team has already been allocated for given level 952 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 953 use_hot_team = 1; // the team is ready to use 954 } else { 955 use_hot_team = 0; // AC: threads are not allocated yet 956 hot_teams[level].hot_team = team; // remember new hot team 957 hot_teams[level].hot_team_nth = team->t.t_nproc; 958 } 959 } else { 960 use_hot_team = 0; 961 } 962 } 963 #else 964 use_hot_team = team == root->r.r_hot_team; 965 #endif 966 if (!use_hot_team) { 967 968 /* install the primary thread */ 969 team->t.t_threads[0] = master_th; 970 __kmp_initialize_info(master_th, team, 0, master_gtid); 971 972 /* now, install the worker threads */ 973 for (i = 1; i < team->t.t_nproc; i++) { 974 975 /* fork or reallocate a new thread and install it in team */ 976 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 977 team->t.t_threads[i] = thr; 978 KMP_DEBUG_ASSERT(thr); 979 KMP_DEBUG_ASSERT(thr->th.th_team == team); 980 /* align team and thread arrived states */ 981 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 982 "T#%d(%d:%d) join =%llu, plain=%llu\n", 983 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 984 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 985 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 986 team->t.t_bar[bs_plain_barrier].b_arrived)); 987 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 988 thr->th.th_teams_level = master_th->th.th_teams_level; 989 thr->th.th_teams_size = master_th->th.th_teams_size; 990 { // Initialize threads' barrier data. 991 int b; 992 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 993 for (b = 0; b < bs_last_barrier; ++b) { 994 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 995 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 996 #if USE_DEBUGGER 997 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 998 #endif 999 } 1000 } 1001 } 1002 1003 #if KMP_AFFINITY_SUPPORTED 1004 __kmp_partition_places(team); 1005 #endif 1006 } 1007 1008 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1009 for (i = 0; i < team->t.t_nproc; i++) { 1010 kmp_info_t *thr = team->t.t_threads[i]; 1011 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1012 thr->th.th_prev_level != team->t.t_level) { 1013 team->t.t_display_affinity = 1; 1014 break; 1015 } 1016 } 1017 } 1018 1019 KMP_MB(); 1020 } 1021 1022 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1023 // Propagate any changes to the floating point control registers out to the team 1024 // We try to avoid unnecessary writes to the relevant cache line in the team 1025 // structure, so we don't make changes unless they are needed. 
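// Illustrative sketch of the check-before-write idiom used below:
// KMP_CHECK_UPDATE (defined in kmp.h) is, roughly,
//
//   #define KMP_CHECK_UPDATE(a, b) if ((a) != (b)) (a) = (b)
//
// so when the team already holds the primary thread's FP control values, the
// comparison only reads the cache line and never dirties it.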
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to
    // know whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only restore the FP control registers if they have been changed in the
    // team by the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single primary thread.
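
   For illustration (user-level view, hypothetical code): regions such as

     omp_set_num_threads(1);
     #pragma omp parallel
     { work(); }

   or a nested parallel encountered while nesting is disabled take this path,
   so the encountering thread executes the region itself and no worker threads
   are involved.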
*/ 1089 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1090 kmp_info_t *this_thr; 1091 kmp_team_t *serial_team; 1092 1093 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1094 1095 /* Skip all this code for autopar serialized loops since it results in 1096 unacceptable overhead */ 1097 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1098 return; 1099 1100 if (!TCR_4(__kmp_init_parallel)) 1101 __kmp_parallel_initialize(); 1102 __kmp_resume_if_soft_paused(); 1103 1104 this_thr = __kmp_threads[global_tid]; 1105 serial_team = this_thr->th.th_serial_team; 1106 1107 /* utilize the serialized team held by this thread */ 1108 KMP_DEBUG_ASSERT(serial_team); 1109 KMP_MB(); 1110 1111 if (__kmp_tasking_mode != tskm_immediate_exec) { 1112 KMP_DEBUG_ASSERT( 1113 this_thr->th.th_task_team == 1114 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1115 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1116 NULL); 1117 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1118 "team %p, new task_team = NULL\n", 1119 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1120 this_thr->th.th_task_team = NULL; 1121 } 1122 1123 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1124 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1125 proc_bind = proc_bind_false; 1126 } else if (proc_bind == proc_bind_default) { 1127 // No proc_bind clause was specified, so use the current value 1128 // of proc-bind-var for this parallel region. 1129 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1130 } 1131 // Reset for next parallel region 1132 this_thr->th.th_set_proc_bind = proc_bind_default; 1133 1134 #if OMPT_SUPPORT 1135 ompt_data_t ompt_parallel_data = ompt_data_none; 1136 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1137 if (ompt_enabled.enabled && 1138 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1139 1140 ompt_task_info_t *parent_task_info; 1141 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1142 1143 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1144 if (ompt_enabled.ompt_callback_parallel_begin) { 1145 int team_size = 1; 1146 1147 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1148 &(parent_task_info->task_data), &(parent_task_info->frame), 1149 &ompt_parallel_data, team_size, 1150 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1151 } 1152 } 1153 #endif // OMPT_SUPPORT 1154 1155 if (this_thr->th.th_team != serial_team) { 1156 // Nested level will be an index in the nested nthreads array 1157 int level = this_thr->th.th_team->t.t_level; 1158 1159 if (serial_team->t.t_serialized) { 1160 /* this serial team was already used 1161 TODO increase performance by making this locks more specific */ 1162 kmp_team_t *new_team; 1163 1164 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1165 1166 new_team = 1167 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1168 #if OMPT_SUPPORT 1169 ompt_parallel_data, 1170 #endif 1171 proc_bind, &this_thr->th.th_current_task->td_icvs, 1172 0 USE_NESTED_HOT_ARG(NULL)); 1173 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1174 KMP_ASSERT(new_team); 1175 1176 /* setup new serialized team and install it */ 1177 new_team->t.t_threads[0] = this_thr; 1178 new_team->t.t_parent = this_thr->th.th_team; 1179 serial_team = new_team; 1180 this_thr->th.th_serial_team = serial_team; 1181 1182 KF_TRACE( 1183 10, 1184 ("__kmpc_serialized_parallel: T#%d 
allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }

    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
1237 #endif 1238 this_thr->th.th_info.ds.ds_tid = 0; 1239 1240 /* set thread cache values */ 1241 this_thr->th.th_team_nproc = 1; 1242 this_thr->th.th_team_master = this_thr; 1243 this_thr->th.th_team_serialized = 1; 1244 1245 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1246 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1247 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1248 1249 propagateFPControl(serial_team); 1250 1251 /* check if we need to allocate dispatch buffers stack */ 1252 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1253 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1254 serial_team->t.t_dispatch->th_disp_buffer = 1255 (dispatch_private_info_t *)__kmp_allocate( 1256 sizeof(dispatch_private_info_t)); 1257 } 1258 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1259 1260 KMP_MB(); 1261 1262 } else { 1263 /* this serialized team is already being used, 1264 * that's fine, just add another nested level */ 1265 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1266 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1267 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1268 ++serial_team->t.t_serialized; 1269 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1270 1271 // Nested level will be an index in the nested nthreads array 1272 int level = this_thr->th.th_team->t.t_level; 1273 // Thread value exists in the nested nthreads array for the next nested 1274 // level 1275 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1276 this_thr->th.th_current_task->td_icvs.nproc = 1277 __kmp_nested_nth.nth[level + 1]; 1278 } 1279 serial_team->t.t_level++; 1280 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1281 "of serial team %p to %d\n", 1282 global_tid, serial_team, serial_team->t.t_level)); 1283 1284 /* allocate/push dispatch buffers stack */ 1285 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1286 { 1287 dispatch_private_info_t *disp_buffer = 1288 (dispatch_private_info_t *)__kmp_allocate( 1289 sizeof(dispatch_private_info_t)); 1290 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1291 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1292 } 1293 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1294 1295 KMP_MB(); 1296 } 1297 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1298 1299 // Perform the display affinity functionality for 1300 // serialized parallel regions 1301 if (__kmp_display_affinity) { 1302 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1303 this_thr->th.th_prev_num_threads != 1) { 1304 // NULL means use the affinity-format-var ICV 1305 __kmp_aux_display_affinity(global_tid, NULL); 1306 this_thr->th.th_prev_level = serial_team->t.t_level; 1307 this_thr->th.th_prev_num_threads = 1; 1308 } 1309 } 1310 1311 if (__kmp_env_consistency_check) 1312 __kmp_push_parallel(global_tid, NULL); 1313 #if OMPT_SUPPORT 1314 serial_team->t.ompt_team_info.master_return_address = codeptr; 1315 if (ompt_enabled.enabled && 1316 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1317 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1318 OMPT_GET_FRAME_ADDRESS(0); 1319 1320 ompt_lw_taskteam_t lw_taskteam; 1321 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1322 &ompt_parallel_data, codeptr); 1323 1324 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1325 // don't use lw_taskteam after linking. 
content was swaped 1326 1327 /* OMPT implicit task begin */ 1328 if (ompt_enabled.ompt_callback_implicit_task) { 1329 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1330 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1331 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1332 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1333 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1334 __kmp_tid_from_gtid(global_tid); 1335 } 1336 1337 /* OMPT state */ 1338 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1339 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1340 OMPT_GET_FRAME_ADDRESS(0); 1341 } 1342 #endif 1343 } 1344 1345 /* most of the work for a fork */ 1346 /* return true if we really went parallel, false if serialized */ 1347 int __kmp_fork_call(ident_t *loc, int gtid, 1348 enum fork_context_e call_context, // Intel, GNU, ... 1349 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1350 kmp_va_list ap) { 1351 void **argv; 1352 int i; 1353 int master_tid; 1354 int master_this_cons; 1355 kmp_team_t *team; 1356 kmp_team_t *parent_team; 1357 kmp_info_t *master_th; 1358 kmp_root_t *root; 1359 int nthreads; 1360 int master_active; 1361 int master_set_numthreads; 1362 int level; 1363 int active_level; 1364 int teams_level; 1365 #if KMP_NESTED_HOT_TEAMS 1366 kmp_hot_team_ptr_t **p_hot_teams; 1367 #endif 1368 { // KMP_TIME_BLOCK 1369 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1370 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1371 1372 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1373 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1374 /* Some systems prefer the stack for the root thread(s) to start with */ 1375 /* some gap from the parent stack to prevent false sharing. */ 1376 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1377 /* These 2 lines below are so this does not get optimized out */ 1378 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1379 __kmp_stkpadding += (short)((kmp_int64)dummy); 1380 } 1381 1382 /* initialize if needed */ 1383 KMP_DEBUG_ASSERT( 1384 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1385 if (!TCR_4(__kmp_init_parallel)) 1386 __kmp_parallel_initialize(); 1387 __kmp_resume_if_soft_paused(); 1388 1389 /* setup current data */ 1390 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1391 // shutdown 1392 parent_team = master_th->th.th_team; 1393 master_tid = master_th->th.th_info.ds.ds_tid; 1394 master_this_cons = master_th->th.th_local.this_construct; 1395 root = master_th->th.th_root; 1396 master_active = root->r.r_active; 1397 master_set_numthreads = master_th->th.th_set_nproc; 1398 1399 #if OMPT_SUPPORT 1400 ompt_data_t ompt_parallel_data = ompt_data_none; 1401 ompt_data_t *parent_task_data; 1402 ompt_frame_t *ompt_frame; 1403 ompt_data_t *implicit_task_data; 1404 void *return_address = NULL; 1405 1406 if (ompt_enabled.enabled) { 1407 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1408 NULL, NULL); 1409 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1410 } 1411 #endif 1412 1413 // Assign affinity to root thread if it hasn't happened yet 1414 __kmp_assign_root_init_mask(); 1415 1416 // Nested level will be an index in the nested nthreads array 1417 level = parent_team->t.t_level; 1418 // used to launch non-serial teams even if nested is not allowed 1419 active_level = parent_team->t.t_active_level; 1420 // needed to check nesting inside the teams 1421 teams_level = master_th->th.th_teams_level; 1422 #if 
KMP_NESTED_HOT_TEAMS 1423 p_hot_teams = &master_th->th.th_hot_teams; 1424 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1425 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1426 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1427 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1428 // it is either actual or not needed (when active_level > 0) 1429 (*p_hot_teams)[0].hot_team_nth = 1; 1430 } 1431 #endif 1432 1433 #if OMPT_SUPPORT 1434 if (ompt_enabled.enabled) { 1435 if (ompt_enabled.ompt_callback_parallel_begin) { 1436 int team_size = master_set_numthreads 1437 ? master_set_numthreads 1438 : get__nproc_2(parent_team, master_tid); 1439 int flags = OMPT_INVOKER(call_context) | 1440 ((microtask == (microtask_t)__kmp_teams_master) 1441 ? ompt_parallel_league 1442 : ompt_parallel_team); 1443 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1444 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1445 return_address); 1446 } 1447 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1448 } 1449 #endif 1450 1451 master_th->th.th_ident = loc; 1452 1453 if (master_th->th.th_teams_microtask && ap && 1454 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1455 // AC: This is start of parallel that is nested inside teams construct. 1456 // The team is actual (hot), all workers are ready at the fork barrier. 1457 // No lock needed to initialize the team a bit, then free workers. 1458 parent_team->t.t_ident = loc; 1459 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1460 parent_team->t.t_argc = argc; 1461 argv = (void **)parent_team->t.t_argv; 1462 for (i = argc - 1; i >= 0; --i) 1463 *argv++ = va_arg(kmp_va_deref(ap), void *); 1464 // Increment our nested depth levels, but not increase the serialization 1465 if (parent_team == master_th->th.th_serial_team) { 1466 // AC: we are in serialized parallel 1467 __kmpc_serialized_parallel(loc, gtid); 1468 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1469 1470 if (call_context == fork_context_gnu) { 1471 // AC: need to decrement t_serialized for enquiry functions to work 1472 // correctly, will restore at join time 1473 parent_team->t.t_serialized--; 1474 return TRUE; 1475 } 1476 1477 #if OMPD_SUPPORT 1478 parent_team->t.t_pkfn = microtask; 1479 #endif 1480 1481 #if OMPT_SUPPORT 1482 void *dummy; 1483 void **exit_frame_p; 1484 1485 ompt_lw_taskteam_t lw_taskteam; 1486 1487 if (ompt_enabled.enabled) { 1488 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1489 &ompt_parallel_data, return_address); 1490 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1491 1492 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1493 // don't use lw_taskteam after linking. 
content was swaped 1494 1495 /* OMPT implicit task begin */ 1496 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1497 if (ompt_enabled.ompt_callback_implicit_task) { 1498 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1499 __kmp_tid_from_gtid(gtid); 1500 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1501 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1502 implicit_task_data, 1, 1503 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1504 } 1505 1506 /* OMPT state */ 1507 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1508 } else { 1509 exit_frame_p = &dummy; 1510 } 1511 #endif 1512 // AC: need to decrement t_serialized for enquiry functions to work 1513 // correctly, will restore at join time 1514 parent_team->t.t_serialized--; 1515 1516 { 1517 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1518 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1519 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1520 #if OMPT_SUPPORT 1521 , 1522 exit_frame_p 1523 #endif 1524 ); 1525 } 1526 1527 #if OMPT_SUPPORT 1528 if (ompt_enabled.enabled) { 1529 *exit_frame_p = NULL; 1530 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1531 if (ompt_enabled.ompt_callback_implicit_task) { 1532 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1533 ompt_scope_end, NULL, implicit_task_data, 1, 1534 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1535 } 1536 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1537 __ompt_lw_taskteam_unlink(master_th); 1538 if (ompt_enabled.ompt_callback_parallel_end) { 1539 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1540 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1541 OMPT_INVOKER(call_context) | ompt_parallel_team, 1542 return_address); 1543 } 1544 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1545 } 1546 #endif 1547 return TRUE; 1548 } 1549 1550 parent_team->t.t_pkfn = microtask; 1551 parent_team->t.t_invoke = invoker; 1552 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1553 parent_team->t.t_active_level++; 1554 parent_team->t.t_level++; 1555 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1556 1557 #if OMPT_SUPPORT 1558 if (ompt_enabled.enabled) { 1559 ompt_lw_taskteam_t lw_taskteam; 1560 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1561 &ompt_parallel_data, return_address); 1562 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1563 } 1564 #endif 1565 1566 /* Change number of threads in the team if requested */ 1567 if (master_set_numthreads) { // The parallel has num_threads clause 1568 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1569 // AC: only can reduce number of threads dynamically, can't increase 1570 kmp_info_t **other_threads = parent_team->t.t_threads; 1571 parent_team->t.t_nproc = master_set_numthreads; 1572 for (i = 0; i < master_set_numthreads; ++i) { 1573 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1574 } 1575 // Keep extra threads hot in the team for possible next parallels 1576 } 1577 master_th->th.th_set_nproc = 0; 1578 } 1579 1580 #if USE_DEBUGGER 1581 if (__kmp_debugging) { // Let debugger override number of threads. 
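      // (Descriptive note) nth is the thread count the attached debugger asks
      // for; 0, handled just below, means the debugger does not want to change
      // the number of threads.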
1582 int nth = __kmp_omp_num_threads(loc); 1583 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1584 master_set_numthreads = nth; 1585 } 1586 } 1587 #endif 1588 1589 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1590 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1591 KMP_ITT_DEBUG) && 1592 __kmp_forkjoin_frames_mode == 3 && 1593 parent_team->t.t_active_level == 1 // only report frames at level 1 1594 && master_th->th.th_teams_size.nteams == 1) { 1595 kmp_uint64 tmp_time = __itt_get_timestamp(); 1596 master_th->th.th_frame_time = tmp_time; 1597 parent_team->t.t_region_time = tmp_time; 1598 } 1599 if (__itt_stack_caller_create_ptr) { 1600 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1601 // create new stack stitching id before entering fork barrier 1602 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1603 } 1604 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1605 1606 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1607 "master_th=%p, gtid=%d\n", 1608 root, parent_team, master_th, gtid)); 1609 __kmp_internal_fork(loc, gtid, parent_team); 1610 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1611 "master_th=%p, gtid=%d\n", 1612 root, parent_team, master_th, gtid)); 1613 1614 if (call_context == fork_context_gnu) 1615 return TRUE; 1616 1617 /* Invoke microtask for PRIMARY thread */ 1618 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1619 parent_team->t.t_id, parent_team->t.t_pkfn)); 1620 1621 if (!parent_team->t.t_invoke(gtid)) { 1622 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1623 } 1624 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1625 parent_team->t.t_id, parent_team->t.t_pkfn)); 1626 KMP_MB(); /* Flush all pending memory write invalidates. */ 1627 1628 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1629 1630 return TRUE; 1631 } // Parallel closely nested in teams construct 1632 1633 #if KMP_DEBUG 1634 if (__kmp_tasking_mode != tskm_immediate_exec) { 1635 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1636 parent_team->t.t_task_team[master_th->th.th_task_state]); 1637 } 1638 #endif 1639 1640 int enter_teams = 0; 1641 if (parent_team->t.t_active_level >= 1642 master_th->th.th_current_task->td_icvs.max_active_levels) { 1643 nthreads = 1; 1644 } else { 1645 enter_teams = ((ap == NULL && active_level == 0) || 1646 (ap && teams_level > 0 && teams_level == level)); 1647 nthreads = 1648 master_set_numthreads 1649 ? master_set_numthreads 1650 : get__nproc_2( 1651 parent_team, 1652 master_tid); // TODO: get nproc directly from current task 1653 1654 // Check if we need to take forkjoin lock? (no need for serialized 1655 // parallel out of teams construct). This code moved here from 1656 // __kmp_reserve_threads() to speedup nested serialized parallels. 1657 if (nthreads > 1) { 1658 if ((get__max_active_levels(master_th) == 1 && 1659 (root->r.r_in_parallel && !enter_teams)) || 1660 (__kmp_library == library_serial)) { 1661 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1662 " threads\n", 1663 gtid, nthreads)); 1664 nthreads = 1; 1665 } 1666 } 1667 if (nthreads > 1) { 1668 /* determine how many new threads we can use */ 1669 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1670 /* AC: If we execute teams from parallel region (on host), then teams 1671 should be created but each can only have 1 thread if nesting is 1672 disabled. 
If teams called from serial region, then teams and their 1673 threads should be created regardless of the nesting setting. */ 1674 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1675 nthreads, enter_teams); 1676 if (nthreads == 1) { 1677 // Free lock for single thread execution here; for multi-thread 1678 // execution it will be freed later after team of threads created 1679 // and initialized 1680 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1681 } 1682 } 1683 } 1684 KMP_DEBUG_ASSERT(nthreads > 0); 1685 1686 // If we temporarily changed the set number of threads then restore it now 1687 master_th->th.th_set_nproc = 0; 1688 1689 /* create a serialized parallel region? */ 1690 if (nthreads == 1) { 1691 /* josh todo: hypothetical question: what do we do for OS X*? */ 1692 #if KMP_OS_LINUX && \ 1693 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1694 void *args[argc]; 1695 #else 1696 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1697 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1698 KMP_ARCH_AARCH64) */ 1699 1700 KA_TRACE(20, 1701 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1702 1703 __kmpc_serialized_parallel(loc, gtid); 1704 1705 #if OMPD_SUPPORT 1706 master_th->th.th_serial_team->t.t_pkfn = microtask; 1707 #endif 1708 1709 if (call_context == fork_context_intel) { 1710 /* TODO this sucks, use the compiler itself to pass args! :) */ 1711 master_th->th.th_serial_team->t.t_ident = loc; 1712 if (!ap) { 1713 // revert change made in __kmpc_serialized_parallel() 1714 master_th->th.th_serial_team->t.t_level--; 1715 // Get args from parent team for teams construct 1716 1717 #if OMPT_SUPPORT 1718 void *dummy; 1719 void **exit_frame_p; 1720 ompt_task_info_t *task_info; 1721 1722 ompt_lw_taskteam_t lw_taskteam; 1723 1724 if (ompt_enabled.enabled) { 1725 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1726 &ompt_parallel_data, return_address); 1727 1728 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1729 // don't use lw_taskteam after linking. 
content was swaped 1730 1731 task_info = OMPT_CUR_TASK_INFO(master_th); 1732 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1733 if (ompt_enabled.ompt_callback_implicit_task) { 1734 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1735 __kmp_tid_from_gtid(gtid); 1736 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1737 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1738 &(task_info->task_data), 1, 1739 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1740 ompt_task_implicit); 1741 } 1742 1743 /* OMPT state */ 1744 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1745 } else { 1746 exit_frame_p = &dummy; 1747 } 1748 #endif 1749 1750 { 1751 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1752 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1753 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1754 parent_team->t.t_argv 1755 #if OMPT_SUPPORT 1756 , 1757 exit_frame_p 1758 #endif 1759 ); 1760 } 1761 1762 #if OMPT_SUPPORT 1763 if (ompt_enabled.enabled) { 1764 *exit_frame_p = NULL; 1765 if (ompt_enabled.ompt_callback_implicit_task) { 1766 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1767 ompt_scope_end, NULL, &(task_info->task_data), 1, 1768 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1769 ompt_task_implicit); 1770 } 1771 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1772 __ompt_lw_taskteam_unlink(master_th); 1773 if (ompt_enabled.ompt_callback_parallel_end) { 1774 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1775 &ompt_parallel_data, parent_task_data, 1776 OMPT_INVOKER(call_context) | ompt_parallel_team, 1777 return_address); 1778 } 1779 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1780 } 1781 #endif 1782 } else if (microtask == (microtask_t)__kmp_teams_master) { 1783 KMP_DEBUG_ASSERT(master_th->th.th_team == 1784 master_th->th.th_serial_team); 1785 team = master_th->th.th_team; 1786 // team->t.t_pkfn = microtask; 1787 team->t.t_invoke = invoker; 1788 __kmp_alloc_argv_entries(argc, team, TRUE); 1789 team->t.t_argc = argc; 1790 argv = (void **)team->t.t_argv; 1791 if (ap) { 1792 for (i = argc - 1; i >= 0; --i) 1793 *argv++ = va_arg(kmp_va_deref(ap), void *); 1794 } else { 1795 for (i = 0; i < argc; ++i) 1796 // Get args from parent team for teams construct 1797 argv[i] = parent_team->t.t_argv[i]; 1798 } 1799 // AC: revert change made in __kmpc_serialized_parallel() 1800 // because initial code in teams should have level=0 1801 team->t.t_level--; 1802 // AC: call special invoker for outer "parallel" of teams construct 1803 invoker(gtid); 1804 #if OMPT_SUPPORT 1805 if (ompt_enabled.enabled) { 1806 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1807 if (ompt_enabled.ompt_callback_implicit_task) { 1808 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1809 ompt_scope_end, NULL, &(task_info->task_data), 0, 1810 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1811 } 1812 if (ompt_enabled.ompt_callback_parallel_end) { 1813 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1814 &ompt_parallel_data, parent_task_data, 1815 OMPT_INVOKER(call_context) | ompt_parallel_league, 1816 return_address); 1817 } 1818 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1819 } 1820 #endif 1821 } else { 1822 argv = args; 1823 for (i = argc - 1; i >= 0; --i) 1824 *argv++ = va_arg(kmp_va_deref(ap), void *); 1825 KMP_MB(); 1826 1827 #if OMPT_SUPPORT 1828 void *dummy; 1829 void **exit_frame_p; 1830 ompt_task_info_t *task_info; 1831 1832 ompt_lw_taskteam_t lw_taskteam; 1833 1834 if (ompt_enabled.enabled) { 1835 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1836 &ompt_parallel_data, return_address); 1837 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1838 // don't use lw_taskteam after linking. content was swaped 1839 task_info = OMPT_CUR_TASK_INFO(master_th); 1840 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1841 1842 /* OMPT implicit task begin */ 1843 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1844 if (ompt_enabled.ompt_callback_implicit_task) { 1845 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1846 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1847 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1848 ompt_task_implicit); 1849 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1850 __kmp_tid_from_gtid(gtid); 1851 } 1852 1853 /* OMPT state */ 1854 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1855 } else { 1856 exit_frame_p = &dummy; 1857 } 1858 #endif 1859 1860 { 1861 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1862 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1863 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1864 #if OMPT_SUPPORT 1865 , 1866 exit_frame_p 1867 #endif 1868 ); 1869 } 1870 1871 #if OMPT_SUPPORT 1872 if (ompt_enabled.enabled) { 1873 *exit_frame_p = NULL; 1874 if (ompt_enabled.ompt_callback_implicit_task) { 1875 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1876 ompt_scope_end, NULL, &(task_info->task_data), 1, 1877 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1878 ompt_task_implicit); 1879 } 1880 1881 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1882 __ompt_lw_taskteam_unlink(master_th); 1883 if (ompt_enabled.ompt_callback_parallel_end) { 1884 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1885 &ompt_parallel_data, parent_task_data, 1886 OMPT_INVOKER(call_context) | ompt_parallel_team, 1887 return_address); 1888 } 1889 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1890 } 1891 #endif 1892 } 1893 } else if (call_context == fork_context_gnu) { 1894 #if OMPT_SUPPORT 1895 ompt_lw_taskteam_t lwt; 1896 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1897 return_address); 1898 1899 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1900 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1901 // don't use lw_taskteam after linking. 
content was swaped 1902 #endif 1903 1904 // we were called from GNU native code 1905 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1906 return FALSE; 1907 } else { 1908 KMP_ASSERT2(call_context < fork_context_last, 1909 "__kmp_fork_call: unknown fork_context parameter"); 1910 } 1911 1912 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1913 KMP_MB(); 1914 return FALSE; 1915 } // if (nthreads == 1) 1916 1917 // GEH: only modify the executing flag in the case when not serialized 1918 // serialized case is handled in kmpc_serialized_parallel 1919 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1920 "curtask=%p, curtask_max_aclevel=%d\n", 1921 parent_team->t.t_active_level, master_th, 1922 master_th->th.th_current_task, 1923 master_th->th.th_current_task->td_icvs.max_active_levels)); 1924 // TODO: GEH - cannot do this assertion because root thread not set up as 1925 // executing 1926 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1927 master_th->th.th_current_task->td_flags.executing = 0; 1928 1929 if (!master_th->th.th_teams_microtask || level > teams_level) { 1930 /* Increment our nested depth level */ 1931 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1932 } 1933 1934 // See if we need to make a copy of the ICVs. 1935 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1936 if ((level + 1 < __kmp_nested_nth.used) && 1937 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1938 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1939 } else { 1940 nthreads_icv = 0; // don't update 1941 } 1942 1943 // Figure out the proc_bind_policy for the new team. 1944 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1945 kmp_proc_bind_t proc_bind_icv = 1946 proc_bind_default; // proc_bind_default means don't update 1947 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1948 proc_bind = proc_bind_false; 1949 } else { 1950 if (proc_bind == proc_bind_default) { 1951 // No proc_bind clause specified; use current proc-bind-var for this 1952 // parallel region 1953 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1954 } 1955 /* else: The proc_bind policy was specified explicitly on parallel clause. 1956 This overrides proc-bind-var for this parallel region, but does not 1957 change proc-bind-var. */ 1958 // Figure the value of proc-bind-var for the child threads. 
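// Illustrative sketch of the lookup below, assuming OMP_PROC_BIND was given
// as a list and parsed into __kmp_nested_proc_bind: with
//
//   OMP_PROC_BIND=spread,close
//   #pragma omp parallel        // outer region runs with "spread"
//   #pragma omp parallel        // nested region: bind_types[level + 1]
//   { /* ... */ }               //   == proc_bind_close becomes the
//                               //   children's proc-bind-var
//
// the nested entry is copied into proc_bind_icv only when it differs from
// the current proc-bind-var; otherwise proc_bind_icv stays
// proc_bind_default and the child ICVs are left untouched.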
1959 if ((level + 1 < __kmp_nested_proc_bind.used) && 1960 (__kmp_nested_proc_bind.bind_types[level + 1] != 1961 master_th->th.th_current_task->td_icvs.proc_bind)) { 1962 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1963 } 1964 } 1965 1966 // Reset for next parallel region 1967 master_th->th.th_set_proc_bind = proc_bind_default; 1968 1969 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1970 kmp_internal_control_t new_icvs; 1971 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1972 new_icvs.next = NULL; 1973 if (nthreads_icv > 0) { 1974 new_icvs.nproc = nthreads_icv; 1975 } 1976 if (proc_bind_icv != proc_bind_default) { 1977 new_icvs.proc_bind = proc_bind_icv; 1978 } 1979 1980 /* allocate a new parallel team */ 1981 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1982 team = __kmp_allocate_team(root, nthreads, nthreads, 1983 #if OMPT_SUPPORT 1984 ompt_parallel_data, 1985 #endif 1986 proc_bind, &new_icvs, 1987 argc USE_NESTED_HOT_ARG(master_th)); 1988 } else { 1989 /* allocate a new parallel team */ 1990 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1991 team = __kmp_allocate_team(root, nthreads, nthreads, 1992 #if OMPT_SUPPORT 1993 ompt_parallel_data, 1994 #endif 1995 proc_bind, 1996 &master_th->th.th_current_task->td_icvs, 1997 argc USE_NESTED_HOT_ARG(master_th)); 1998 } 1999 KF_TRACE( 2000 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2001 2002 /* setup the new team */ 2003 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2004 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2005 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2006 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2007 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2008 #if OMPT_SUPPORT 2009 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2010 return_address); 2011 #endif 2012 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2013 // TODO: parent_team->t.t_level == INT_MAX ??? 2014 if (!master_th->th.th_teams_microtask || level > teams_level) { 2015 int new_level = parent_team->t.t_level + 1; 2016 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2017 new_level = parent_team->t.t_active_level + 1; 2018 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2019 } else { 2020 // AC: Do not increase parallel level at start of the teams construct 2021 int new_level = parent_team->t.t_level; 2022 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2023 new_level = parent_team->t.t_active_level; 2024 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2025 } 2026 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2027 // set primary thread's schedule as new run-time schedule 2028 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2029 2030 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2031 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2032 2033 // Update the floating point rounding in the team if required. 2034 propagateFPControl(team); 2035 #if OMPD_SUPPORT 2036 if (ompd_state & OMPD_ENABLE_BP) 2037 ompd_bp_parallel_begin(); 2038 #endif 2039 2040 if (__kmp_tasking_mode != tskm_immediate_exec) { 2041 // Set primary thread's task team to team's task team. Unless this is hot 2042 // team, it should be NULL. 
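// Rough sketch of the bookkeeping that follows: th_task_state_memo_stack is
// a per-thread byte stack with one slot per nesting level, so the primary
// thread can restore its th_task_state once the inner region joins:
//
//   memo[top++] = th_task_state;   // push here, before entering the new team
//   th_task_state = 0;             // fresh state for the new (non-hot) team
//   ...
//   th_task_state = memo[--top];   // popped again in __kmp_join_call
//
// When the nesting depth outgrows th_task_state_stack_sz, the stack is
// reallocated at twice the size, as the code below shows.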
2043 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2044 parent_team->t.t_task_team[master_th->th.th_task_state]); 2045 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2046 "%p, new task_team %p / team %p\n", 2047 __kmp_gtid_from_thread(master_th), 2048 master_th->th.th_task_team, parent_team, 2049 team->t.t_task_team[master_th->th.th_task_state], team)); 2050 2051 if (active_level || master_th->th.th_task_team) { 2052 // Take a memo of primary thread's task_state 2053 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2054 if (master_th->th.th_task_state_top >= 2055 master_th->th.th_task_state_stack_sz) { // increase size 2056 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2057 kmp_uint8 *old_stack, *new_stack; 2058 kmp_uint32 i; 2059 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2060 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2061 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2062 } 2063 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2064 ++i) { // zero-init rest of stack 2065 new_stack[i] = 0; 2066 } 2067 old_stack = master_th->th.th_task_state_memo_stack; 2068 master_th->th.th_task_state_memo_stack = new_stack; 2069 master_th->th.th_task_state_stack_sz = new_size; 2070 __kmp_free(old_stack); 2071 } 2072 // Store primary thread's task_state on stack 2073 master_th->th 2074 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2075 master_th->th.th_task_state; 2076 master_th->th.th_task_state_top++; 2077 #if KMP_NESTED_HOT_TEAMS 2078 if (master_th->th.th_hot_teams && 2079 active_level < __kmp_hot_teams_max_level && 2080 team == master_th->th.th_hot_teams[active_level].hot_team) { 2081 // Restore primary thread's nested state if nested hot team 2082 master_th->th.th_task_state = 2083 master_th->th 2084 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2085 } else { 2086 #endif 2087 master_th->th.th_task_state = 0; 2088 #if KMP_NESTED_HOT_TEAMS 2089 } 2090 #endif 2091 } 2092 #if !KMP_NESTED_HOT_TEAMS 2093 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2094 (team == root->r.r_hot_team)); 2095 #endif 2096 } 2097 2098 KA_TRACE( 2099 20, 2100 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2101 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2102 team->t.t_nproc)); 2103 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2104 (team->t.t_master_tid == 0 && 2105 (team->t.t_parent == root->r.r_root_team || 2106 team->t.t_parent->t.t_serialized))); 2107 KMP_MB(); 2108 2109 /* now, setup the arguments */ 2110 argv = (void **)team->t.t_argv; 2111 if (ap) { 2112 for (i = argc - 1; i >= 0; --i) { 2113 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2114 KMP_CHECK_UPDATE(*argv, new_argv); 2115 argv++; 2116 } 2117 } else { 2118 for (i = 0; i < argc; ++i) { 2119 // Get args from parent team for teams construct 2120 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2121 } 2122 } 2123 2124 /* now actually fork the threads */ 2125 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2126 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2127 root->r.r_active = TRUE; 2128 2129 __kmp_fork_team_threads(root, team, master_th, gtid); 2130 __kmp_setup_icv_copy(team, nthreads, 2131 &master_th->th.th_current_task->td_icvs, loc); 2132 2133 #if OMPT_SUPPORT 2134 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2135 #endif 2136 2137 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2138 2139 #if 
USE_ITT_BUILD 2140 if (team->t.t_active_level == 1 // only report frames at level 1 2141 && !master_th->th.th_teams_microtask) { // not in teams construct 2142 #if USE_ITT_NOTIFY 2143 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2144 (__kmp_forkjoin_frames_mode == 3 || 2145 __kmp_forkjoin_frames_mode == 1)) { 2146 kmp_uint64 tmp_time = 0; 2147 if (__itt_get_timestamp_ptr) 2148 tmp_time = __itt_get_timestamp(); 2149 // Internal fork - report frame begin 2150 master_th->th.th_frame_time = tmp_time; 2151 if (__kmp_forkjoin_frames_mode == 3) 2152 team->t.t_region_time = tmp_time; 2153 } else 2154 // only one notification scheme (either "submit" or "forking/joined", not both) 2155 #endif /* USE_ITT_NOTIFY */ 2156 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2157 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2158 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2159 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2160 } 2161 } 2162 #endif /* USE_ITT_BUILD */ 2163 2164 /* now go on and do the work */ 2165 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2166 KMP_MB(); 2167 KF_TRACE(10, 2168 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2169 root, team, master_th, gtid)); 2170 2171 #if USE_ITT_BUILD 2172 if (__itt_stack_caller_create_ptr) { 2173 // create new stack stitching id before entering fork barrier 2174 if (!enter_teams) { 2175 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2176 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2177 } else if (parent_team->t.t_serialized) { 2178 // keep stack stitching id in the serialized parent_team; 2179 // current team will be used for parallel inside the teams; 2180 // if parent_team is active, then it already keeps stack stitching id 2181 // for the league of teams 2182 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2183 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2184 } 2185 } 2186 #endif /* USE_ITT_BUILD */ 2187 2188 // AC: skip __kmp_internal_fork at teams construct, let only primary 2189 // threads execute 2190 if (ap) { 2191 __kmp_internal_fork(loc, gtid, team); 2192 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2193 "master_th=%p, gtid=%d\n", 2194 root, team, master_th, gtid)); 2195 } 2196 2197 if (call_context == fork_context_gnu) { 2198 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2199 return TRUE; 2200 } 2201 2202 /* Invoke microtask for PRIMARY thread */ 2203 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2204 team->t.t_id, team->t.t_pkfn)); 2205 } // END of timer KMP_fork_call block 2206 2207 #if KMP_STATS_ENABLED 2208 // If beginning a teams construct, then change thread state 2209 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2210 if (!ap) { 2211 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2212 } 2213 #endif 2214 2215 if (!team->t.t_invoke(gtid)) { 2216 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2217 } 2218 2219 #if KMP_STATS_ENABLED 2220 // If was beginning of a teams construct, then reset thread state 2221 if (!ap) { 2222 KMP_SET_THREAD_STATE(previous_state); 2223 } 2224 #endif 2225 2226 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2227 team->t.t_id, team->t.t_pkfn)); 2228 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2229 2230 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2231 #if OMPT_SUPPORT 2232 if (ompt_enabled.enabled) { 2233 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2234 } 2235 #endif 2236 2237 return TRUE; 2238 } 2239 2240 #if OMPT_SUPPORT 2241 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2242 kmp_team_t *team) { 2243 // restore state outside the region 2244 thread->th.ompt_thread_info.state = 2245 ((team->t.t_serialized) ? ompt_state_work_serial 2246 : ompt_state_work_parallel); 2247 } 2248 2249 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2250 kmp_team_t *team, ompt_data_t *parallel_data, 2251 int flags, void *codeptr) { 2252 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2253 if (ompt_enabled.ompt_callback_parallel_end) { 2254 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2255 parallel_data, &(task_info->task_data), flags, codeptr); 2256 } 2257 2258 task_info->frame.enter_frame = ompt_data_none; 2259 __kmp_join_restore_state(thread, team); 2260 } 2261 #endif 2262 2263 void __kmp_join_call(ident_t *loc, int gtid 2264 #if OMPT_SUPPORT 2265 , 2266 enum fork_context_e fork_context 2267 #endif 2268 , 2269 int exit_teams) { 2270 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2271 kmp_team_t *team; 2272 kmp_team_t *parent_team; 2273 kmp_info_t *master_th; 2274 kmp_root_t *root; 2275 int master_active; 2276 2277 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2278 2279 /* setup current data */ 2280 master_th = __kmp_threads[gtid]; 2281 root = master_th->th.th_root; 2282 team = master_th->th.th_team; 2283 parent_team = team->t.t_parent; 2284 2285 master_th->th.th_ident = loc; 2286 2287 #if OMPT_SUPPORT 2288 void *team_microtask = (void *)team->t.t_pkfn; 2289 // For GOMP interface with serialized parallel, need the 2290 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2291 // and end-parallel events. 
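// The guard below, restated as a small decision table (only relevant when
// OMPT is enabled):
//
//   team->t.t_serialized | fork_context       | set ompt_state_overhead here?
//   ---------------------+--------------------+------------------------------
//   no                   | any                | yes
//   yes                  | fork_context_intel | yes
//   yes                  | fork_context_gnu   | no -- __kmpc_end_serialized_
//                        |                    |      parallel emits the OMPT
//                        |                    |      events for this case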
2292 if (ompt_enabled.enabled && 2293 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2294 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2295 } 2296 #endif 2297 2298 #if KMP_DEBUG 2299 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2300 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2301 "th_task_team = %p\n", 2302 __kmp_gtid_from_thread(master_th), team, 2303 team->t.t_task_team[master_th->th.th_task_state], 2304 master_th->th.th_task_team)); 2305 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2306 team->t.t_task_team[master_th->th.th_task_state]); 2307 } 2308 #endif 2309 2310 if (team->t.t_serialized) { 2311 if (master_th->th.th_teams_microtask) { 2312 // We are in teams construct 2313 int level = team->t.t_level; 2314 int tlevel = master_th->th.th_teams_level; 2315 if (level == tlevel) { 2316 // AC: we haven't incremented it earlier at start of teams construct, 2317 // so do it here - at the end of teams construct 2318 team->t.t_level++; 2319 } else if (level == tlevel + 1) { 2320 // AC: we are exiting parallel inside teams, need to increment 2321 // serialization in order to restore it in the next call to 2322 // __kmpc_end_serialized_parallel 2323 team->t.t_serialized++; 2324 } 2325 } 2326 __kmpc_end_serialized_parallel(loc, gtid); 2327 2328 #if OMPT_SUPPORT 2329 if (ompt_enabled.enabled) { 2330 __kmp_join_restore_state(master_th, parent_team); 2331 } 2332 #endif 2333 2334 return; 2335 } 2336 2337 master_active = team->t.t_master_active; 2338 2339 if (!exit_teams) { 2340 // AC: No barrier for internal teams at exit from teams construct. 2341 // But there is barrier for external team (league). 2342 __kmp_internal_join(loc, gtid, team); 2343 #if USE_ITT_BUILD 2344 if (__itt_stack_caller_create_ptr) { 2345 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2346 // destroy the stack stitching id after join barrier 2347 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2348 team->t.t_stack_id = NULL; 2349 } 2350 #endif 2351 } else { 2352 master_th->th.th_task_state = 2353 0; // AC: no tasking in teams (out of any parallel) 2354 #if USE_ITT_BUILD 2355 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2356 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2357 // destroy the stack stitching id on exit from the teams construct 2358 // if parent_team is active, then the id will be destroyed later on 2359 // by master of the league of teams 2360 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2361 parent_team->t.t_stack_id = NULL; 2362 } 2363 #endif 2364 } 2365 2366 KMP_MB(); 2367 2368 #if OMPT_SUPPORT 2369 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2370 void *codeptr = team->t.ompt_team_info.master_return_address; 2371 #endif 2372 2373 #if USE_ITT_BUILD 2374 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
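// Summary of the (mutually exclusive) notification schemes selected below,
// both typically controlled through KMP_FORKJOIN_FRAMES /
// KMP_FORKJOIN_FRAMES_MODE:
//
//   __kmp_forkjoin_frames_mode == 3        -> submit the region as a VTune
//                                             frame (__kmp_itt_frame_submit)
//                                             using the timestamps recorded
//                                             at fork time
//   mode == 0 with __kmp_forkjoin_frames   -> plain region-forking /
//                                             region-joined marks instead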
2375 if (team->t.t_active_level == 1 && 2376 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2377 master_th->th.th_teams_size.nteams == 1)) { 2378 master_th->th.th_ident = loc; 2379 // only one notification scheme (either "submit" or "forking/joined", not 2380 // both) 2381 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2382 __kmp_forkjoin_frames_mode == 3) 2383 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2384 master_th->th.th_frame_time, 0, loc, 2385 master_th->th.th_team_nproc, 1); 2386 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2387 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2388 __kmp_itt_region_joined(gtid); 2389 } // active_level == 1 2390 #endif /* USE_ITT_BUILD */ 2391 2392 if (master_th->th.th_teams_microtask && !exit_teams && 2393 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2394 team->t.t_level == master_th->th.th_teams_level + 1) { 2395 // AC: We need to leave the team structure intact at the end of parallel 2396 // inside the teams construct, so that at the next parallel same (hot) team 2397 // works, only adjust nesting levels 2398 #if OMPT_SUPPORT 2399 ompt_data_t ompt_parallel_data = ompt_data_none; 2400 if (ompt_enabled.enabled) { 2401 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2402 if (ompt_enabled.ompt_callback_implicit_task) { 2403 int ompt_team_size = team->t.t_nproc; 2404 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2405 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2406 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2407 } 2408 task_info->frame.exit_frame = ompt_data_none; 2409 task_info->task_data = ompt_data_none; 2410 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2411 __ompt_lw_taskteam_unlink(master_th); 2412 } 2413 #endif 2414 /* Decrement our nested depth level */ 2415 team->t.t_level--; 2416 team->t.t_active_level--; 2417 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2418 2419 // Restore number of threads in the team if needed. This code relies on 2420 // the proper adjustment of th_teams_size.nth after the fork in 2421 // __kmp_teams_master on each teams primary thread in the case that 2422 // __kmp_reserve_threads reduced it. 2423 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2424 int old_num = master_th->th.th_team_nproc; 2425 int new_num = master_th->th.th_teams_size.nth; 2426 kmp_info_t **other_threads = team->t.t_threads; 2427 team->t.t_nproc = new_num; 2428 for (int i = 0; i < old_num; ++i) { 2429 other_threads[i]->th.th_team_nproc = new_num; 2430 } 2431 // Adjust states of non-used threads of the team 2432 for (int i = old_num; i < new_num; ++i) { 2433 // Re-initialize thread's barrier data. 
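// Why the re-sync below matters: these threads sat out the inner parallel,
// so their per-barrier b_arrived counters lag the team's b_arrived; copying
// the team values here puts them back on the current barrier generation
// before they are used again.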
2434 KMP_DEBUG_ASSERT(other_threads[i]); 2435 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2436 for (int b = 0; b < bs_last_barrier; ++b) { 2437 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2438 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2439 #if USE_DEBUGGER 2440 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2441 #endif 2442 } 2443 if (__kmp_tasking_mode != tskm_immediate_exec) { 2444 // Synchronize thread's task state 2445 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2446 } 2447 } 2448 } 2449 2450 #if OMPT_SUPPORT 2451 if (ompt_enabled.enabled) { 2452 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2453 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2454 } 2455 #endif 2456 2457 return; 2458 } 2459 2460 /* do cleanup and restore the parent team */ 2461 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2462 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2463 2464 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2465 2466 /* jc: The following lock has instructions with REL and ACQ semantics, 2467 separating the parallel user code called in this parallel region 2468 from the serial user code called after this function returns. */ 2469 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2470 2471 if (!master_th->th.th_teams_microtask || 2472 team->t.t_level > master_th->th.th_teams_level) { 2473 /* Decrement our nested depth level */ 2474 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2475 } 2476 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2477 2478 #if OMPT_SUPPORT 2479 if (ompt_enabled.enabled) { 2480 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2481 if (ompt_enabled.ompt_callback_implicit_task) { 2482 int flags = (team_microtask == (void *)__kmp_teams_master) 2483 ? ompt_task_initial 2484 : ompt_task_implicit; 2485 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2486 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2487 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2488 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2489 } 2490 task_info->frame.exit_frame = ompt_data_none; 2491 task_info->task_data = ompt_data_none; 2492 } 2493 #endif 2494 2495 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2496 master_th, team)); 2497 __kmp_pop_current_task_from_thread(master_th); 2498 2499 #if KMP_AFFINITY_SUPPORTED 2500 // Restore master thread's partition. 2501 master_th->th.th_first_place = team->t.t_first_place; 2502 master_th->th.th_last_place = team->t.t_last_place; 2503 #endif // KMP_AFFINITY_SUPPORTED 2504 master_th->th.th_def_allocator = team->t.t_def_allocator; 2505 2506 #if OMPD_SUPPORT 2507 if (ompd_state & OMPD_ENABLE_BP) 2508 ompd_bp_parallel_end(); 2509 #endif 2510 updateHWFPControl(team); 2511 2512 if (root->r.r_active != master_active) 2513 root->r.r_active = master_active; 2514 2515 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2516 master_th)); // this will free worker threads 2517 2518 /* this race was fun to find. make sure the following is in the critical 2519 region otherwise assertions may fail occasionally since the old team may be 2520 reallocated and the hierarchy appears inconsistent. it is actually safe to 2521 run and won't cause any bugs, but will cause those assertion failures. 
it's 2522 only one deref&assign so might as well put this in the critical region */ 2523 master_th->th.th_team = parent_team; 2524 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2525 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2526 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2527 2528 /* restore serialized team, if need be */ 2529 if (parent_team->t.t_serialized && 2530 parent_team != master_th->th.th_serial_team && 2531 parent_team != root->r.r_root_team) { 2532 __kmp_free_team(root, 2533 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2534 master_th->th.th_serial_team = parent_team; 2535 } 2536 2537 if (__kmp_tasking_mode != tskm_immediate_exec) { 2538 if (master_th->th.th_task_state_top > 2539 0) { // Restore task state from memo stack 2540 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2541 // Remember primary thread's state if we re-use this nested hot team 2542 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2543 master_th->th.th_task_state; 2544 --master_th->th.th_task_state_top; // pop 2545 // Now restore state at this level 2546 master_th->th.th_task_state = 2547 master_th->th 2548 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2549 } 2550 // Copy the task team from the parent team to the primary thread 2551 master_th->th.th_task_team = 2552 parent_team->t.t_task_team[master_th->th.th_task_state]; 2553 KA_TRACE(20, 2554 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2555 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2556 parent_team)); 2557 } 2558 2559 // TODO: GEH - cannot do this assertion because root thread not set up as 2560 // executing 2561 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2562 master_th->th.th_current_task->td_flags.executing = 1; 2563 2564 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2565 2566 #if OMPT_SUPPORT 2567 int flags = 2568 OMPT_INVOKER(fork_context) | 2569 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2570 : ompt_parallel_team); 2571 if (ompt_enabled.enabled) { 2572 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2573 codeptr); 2574 } 2575 #endif 2576 2577 KMP_MB(); 2578 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2579 } 2580 2581 /* Check whether we should push an internal control record onto the 2582 serial team stack. If so, do it. 
*/ 2583 void __kmp_save_internal_controls(kmp_info_t *thread) { 2584 2585 if (thread->th.th_team != thread->th.th_serial_team) { 2586 return; 2587 } 2588 if (thread->th.th_team->t.t_serialized > 1) { 2589 int push = 0; 2590 2591 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2592 push = 1; 2593 } else { 2594 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2595 thread->th.th_team->t.t_serialized) { 2596 push = 1; 2597 } 2598 } 2599 if (push) { /* push a record on the serial team's stack */ 2600 kmp_internal_control_t *control = 2601 (kmp_internal_control_t *)__kmp_allocate( 2602 sizeof(kmp_internal_control_t)); 2603 2604 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2605 2606 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2607 2608 control->next = thread->th.th_team->t.t_control_stack_top; 2609 thread->th.th_team->t.t_control_stack_top = control; 2610 } 2611 } 2612 } 2613 2614 /* Changes set_nproc */ 2615 void __kmp_set_num_threads(int new_nth, int gtid) { 2616 kmp_info_t *thread; 2617 kmp_root_t *root; 2618 2619 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2620 KMP_DEBUG_ASSERT(__kmp_init_serial); 2621 2622 if (new_nth < 1) 2623 new_nth = 1; 2624 else if (new_nth > __kmp_max_nth) 2625 new_nth = __kmp_max_nth; 2626 2627 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2628 thread = __kmp_threads[gtid]; 2629 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2630 return; // nothing to do 2631 2632 __kmp_save_internal_controls(thread); 2633 2634 set__nproc(thread, new_nth); 2635 2636 // If this omp_set_num_threads() call will cause the hot team size to be 2637 // reduced (in the absence of a num_threads clause), then reduce it now, 2638 // rather than waiting for the next parallel region. 2639 root = thread->th.th_root; 2640 if (__kmp_init_parallel && (!root->r.r_active) && 2641 (root->r.r_hot_team->t.t_nproc > new_nth) 2642 #if KMP_NESTED_HOT_TEAMS 2643 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2644 #endif 2645 ) { 2646 kmp_team_t *hot_team = root->r.r_hot_team; 2647 int f; 2648 2649 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2650 2651 // Release the extra threads we don't need any more. 2652 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2653 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2654 if (__kmp_tasking_mode != tskm_immediate_exec) { 2655 // When decreasing team size, threads no longer in the team should unref 2656 // task team. 2657 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2658 } 2659 __kmp_free_thread(hot_team->t.t_threads[f]); 2660 hot_team->t.t_threads[f] = NULL; 2661 } 2662 hot_team->t.t_nproc = new_nth; 2663 #if KMP_NESTED_HOT_TEAMS 2664 if (thread->th.th_hot_teams) { 2665 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2666 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2667 } 2668 #endif 2669 2670 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2671 2672 // Update the t_nproc field in the threads that are still active. 
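// Usage sketch of this eager-shrink path (assuming default KMP_HOT_TEAMS_*
// settings and no num_threads clause overriding the new value):
//
//   omp_set_num_threads(8);
//   #pragma omp parallel        // hot team grows to 8 threads
//   { /* ... */ }
//   omp_set_num_threads(2);     // extra threads are released right here,
//   #pragma omp parallel        //   not at the next fork; this region
//   { /* ... */ }               //   reuses the trimmed 2-thread hot team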
2673 for (f = 0; f < new_nth; f++) { 2674 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2675 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2676 } 2677 // Special flag in case omp_set_num_threads() call 2678 hot_team->t.t_size_changed = -1; 2679 } 2680 } 2681 2682 /* Changes max_active_levels */ 2683 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2684 kmp_info_t *thread; 2685 2686 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2687 "%d = (%d)\n", 2688 gtid, max_active_levels)); 2689 KMP_DEBUG_ASSERT(__kmp_init_serial); 2690 2691 // validate max_active_levels 2692 if (max_active_levels < 0) { 2693 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2694 // We ignore this call if the user has specified a negative value. 2695 // The current setting won't be changed. The last valid setting will be 2696 // used. A warning will be issued (if warnings are allowed as controlled by 2697 // the KMP_WARNINGS env var). 2698 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2699 "max_active_levels for thread %d = (%d)\n", 2700 gtid, max_active_levels)); 2701 return; 2702 } 2703 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2704 // it's OK, the max_active_levels is within the valid range: [ 0; 2705 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2706 // We allow a zero value. (implementation defined behavior) 2707 } else { 2708 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2709 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2710 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2711 // Current upper limit is MAX_INT. (implementation defined behavior) 2712 // If the input exceeds the upper limit, we correct the input to be the 2713 // upper limit. (implementation defined behavior) 2714 // Actually, the flow should never get here until we use MAX_INT limit. 
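// Worked example of the validation as a whole:
//   omp_set_max_active_levels(0)  -> accepted; every subsequent parallel
//                                    region is serialized (the active-level
//                                    check in __kmp_fork_call trips at once)
//   omp_set_max_active_levels(-1) -> ActiveLevelsNegative warning only, the
//                                    previous setting stays in effect
//   values above KMP_MAX_ACTIVE_LEVELS_LIMIT would be clamped right here,
//   though with the current INT_MAX limit this branch is effectively
//   unreachable.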
2715 } 2716 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2717 "max_active_levels for thread %d = (%d)\n", 2718 gtid, max_active_levels)); 2719 2720 thread = __kmp_threads[gtid]; 2721 2722 __kmp_save_internal_controls(thread); 2723 2724 set__max_active_levels(thread, max_active_levels); 2725 } 2726 2727 /* Gets max_active_levels */ 2728 int __kmp_get_max_active_levels(int gtid) { 2729 kmp_info_t *thread; 2730 2731 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2732 KMP_DEBUG_ASSERT(__kmp_init_serial); 2733 2734 thread = __kmp_threads[gtid]; 2735 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2736 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2737 "curtask_maxaclevel=%d\n", 2738 gtid, thread->th.th_current_task, 2739 thread->th.th_current_task->td_icvs.max_active_levels)); 2740 return thread->th.th_current_task->td_icvs.max_active_levels; 2741 } 2742 2743 // nteams-var per-device ICV 2744 void __kmp_set_num_teams(int num_teams) { 2745 if (num_teams > 0) 2746 __kmp_nteams = num_teams; 2747 } 2748 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2749 // teams-thread-limit-var per-device ICV 2750 void __kmp_set_teams_thread_limit(int limit) { 2751 if (limit > 0) 2752 __kmp_teams_thread_limit = limit; 2753 } 2754 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2755 2756 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2757 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2758 2759 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2760 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2761 kmp_info_t *thread; 2762 kmp_sched_t orig_kind; 2763 // kmp_team_t *team; 2764 2765 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2766 gtid, (int)kind, chunk)); 2767 KMP_DEBUG_ASSERT(__kmp_init_serial); 2768 2769 // Check if the kind parameter is valid, correct if needed. 2770 // Valid parameters should fit in one of two intervals - standard or extended: 2771 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2772 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2773 orig_kind = kind; 2774 kind = __kmp_sched_without_mods(kind); 2775 2776 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2777 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2778 // TODO: Hint needs attention in case we change the default schedule. 2779 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2780 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2781 __kmp_msg_null); 2782 kind = kmp_sched_default; 2783 chunk = 0; // ignore chunk value in case of bad kind 2784 } 2785 2786 thread = __kmp_threads[gtid]; 2787 2788 __kmp_save_internal_controls(thread); 2789 2790 if (kind < kmp_sched_upper_std) { 2791 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2792 // differ static chunked vs. 
unchunked: chunk should be invalid to 2793 // indicate unchunked schedule (which is the default) 2794 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2795 } else { 2796 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2797 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2798 } 2799 } else { 2800 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2801 // kmp_sched_lower - 2 ]; 2802 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2803 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2804 kmp_sched_lower - 2]; 2805 } 2806 __kmp_sched_apply_mods_intkind( 2807 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2808 if (kind == kmp_sched_auto || chunk < 1) { 2809 // ignore parameter chunk for schedule auto 2810 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2811 } else { 2812 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2813 } 2814 } 2815 2816 /* Gets def_sched_var ICV values */ 2817 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2818 kmp_info_t *thread; 2819 enum sched_type th_type; 2820 2821 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2822 KMP_DEBUG_ASSERT(__kmp_init_serial); 2823 2824 thread = __kmp_threads[gtid]; 2825 2826 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2827 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2828 case kmp_sch_static: 2829 case kmp_sch_static_greedy: 2830 case kmp_sch_static_balanced: 2831 *kind = kmp_sched_static; 2832 __kmp_sched_apply_mods_stdkind(kind, th_type); 2833 *chunk = 0; // chunk was not set, try to show this fact via zero value 2834 return; 2835 case kmp_sch_static_chunked: 2836 *kind = kmp_sched_static; 2837 break; 2838 case kmp_sch_dynamic_chunked: 2839 *kind = kmp_sched_dynamic; 2840 break; 2841 case kmp_sch_guided_chunked: 2842 case kmp_sch_guided_iterative_chunked: 2843 case kmp_sch_guided_analytical_chunked: 2844 *kind = kmp_sched_guided; 2845 break; 2846 case kmp_sch_auto: 2847 *kind = kmp_sched_auto; 2848 break; 2849 case kmp_sch_trapezoidal: 2850 *kind = kmp_sched_trapezoidal; 2851 break; 2852 #if KMP_STATIC_STEAL_ENABLED 2853 case kmp_sch_static_steal: 2854 *kind = kmp_sched_static_steal; 2855 break; 2856 #endif 2857 default: 2858 KMP_FATAL(UnknownSchedulingType, th_type); 2859 } 2860 2861 __kmp_sched_apply_mods_stdkind(kind, th_type); 2862 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2863 } 2864 2865 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2866 2867 int ii, dd; 2868 kmp_team_t *team; 2869 kmp_info_t *thr; 2870 2871 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2872 KMP_DEBUG_ASSERT(__kmp_init_serial); 2873 2874 // validate level 2875 if (level == 0) 2876 return 0; 2877 if (level < 0) 2878 return -1; 2879 thr = __kmp_threads[gtid]; 2880 team = thr->th.th_team; 2881 ii = team->t.t_level; 2882 if (level > ii) 2883 return -1; 2884 2885 if (thr->th.th_teams_microtask) { 2886 // AC: we are in teams region where multiple nested teams have same level 2887 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2888 if (level <= 2889 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2890 KMP_DEBUG_ASSERT(ii >= tlevel); 2891 // AC: As we need to pass by the teams league, we need to artificially 2892 // increase ii 2893 if (ii == tlevel) { 2894 ii += 2; // three teams have same level 2895 } else { 2896 ii++; // two teams have same level 2897 } 2898 } 2899 } 2900 2901 if (ii == 
level) 2902 return __kmp_tid_from_gtid(gtid); 2903 2904 dd = team->t.t_serialized; 2905 level++; 2906 while (ii > level) { 2907 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2908 } 2909 if ((team->t.t_serialized) && (!dd)) { 2910 team = team->t.t_parent; 2911 continue; 2912 } 2913 if (ii > level) { 2914 team = team->t.t_parent; 2915 dd = team->t.t_serialized; 2916 ii--; 2917 } 2918 } 2919 2920 return (dd > 1) ? (0) : (team->t.t_master_tid); 2921 } 2922 2923 int __kmp_get_team_size(int gtid, int level) { 2924 2925 int ii, dd; 2926 kmp_team_t *team; 2927 kmp_info_t *thr; 2928 2929 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2930 KMP_DEBUG_ASSERT(__kmp_init_serial); 2931 2932 // validate level 2933 if (level == 0) 2934 return 1; 2935 if (level < 0) 2936 return -1; 2937 thr = __kmp_threads[gtid]; 2938 team = thr->th.th_team; 2939 ii = team->t.t_level; 2940 if (level > ii) 2941 return -1; 2942 2943 if (thr->th.th_teams_microtask) { 2944 // AC: we are in teams region where multiple nested teams have same level 2945 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2946 if (level <= 2947 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2948 KMP_DEBUG_ASSERT(ii >= tlevel); 2949 // AC: As we need to pass by the teams league, we need to artificially 2950 // increase ii 2951 if (ii == tlevel) { 2952 ii += 2; // three teams have same level 2953 } else { 2954 ii++; // two teams have same level 2955 } 2956 } 2957 } 2958 2959 while (ii > level) { 2960 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2961 } 2962 if (team->t.t_serialized && (!dd)) { 2963 team = team->t.t_parent; 2964 continue; 2965 } 2966 if (ii > level) { 2967 team = team->t.t_parent; 2968 ii--; 2969 } 2970 } 2971 2972 return team->t.t_nproc; 2973 } 2974 2975 kmp_r_sched_t __kmp_get_schedule_global() { 2976 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2977 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2978 // independently. So one can get the updated schedule here. 2979 2980 kmp_r_sched_t r_sched; 2981 2982 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2983 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2984 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2985 // different roots (even in OMP 2.5) 2986 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2987 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2988 if (s == kmp_sch_static) { 2989 // replace STATIC with more detailed schedule (balanced or greedy) 2990 r_sched.r_sched_type = __kmp_static; 2991 } else if (s == kmp_sch_guided_chunked) { 2992 // replace GUIDED with more detailed schedule (iterative or analytical) 2993 r_sched.r_sched_type = __kmp_guided; 2994 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2995 r_sched.r_sched_type = __kmp_sched; 2996 } 2997 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2998 2999 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3000 // __kmp_chunk may be wrong here (if it was not ever set) 3001 r_sched.chunk = KMP_DEFAULT_CHUNK; 3002 } else { 3003 r_sched.chunk = __kmp_chunk; 3004 } 3005 3006 return r_sched; 3007 } 3008 3009 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3010 at least argc number of *t_argv entries for the requested team. 
*/ 3011 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3012 3013 KMP_DEBUG_ASSERT(team); 3014 if (!realloc || argc > team->t.t_max_argc) { 3015 3016 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3017 "current entries=%d\n", 3018 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3019 /* if previously allocated heap space for args, free them */ 3020 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3021 __kmp_free((void *)team->t.t_argv); 3022 3023 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3024 /* use unused space in the cache line for arguments */ 3025 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3026 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3027 "argv entries\n", 3028 team->t.t_id, team->t.t_max_argc)); 3029 team->t.t_argv = &team->t.t_inline_argv[0]; 3030 if (__kmp_storage_map) { 3031 __kmp_print_storage_map_gtid( 3032 -1, &team->t.t_inline_argv[0], 3033 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3034 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3035 team->t.t_id); 3036 } 3037 } else { 3038 /* allocate space for arguments in the heap */ 3039 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3040 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3041 : 2 * argc; 3042 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3043 "argv entries\n", 3044 team->t.t_id, team->t.t_max_argc)); 3045 team->t.t_argv = 3046 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3047 if (__kmp_storage_map) { 3048 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3049 &team->t.t_argv[team->t.t_max_argc], 3050 sizeof(void *) * team->t.t_max_argc, 3051 "team_%d.t_argv", team->t.t_id); 3052 } 3053 } 3054 } 3055 } 3056 3057 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3058 int i; 3059 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3060 team->t.t_threads = 3061 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3062 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3063 sizeof(dispatch_shared_info_t) * num_disp_buff); 3064 team->t.t_dispatch = 3065 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3066 team->t.t_implicit_task_taskdata = 3067 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3068 team->t.t_max_nproc = max_nth; 3069 3070 /* setup dispatch buffers */ 3071 for (i = 0; i < num_disp_buff; ++i) { 3072 team->t.t_disp_buffer[i].buffer_index = i; 3073 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3074 } 3075 } 3076 3077 static void __kmp_free_team_arrays(kmp_team_t *team) { 3078 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3079 int i; 3080 for (i = 0; i < team->t.t_max_nproc; ++i) { 3081 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3082 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3083 team->t.t_dispatch[i].th_disp_buffer = NULL; 3084 } 3085 } 3086 #if KMP_USE_HIER_SCHED 3087 __kmp_dispatch_free_hierarchies(team); 3088 #endif 3089 __kmp_free(team->t.t_threads); 3090 __kmp_free(team->t.t_disp_buffer); 3091 __kmp_free(team->t.t_dispatch); 3092 __kmp_free(team->t.t_implicit_task_taskdata); 3093 team->t.t_threads = NULL; 3094 team->t.t_disp_buffer = NULL; 3095 team->t.t_dispatch = NULL; 3096 team->t.t_implicit_task_taskdata = 0; 3097 } 3098 3099 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3100 kmp_info_t **oldThreads = team->t.t_threads; 3101 3102 __kmp_free(team->t.t_disp_buffer); 3103 __kmp_free(team->t.t_dispatch); 3104 __kmp_free(team->t.t_implicit_task_taskdata); 3105 __kmp_allocate_team_arrays(team, max_nth); 3106 3107 KMP_MEMCPY(team->t.t_threads, oldThreads, 3108 team->t.t_nproc * sizeof(kmp_info_t *)); 3109 3110 __kmp_free(oldThreads); 3111 } 3112 3113 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3114 3115 kmp_r_sched_t r_sched = 3116 __kmp_get_schedule_global(); // get current state of scheduling globals 3117 3118 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3119 3120 kmp_internal_control_t g_icvs = { 3121 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3122 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3123 // adjustment of threads (per thread) 3124 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3125 // whether blocktime is explicitly set 3126 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3127 #if KMP_USE_MONITOR 3128 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3129 // intervals 3130 #endif 3131 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3132 // next parallel region (per thread) 3133 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3134 __kmp_cg_max_nth, // int thread_limit; 3135 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3136 // for max_active_levels 3137 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3138 // {sched,chunk} pair 3139 __kmp_nested_proc_bind.bind_types[0], 3140 __kmp_default_device, 3141 NULL // struct kmp_internal_control *next; 3142 }; 3143 3144 return g_icvs; 3145 } 3146 3147 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3148 3149 kmp_internal_control_t gx_icvs; 3150 gx_icvs.serial_nesting_level = 3151 0; // probably =team->t.t_serial 
like in save_inter_controls 3152 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3153 gx_icvs.next = NULL; 3154 3155 return gx_icvs; 3156 } 3157 3158 static void __kmp_initialize_root(kmp_root_t *root) { 3159 int f; 3160 kmp_team_t *root_team; 3161 kmp_team_t *hot_team; 3162 int hot_team_max_nth; 3163 kmp_r_sched_t r_sched = 3164 __kmp_get_schedule_global(); // get current state of scheduling globals 3165 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3166 KMP_DEBUG_ASSERT(root); 3167 KMP_ASSERT(!root->r.r_begin); 3168 3169 /* setup the root state structure */ 3170 __kmp_init_lock(&root->r.r_begin_lock); 3171 root->r.r_begin = FALSE; 3172 root->r.r_active = FALSE; 3173 root->r.r_in_parallel = 0; 3174 root->r.r_blocktime = __kmp_dflt_blocktime; 3175 #if KMP_AFFINITY_SUPPORTED 3176 root->r.r_affinity_assigned = FALSE; 3177 #endif 3178 3179 /* setup the root team for this task */ 3180 /* allocate the root team structure */ 3181 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3182 3183 root_team = 3184 __kmp_allocate_team(root, 3185 1, // new_nproc 3186 1, // max_nproc 3187 #if OMPT_SUPPORT 3188 ompt_data_none, // root parallel id 3189 #endif 3190 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3191 0 // argc 3192 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3193 ); 3194 #if USE_DEBUGGER 3195 // Non-NULL value should be assigned to make the debugger display the root 3196 // team. 3197 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3198 #endif 3199 3200 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3201 3202 root->r.r_root_team = root_team; 3203 root_team->t.t_control_stack_top = NULL; 3204 3205 /* initialize root team */ 3206 root_team->t.t_threads[0] = NULL; 3207 root_team->t.t_nproc = 1; 3208 root_team->t.t_serialized = 1; 3209 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3210 root_team->t.t_sched.sched = r_sched.sched; 3211 KA_TRACE( 3212 20, 3213 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3214 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3215 3216 /* setup the hot team for this task */ 3217 /* allocate the hot team structure */ 3218 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3219 3220 hot_team = 3221 __kmp_allocate_team(root, 3222 1, // new_nproc 3223 __kmp_dflt_team_nth_ub * 2, // max_nproc 3224 #if OMPT_SUPPORT 3225 ompt_data_none, // root parallel id 3226 #endif 3227 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3228 0 // argc 3229 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3230 ); 3231 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3232 3233 root->r.r_hot_team = hot_team; 3234 root_team->t.t_control_stack_top = NULL; 3235 3236 /* first-time initialization */ 3237 hot_team->t.t_parent = root_team; 3238 3239 /* initialize hot team */ 3240 hot_team_max_nth = hot_team->t.t_max_nproc; 3241 for (f = 0; f < hot_team_max_nth; ++f) { 3242 hot_team->t.t_threads[f] = NULL; 3243 } 3244 hot_team->t.t_nproc = 1; 3245 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3246 hot_team->t.t_sched.sched = r_sched.sched; 3247 hot_team->t.t_size_changed = 0; 3248 } 3249 3250 #ifdef KMP_DEBUG 3251 3252 typedef struct kmp_team_list_item { 3253 kmp_team_p const *entry; 3254 struct kmp_team_list_item *next; 3255 } kmp_team_list_item_t; 3256 typedef kmp_team_list_item_t *kmp_team_list_t; 3257 3258 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3259 kmp_team_list_t list, // List of teams. 3260 kmp_team_p const *team // Team to add. 3261 ) { 3262 3263 // List must terminate with item where both entry and next are NULL. 3264 // Team is added to the list only once. 3265 // List is sorted in ascending order by team id. 3266 // Team id is *not* a key. 3267 3268 kmp_team_list_t l; 3269 3270 KMP_DEBUG_ASSERT(list != NULL); 3271 if (team == NULL) { 3272 return; 3273 } 3274 3275 __kmp_print_structure_team_accum(list, team->t.t_parent); 3276 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3277 3278 // Search list for the team. 3279 l = list; 3280 while (l->next != NULL && l->entry != team) { 3281 l = l->next; 3282 } 3283 if (l->next != NULL) { 3284 return; // Team has been added before, exit. 3285 } 3286 3287 // Team is not found. Search list again for insertion point. 3288 l = list; 3289 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3290 l = l->next; 3291 } 3292 3293 // Insert team. 3294 { 3295 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3296 sizeof(kmp_team_list_item_t)); 3297 *item = *l; 3298 l->entry = team; 3299 l->next = item; 3300 } 3301 } 3302 3303 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3304 3305 ) { 3306 __kmp_printf("%s", title); 3307 if (team != NULL) { 3308 __kmp_printf("%2x %p\n", team->t.t_id, team); 3309 } else { 3310 __kmp_printf(" - (nil)\n"); 3311 } 3312 } 3313 3314 static void __kmp_print_structure_thread(char const *title, 3315 kmp_info_p const *thread) { 3316 __kmp_printf("%s", title); 3317 if (thread != NULL) { 3318 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3319 } else { 3320 __kmp_printf(" - (nil)\n"); 3321 } 3322 } 3323 3324 void __kmp_print_structure(void) { 3325 3326 kmp_team_list_t list; 3327 3328 // Initialize list of teams. 3329 list = 3330 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3331 list->entry = NULL; 3332 list->next = NULL; 3333 3334 __kmp_printf("\n------------------------------\nGlobal Thread " 3335 "Table\n------------------------------\n"); 3336 { 3337 int gtid; 3338 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3339 __kmp_printf("%2d", gtid); 3340 if (__kmp_threads != NULL) { 3341 __kmp_printf(" %p", __kmp_threads[gtid]); 3342 } 3343 if (__kmp_root != NULL) { 3344 __kmp_printf(" %p", __kmp_root[gtid]); 3345 } 3346 __kmp_printf("\n"); 3347 } 3348 } 3349 3350 // Print out __kmp_threads array. 
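  // Each thread printed below also feeds its th_team and th_serial_team (and,
  // through the recursive accumulator above, their parents and pool
  // successors) into `list`, which stays sorted by ascending team id and is
  // terminated by a sentinel item whose entry and next are both NULL.
  // Illustrative only: after visiting teams with ids 2, 0 and 1, the list
  // reads [0] -> [1] -> [2] -> [sentinel], so the "Teams" dump further down
  // prints each team exactly once.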
3351 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3352 "----------\n"); 3353 if (__kmp_threads != NULL) { 3354 int gtid; 3355 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3356 kmp_info_t const *thread = __kmp_threads[gtid]; 3357 if (thread != NULL) { 3358 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3359 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3360 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3361 __kmp_print_structure_team(" Serial Team: ", 3362 thread->th.th_serial_team); 3363 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3364 __kmp_print_structure_thread(" Primary: ", 3365 thread->th.th_team_master); 3366 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3367 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3368 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3369 __kmp_print_structure_thread(" Next in pool: ", 3370 thread->th.th_next_pool); 3371 __kmp_printf("\n"); 3372 __kmp_print_structure_team_accum(list, thread->th.th_team); 3373 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3374 } 3375 } 3376 } else { 3377 __kmp_printf("Threads array is not allocated.\n"); 3378 } 3379 3380 // Print out __kmp_root array. 3381 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3382 "--------\n"); 3383 if (__kmp_root != NULL) { 3384 int gtid; 3385 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3386 kmp_root_t const *root = __kmp_root[gtid]; 3387 if (root != NULL) { 3388 __kmp_printf("GTID %2d %p:\n", gtid, root); 3389 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3390 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3391 __kmp_print_structure_thread(" Uber Thread: ", 3392 root->r.r_uber_thread); 3393 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3394 __kmp_printf(" In Parallel: %2d\n", 3395 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3396 __kmp_printf("\n"); 3397 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3398 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3399 } 3400 } 3401 } else { 3402 __kmp_printf("Ubers array is not allocated.\n"); 3403 } 3404 3405 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3406 "--------\n"); 3407 while (list->next != NULL) { 3408 kmp_team_p const *team = list->entry; 3409 int i; 3410 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3411 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3412 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3413 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3414 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3415 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3416 for (i = 0; i < team->t.t_nproc; ++i) { 3417 __kmp_printf(" Thread %2d: ", i); 3418 __kmp_print_structure_thread("", team->t.t_threads[i]); 3419 } 3420 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3421 __kmp_printf("\n"); 3422 list = list->next; 3423 } 3424 3425 // Print out __kmp_thread_pool and __kmp_team_pool. 3426 __kmp_printf("\n------------------------------\nPools\n----------------------" 3427 "--------\n"); 3428 __kmp_print_structure_thread("Thread pool: ", 3429 CCAST(kmp_info_t *, __kmp_thread_pool)); 3430 __kmp_print_structure_team("Team pool: ", 3431 CCAST(kmp_team_t *, __kmp_team_pool)); 3432 __kmp_printf("\n"); 3433 3434 // Free team list. 
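  // The walk below releases every item, including the terminating sentinel
  // allocated at the top of this routine, because it continues until `list`
  // itself becomes NULL rather than stopping at the first empty entry.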
3435 while (list != NULL) { 3436 kmp_team_list_item_t *item = list; 3437 list = list->next; 3438 KMP_INTERNAL_FREE(item); 3439 } 3440 } 3441 3442 #endif 3443 3444 //--------------------------------------------------------------------------- 3445 // Stuff for per-thread fast random number generator 3446 // Table of primes 3447 static const unsigned __kmp_primes[] = { 3448 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3449 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3450 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3451 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3452 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3453 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3454 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3455 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3456 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3457 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3458 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3459 3460 //--------------------------------------------------------------------------- 3461 // __kmp_get_random: Get a random number using a linear congruential method. 3462 unsigned short __kmp_get_random(kmp_info_t *thread) { 3463 unsigned x = thread->th.th_x; 3464 unsigned short r = (unsigned short)(x >> 16); 3465 3466 thread->th.th_x = x * thread->th.th_a + 1; 3467 3468 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3469 thread->th.th_info.ds.ds_tid, r)); 3470 3471 return r; 3472 } 3473 //-------------------------------------------------------- 3474 // __kmp_init_random: Initialize a random number generator 3475 void __kmp_init_random(kmp_info_t *thread) { 3476 unsigned seed = thread->th.th_info.ds.ds_tid; 3477 3478 thread->th.th_a = 3479 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3480 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3481 KA_TRACE(30, 3482 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3483 } 3484 3485 #if KMP_OS_WINDOWS 3486 /* reclaim array entries for root threads that are already dead, returns number 3487 * reclaimed */ 3488 static int __kmp_reclaim_dead_roots(void) { 3489 int i, r = 0; 3490 3491 for (i = 0; i < __kmp_threads_capacity; ++i) { 3492 if (KMP_UBER_GTID(i) && 3493 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3494 !__kmp_root[i] 3495 ->r.r_active) { // AC: reclaim only roots died in non-active state 3496 r += __kmp_unregister_root_other_thread(i); 3497 } 3498 } 3499 return r; 3500 } 3501 #endif 3502 3503 /* This function attempts to create free entries in __kmp_threads and 3504 __kmp_root, and returns the number of free entries generated. 3505 3506 For Windows* OS static library, the first mechanism used is to reclaim array 3507 entries for root threads that are already dead. 3508 3509 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3510 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3511 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3512 threadprivate cache array has been created. Synchronization with 3513 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
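   A hypothetical illustration of the growth rule used below: with
   __kmp_threads_capacity at 32 and nNeed at 40, the minimum required capacity
   is 72, so the candidate size doubles 32 -> 64 -> 128 and 128 is used; each
   doubling step is clipped to __kmp_sys_max_nth, and the expansion is refused
   outright if even that ceiling cannot provide nNeed additional slots.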
3514 3515 After any dead root reclamation, if the clipping value allows array expansion 3516 to result in the generation of a total of nNeed free slots, the function does 3517 that expansion. If not, nothing is done beyond the possible initial root 3518 thread reclamation. 3519 3520 If any argument is negative, the behavior is undefined. */ 3521 static int __kmp_expand_threads(int nNeed) { 3522 int added = 0; 3523 int minimumRequiredCapacity; 3524 int newCapacity; 3525 kmp_info_t **newThreads; 3526 kmp_root_t **newRoot; 3527 3528 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3529 // resizing __kmp_threads does not need additional protection if foreign 3530 // threads are present 3531 3532 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3533 /* only for Windows static library */ 3534 /* reclaim array entries for root threads that are already dead */ 3535 added = __kmp_reclaim_dead_roots(); 3536 3537 if (nNeed) { 3538 nNeed -= added; 3539 if (nNeed < 0) 3540 nNeed = 0; 3541 } 3542 #endif 3543 if (nNeed <= 0) 3544 return added; 3545 3546 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3547 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3548 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3549 // > __kmp_max_nth in one of two ways: 3550 // 3551 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3552 // may not be reused by another thread, so we may need to increase 3553 // __kmp_threads_capacity to __kmp_max_nth + 1. 3554 // 3555 // 2) New foreign root(s) are encountered. We always register new foreign 3556 // roots. This may cause a smaller # of threads to be allocated at 3557 // subsequent parallel regions, but the worker threads hang around (and 3558 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3559 // 3560 // Anyway, that is the reason for moving the check to see if 3561 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3562 // instead of having it performed here. -BB 3563 3564 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3565 3566 /* compute expansion headroom to check if we can expand */ 3567 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3568 /* possible expansion too small -- give up */ 3569 return added; 3570 } 3571 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3572 3573 newCapacity = __kmp_threads_capacity; 3574 do { 3575 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3576 : __kmp_sys_max_nth; 3577 } while (newCapacity < minimumRequiredCapacity); 3578 newThreads = (kmp_info_t **)__kmp_allocate( 3579 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3580 newRoot = 3581 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3582 KMP_MEMCPY(newThreads, __kmp_threads, 3583 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3584 KMP_MEMCPY(newRoot, __kmp_root, 3585 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3586 3587 kmp_info_t **temp_threads = __kmp_threads; 3588 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3589 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3590 __kmp_free(temp_threads); 3591 added += newCapacity - __kmp_threads_capacity; 3592 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3593 3594 if (newCapacity > __kmp_tp_capacity) { 3595 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3596 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3597 __kmp_threadprivate_resize_cache(newCapacity); 3598 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3599 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3600 } 3601 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3602 } 3603 3604 return added; 3605 } 3606 3607 /* Register the current thread as a root thread and obtain our gtid. We must 3608 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3609 thread that calls from __kmp_do_serial_initialize() */ 3610 int __kmp_register_root(int initial_thread) { 3611 kmp_info_t *root_thread; 3612 kmp_root_t *root; 3613 int gtid; 3614 int capacity; 3615 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3616 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3617 KMP_MB(); 3618 3619 /* 2007-03-02: 3620 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3621 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3622 work as expected -- it may return false (that means there is at least one 3623 empty slot in __kmp_threads array), but it is possible the only free slot 3624 is #0, which is reserved for initial thread and so cannot be used for this 3625 one. Following code workarounds this bug. 3626 3627 However, right solution seems to be not reserving slot #0 for initial 3628 thread because: 3629 (1) there is no magic in slot #0, 3630 (2) we cannot detect initial thread reliably (the first thread which does 3631 serial initialization may be not a real initial thread). 3632 */ 3633 capacity = __kmp_threads_capacity; 3634 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3635 --capacity; 3636 } 3637 3638 // If it is not for initializing the hidden helper team, we need to take 3639 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3640 // in __kmp_threads_capacity. 
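  // Illustrative accounting with hypothetical numbers: if
  // __kmp_threads_capacity == 64, slot 0 is still reserved for the initial
  // thread, and __kmp_hidden_helper_threads_num == 8, then a foreign root
  // registering here sees capacity == 64 - 1 - 8 == 55 usable slots before
  // the capacity check below.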
3641 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3642 capacity -= __kmp_hidden_helper_threads_num; 3643 } 3644 3645 /* see if there are too many threads */ 3646 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3647 if (__kmp_tp_cached) { 3648 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3649 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3650 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3651 } else { 3652 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3653 __kmp_msg_null); 3654 } 3655 } 3656 3657 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3658 // 0: initial thread, also a regular OpenMP thread. 3659 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3660 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3661 // regular OpenMP threads. 3662 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3663 // Find an available thread slot for hidden helper thread. Slots for hidden 3664 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3665 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3666 gtid <= __kmp_hidden_helper_threads_num; 3667 gtid++) 3668 ; 3669 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3670 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3671 "hidden helper thread: T#%d\n", 3672 gtid)); 3673 } else { 3674 /* find an available thread slot */ 3675 // Don't reassign the zero slot since we need that to only be used by 3676 // initial thread. Slots for hidden helper threads should also be skipped. 3677 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3678 gtid = 0; 3679 } else { 3680 for (gtid = __kmp_hidden_helper_threads_num + 1; 3681 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3682 ; 3683 } 3684 KA_TRACE( 3685 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3686 KMP_ASSERT(gtid < __kmp_threads_capacity); 3687 } 3688 3689 /* update global accounting */ 3690 __kmp_all_nth++; 3691 TCW_4(__kmp_nth, __kmp_nth + 1); 3692 3693 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3694 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3695 if (__kmp_adjust_gtid_mode) { 3696 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3697 if (TCR_4(__kmp_gtid_mode) != 2) { 3698 TCW_4(__kmp_gtid_mode, 2); 3699 } 3700 } else { 3701 if (TCR_4(__kmp_gtid_mode) != 1) { 3702 TCW_4(__kmp_gtid_mode, 1); 3703 } 3704 } 3705 } 3706 3707 #ifdef KMP_ADJUST_BLOCKTIME 3708 /* Adjust blocktime to zero if necessary */ 3709 /* Middle initialization might not have occurred yet */ 3710 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3711 if (__kmp_nth > __kmp_avail_proc) { 3712 __kmp_zero_bt = TRUE; 3713 } 3714 } 3715 #endif /* KMP_ADJUST_BLOCKTIME */ 3716 3717 /* setup this new hierarchy */ 3718 if (!(root = __kmp_root[gtid])) { 3719 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3720 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3721 } 3722 3723 #if KMP_STATS_ENABLED 3724 // Initialize stats as soon as possible (right after gtid assignment). 
3725 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3726 __kmp_stats_thread_ptr->startLife(); 3727 KMP_SET_THREAD_STATE(SERIAL_REGION); 3728 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3729 #endif 3730 __kmp_initialize_root(root); 3731 3732 /* setup new root thread structure */ 3733 if (root->r.r_uber_thread) { 3734 root_thread = root->r.r_uber_thread; 3735 } else { 3736 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3737 if (__kmp_storage_map) { 3738 __kmp_print_thread_storage_map(root_thread, gtid); 3739 } 3740 root_thread->th.th_info.ds.ds_gtid = gtid; 3741 #if OMPT_SUPPORT 3742 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3743 #endif 3744 root_thread->th.th_root = root; 3745 if (__kmp_env_consistency_check) { 3746 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3747 } 3748 #if USE_FAST_MEMORY 3749 __kmp_initialize_fast_memory(root_thread); 3750 #endif /* USE_FAST_MEMORY */ 3751 3752 #if KMP_USE_BGET 3753 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3754 __kmp_initialize_bget(root_thread); 3755 #endif 3756 __kmp_init_random(root_thread); // Initialize random number generator 3757 } 3758 3759 /* setup the serial team held in reserve by the root thread */ 3760 if (!root_thread->th.th_serial_team) { 3761 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3762 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3763 root_thread->th.th_serial_team = __kmp_allocate_team( 3764 root, 1, 1, 3765 #if OMPT_SUPPORT 3766 ompt_data_none, // root parallel id 3767 #endif 3768 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3769 } 3770 KMP_ASSERT(root_thread->th.th_serial_team); 3771 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3772 root_thread->th.th_serial_team)); 3773 3774 /* drop root_thread into place */ 3775 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3776 3777 root->r.r_root_team->t.t_threads[0] = root_thread; 3778 root->r.r_hot_team->t.t_threads[0] = root_thread; 3779 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3780 // AC: the team created in reserve, not for execution (it is unused for now). 3781 root_thread->th.th_serial_team->t.t_serialized = 0; 3782 root->r.r_uber_thread = root_thread; 3783 3784 /* initialize the thread, get it ready to go */ 3785 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3786 TCW_4(__kmp_init_gtid, TRUE); 3787 3788 /* prepare the primary thread for get_gtid() */ 3789 __kmp_gtid_set_specific(gtid); 3790 3791 #if USE_ITT_BUILD 3792 __kmp_itt_thread_name(gtid); 3793 #endif /* USE_ITT_BUILD */ 3794 3795 #ifdef KMP_TDATA_GTID 3796 __kmp_gtid = gtid; 3797 #endif 3798 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3799 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3800 3801 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3802 "plain=%u\n", 3803 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3804 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3805 KMP_INIT_BARRIER_STATE)); 3806 { // Initialize barrier data. 
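    // Seed every barrier type's b_arrived counter with KMP_INIT_BARRIER_STATE
    // so the root starts from a known state; the hot team's fork/join barrier
    // is asserted to match right after this block.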
3807 int b; 3808 for (b = 0; b < bs_last_barrier; ++b) { 3809 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3810 #if USE_DEBUGGER 3811 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3812 #endif 3813 } 3814 } 3815 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3816 KMP_INIT_BARRIER_STATE); 3817 3818 #if KMP_AFFINITY_SUPPORTED 3819 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3820 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3821 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3822 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3823 #endif /* KMP_AFFINITY_SUPPORTED */ 3824 root_thread->th.th_def_allocator = __kmp_def_allocator; 3825 root_thread->th.th_prev_level = 0; 3826 root_thread->th.th_prev_num_threads = 1; 3827 3828 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3829 tmp->cg_root = root_thread; 3830 tmp->cg_thread_limit = __kmp_cg_max_nth; 3831 tmp->cg_nthreads = 1; 3832 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3833 " cg_nthreads init to 1\n", 3834 root_thread, tmp)); 3835 tmp->up = NULL; 3836 root_thread->th.th_cg_roots = tmp; 3837 3838 __kmp_root_counter++; 3839 3840 #if OMPT_SUPPORT 3841 if (!initial_thread && ompt_enabled.enabled) { 3842 3843 kmp_info_t *root_thread = ompt_get_thread(); 3844 3845 ompt_set_thread_state(root_thread, ompt_state_overhead); 3846 3847 if (ompt_enabled.ompt_callback_thread_begin) { 3848 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3849 ompt_thread_initial, __ompt_get_thread_data_internal()); 3850 } 3851 ompt_data_t *task_data; 3852 ompt_data_t *parallel_data; 3853 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3854 NULL); 3855 if (ompt_enabled.ompt_callback_implicit_task) { 3856 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3857 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3858 } 3859 3860 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3861 } 3862 #endif 3863 #if OMPD_SUPPORT 3864 if (ompd_state & OMPD_ENABLE_BP) 3865 ompd_bp_thread_begin(); 3866 #endif 3867 3868 KMP_MB(); 3869 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3870 3871 return gtid; 3872 } 3873 3874 #if KMP_NESTED_HOT_TEAMS 3875 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3876 const int max_level) { 3877 int i, n, nth; 3878 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3879 if (!hot_teams || !hot_teams[level].hot_team) { 3880 return 0; 3881 } 3882 KMP_DEBUG_ASSERT(level < max_level); 3883 kmp_team_t *team = hot_teams[level].hot_team; 3884 nth = hot_teams[level].hot_team_nth; 3885 n = nth - 1; // primary thread is not freed 3886 if (level < max_level - 1) { 3887 for (i = 0; i < nth; ++i) { 3888 kmp_info_t *th = team->t.t_threads[i]; 3889 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3890 if (i > 0 && th->th.th_hot_teams) { 3891 __kmp_free(th->th.th_hot_teams); 3892 th->th.th_hot_teams = NULL; 3893 } 3894 } 3895 } 3896 __kmp_free_team(root, team, NULL); 3897 return n; 3898 } 3899 #endif 3900 3901 // Resets a root thread and clear its root and hot teams. 3902 // Returns the number of __kmp_threads entries directly and indirectly freed. 
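// Teardown order matters here: the root team is freed first, then any nested
// hot teams, then the hot team itself; task teams are drained before the uber
// thread is reaped so no worker is still referencing structures that are
// about to disappear.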
3903 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3904 kmp_team_t *root_team = root->r.r_root_team; 3905 kmp_team_t *hot_team = root->r.r_hot_team; 3906 int n = hot_team->t.t_nproc; 3907 int i; 3908 3909 KMP_DEBUG_ASSERT(!root->r.r_active); 3910 3911 root->r.r_root_team = NULL; 3912 root->r.r_hot_team = NULL; 3913 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3914 // before call to __kmp_free_team(). 3915 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3916 #if KMP_NESTED_HOT_TEAMS 3917 if (__kmp_hot_teams_max_level > 3918 0) { // need to free nested hot teams and their threads if any 3919 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3920 kmp_info_t *th = hot_team->t.t_threads[i]; 3921 if (__kmp_hot_teams_max_level > 1) { 3922 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3923 } 3924 if (th->th.th_hot_teams) { 3925 __kmp_free(th->th.th_hot_teams); 3926 th->th.th_hot_teams = NULL; 3927 } 3928 } 3929 } 3930 #endif 3931 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3932 3933 // Before we can reap the thread, we need to make certain that all other 3934 // threads in the teams that had this root as ancestor have stopped trying to 3935 // steal tasks. 3936 if (__kmp_tasking_mode != tskm_immediate_exec) { 3937 __kmp_wait_to_unref_task_teams(); 3938 } 3939 3940 #if KMP_OS_WINDOWS 3941 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3942 KA_TRACE( 3943 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3944 "\n", 3945 (LPVOID) & (root->r.r_uber_thread->th), 3946 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3947 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3948 #endif /* KMP_OS_WINDOWS */ 3949 3950 #if OMPD_SUPPORT 3951 if (ompd_state & OMPD_ENABLE_BP) 3952 ompd_bp_thread_end(); 3953 #endif 3954 3955 #if OMPT_SUPPORT 3956 ompt_data_t *task_data; 3957 ompt_data_t *parallel_data; 3958 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3959 NULL); 3960 if (ompt_enabled.ompt_callback_implicit_task) { 3961 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3962 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3963 } 3964 if (ompt_enabled.ompt_callback_thread_end) { 3965 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3966 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3967 } 3968 #endif 3969 3970 TCW_4(__kmp_nth, 3971 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3972 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3973 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3974 " to %d\n", 3975 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3976 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3977 if (i == 1) { 3978 // need to free contention group structure 3979 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3980 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3981 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3982 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3983 root->r.r_uber_thread->th.th_cg_roots = NULL; 3984 } 3985 __kmp_reap_thread(root->r.r_uber_thread, 1); 3986 3987 // We canot put root thread to __kmp_thread_pool, so we have to reap it 3988 // instead of freeing. 
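  // By this point the contention-group bookkeeping above has run: cg_nthreads
  // was post-decremented, so i == 1 meant the uber thread was the last member
  // and its kmp_cg_root_t node has already been freed.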
3989 root->r.r_uber_thread = NULL; 3990 /* mark root as no longer in use */ 3991 root->r.r_begin = FALSE; 3992 3993 return n; 3994 } 3995 3996 void __kmp_unregister_root_current_thread(int gtid) { 3997 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3998 /* this lock should be ok, since unregister_root_current_thread is never 3999 called during an abort, only during a normal close. furthermore, if you 4000 have the forkjoin lock, you should never try to get the initz lock */ 4001 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4002 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4003 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4004 "exiting T#%d\n", 4005 gtid)); 4006 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4007 return; 4008 } 4009 kmp_root_t *root = __kmp_root[gtid]; 4010 4011 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4012 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4013 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4014 KMP_ASSERT(root->r.r_active == FALSE); 4015 4016 KMP_MB(); 4017 4018 kmp_info_t *thread = __kmp_threads[gtid]; 4019 kmp_team_t *team = thread->th.th_team; 4020 kmp_task_team_t *task_team = thread->th.th_task_team; 4021 4022 // we need to wait for the proxy tasks before finishing the thread 4023 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4024 #if OMPT_SUPPORT 4025 // the runtime is shutting down so we won't report any events 4026 thread->th.ompt_thread_info.state = ompt_state_undefined; 4027 #endif 4028 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4029 } 4030 4031 __kmp_reset_root(gtid, root); 4032 4033 KMP_MB(); 4034 KC_TRACE(10, 4035 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4036 4037 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4038 } 4039 4040 #if KMP_OS_WINDOWS 4041 /* __kmp_forkjoin_lock must be already held 4042 Unregisters a root thread that is not the current thread. Returns the number 4043 of __kmp_threads entries freed as a result. 
*/ 4044 static int __kmp_unregister_root_other_thread(int gtid) { 4045 kmp_root_t *root = __kmp_root[gtid]; 4046 int r; 4047 4048 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4049 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4050 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4051 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4052 KMP_ASSERT(root->r.r_active == FALSE); 4053 4054 r = __kmp_reset_root(gtid, root); 4055 KC_TRACE(10, 4056 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4057 return r; 4058 } 4059 #endif 4060 4061 #if KMP_DEBUG 4062 void __kmp_task_info() { 4063 4064 kmp_int32 gtid = __kmp_entry_gtid(); 4065 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4066 kmp_info_t *this_thr = __kmp_threads[gtid]; 4067 kmp_team_t *steam = this_thr->th.th_serial_team; 4068 kmp_team_t *team = this_thr->th.th_team; 4069 4070 __kmp_printf( 4071 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4072 "ptask=%p\n", 4073 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4074 team->t.t_implicit_task_taskdata[tid].td_parent); 4075 } 4076 #endif // KMP_DEBUG 4077 4078 /* TODO optimize with one big memclr, take out what isn't needed, split 4079 responsibility to workers as much as possible, and delay initialization of 4080 features as much as possible */ 4081 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4082 int tid, int gtid) { 4083 /* this_thr->th.th_info.ds.ds_gtid is setup in 4084 kmp_allocate_thread/create_worker. 4085 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4086 KMP_DEBUG_ASSERT(this_thr != NULL); 4087 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4088 KMP_DEBUG_ASSERT(team); 4089 KMP_DEBUG_ASSERT(team->t.t_threads); 4090 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4091 kmp_info_t *master = team->t.t_threads[0]; 4092 KMP_DEBUG_ASSERT(master); 4093 KMP_DEBUG_ASSERT(master->th.th_root); 4094 4095 KMP_MB(); 4096 4097 TCW_SYNC_PTR(this_thr->th.th_team, team); 4098 4099 this_thr->th.th_info.ds.ds_tid = tid; 4100 this_thr->th.th_set_nproc = 0; 4101 if (__kmp_tasking_mode != tskm_immediate_exec) 4102 // When tasking is possible, threads are not safe to reap until they are 4103 // done tasking; this will be set when tasking code is exited in wait 4104 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4105 else // no tasking --> always safe to reap 4106 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4107 this_thr->th.th_set_proc_bind = proc_bind_default; 4108 #if KMP_AFFINITY_SUPPORTED 4109 this_thr->th.th_new_place = this_thr->th.th_current_place; 4110 #endif 4111 this_thr->th.th_root = master->th.th_root; 4112 4113 /* setup the thread's cache of the team structure */ 4114 this_thr->th.th_team_nproc = team->t.t_nproc; 4115 this_thr->th.th_team_master = master; 4116 this_thr->th.th_team_serialized = team->t.t_serialized; 4117 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4118 4119 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4120 4121 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4122 tid, gtid, this_thr, this_thr->th.th_current_task)); 4123 4124 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4125 team, tid, TRUE); 4126 4127 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4128 tid, gtid, this_thr, this_thr->th.th_current_task)); 4129 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4130 // __kmp_initialize_team()? 
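  // What follows caches this thread's slot in the team's dispatch array,
  // lazily allocates its th_pri_common table, and, when the worker is joining
  // a different contention group, re-points th_cg_roots at the primary
  // thread's node (releasing the old node once its count drops to zero).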
4131 4132 /* TODO no worksharing in speculative threads */ 4133 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4134 4135 this_thr->th.th_local.this_construct = 0; 4136 4137 if (!this_thr->th.th_pri_common) { 4138 this_thr->th.th_pri_common = 4139 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4140 if (__kmp_storage_map) { 4141 __kmp_print_storage_map_gtid( 4142 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4143 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4144 } 4145 this_thr->th.th_pri_head = NULL; 4146 } 4147 4148 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4149 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4150 // Make new thread's CG root same as primary thread's 4151 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4152 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4153 if (tmp) { 4154 // worker changes CG, need to check if old CG should be freed 4155 int i = tmp->cg_nthreads--; 4156 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4157 " on node %p of thread %p to %d\n", 4158 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4159 if (i == 1) { 4160 __kmp_free(tmp); // last thread left CG --> free it 4161 } 4162 } 4163 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4164 // Increment new thread's CG root's counter to add the new thread 4165 this_thr->th.th_cg_roots->cg_nthreads++; 4166 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4167 " node %p of thread %p to %d\n", 4168 this_thr, this_thr->th.th_cg_roots, 4169 this_thr->th.th_cg_roots->cg_root, 4170 this_thr->th.th_cg_roots->cg_nthreads)); 4171 this_thr->th.th_current_task->td_icvs.thread_limit = 4172 this_thr->th.th_cg_roots->cg_thread_limit; 4173 } 4174 4175 /* Initialize dynamic dispatch */ 4176 { 4177 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4178 // Use team max_nproc since this will never change for the team. 4179 size_t disp_size = 4180 sizeof(dispatch_private_info_t) * 4181 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4182 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4183 team->t.t_max_nproc)); 4184 KMP_ASSERT(dispatch); 4185 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4186 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4187 4188 dispatch->th_disp_index = 0; 4189 dispatch->th_doacross_buf_idx = 0; 4190 if (!dispatch->th_disp_buffer) { 4191 dispatch->th_disp_buffer = 4192 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4193 4194 if (__kmp_storage_map) { 4195 __kmp_print_storage_map_gtid( 4196 gtid, &dispatch->th_disp_buffer[0], 4197 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4198 ? 
1 4199 : __kmp_dispatch_num_buffers], 4200 disp_size, 4201 "th_%d.th_dispatch.th_disp_buffer " 4202 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4203 gtid, team->t.t_id, gtid); 4204 } 4205 } else { 4206 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4207 } 4208 4209 dispatch->th_dispatch_pr_current = 0; 4210 dispatch->th_dispatch_sh_current = 0; 4211 4212 dispatch->th_deo_fcn = 0; /* ORDERED */ 4213 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4214 } 4215 4216 this_thr->th.th_next_pool = NULL; 4217 4218 if (!this_thr->th.th_task_state_memo_stack) { 4219 size_t i; 4220 this_thr->th.th_task_state_memo_stack = 4221 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4222 this_thr->th.th_task_state_top = 0; 4223 this_thr->th.th_task_state_stack_sz = 4; 4224 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4225 ++i) // zero init the stack 4226 this_thr->th.th_task_state_memo_stack[i] = 0; 4227 } 4228 4229 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4230 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4231 4232 KMP_MB(); 4233 } 4234 4235 /* allocate a new thread for the requesting team. this is only called from 4236 within a forkjoin critical section. we will first try to get an available 4237 thread from the thread pool. if none is available, we will fork a new one 4238 assuming we are able to create a new one. this should be assured, as the 4239 caller should check on this first. */ 4240 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4241 int new_tid) { 4242 kmp_team_t *serial_team; 4243 kmp_info_t *new_thr; 4244 int new_gtid; 4245 4246 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4247 KMP_DEBUG_ASSERT(root && team); 4248 #if !KMP_NESTED_HOT_TEAMS 4249 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4250 #endif 4251 KMP_MB(); 4252 4253 /* first, try to get one from the thread pool */ 4254 if (__kmp_thread_pool) { 4255 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4256 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4257 if (new_thr == __kmp_thread_pool_insert_pt) { 4258 __kmp_thread_pool_insert_pt = NULL; 4259 } 4260 TCW_4(new_thr->th.th_in_pool, FALSE); 4261 __kmp_suspend_initialize_thread(new_thr); 4262 __kmp_lock_suspend_mx(new_thr); 4263 if (new_thr->th.th_active_in_pool == TRUE) { 4264 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4265 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4266 new_thr->th.th_active_in_pool = FALSE; 4267 } 4268 __kmp_unlock_suspend_mx(new_thr); 4269 4270 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4271 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4272 KMP_ASSERT(!new_thr->th.th_team); 4273 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4274 4275 /* setup the thread structure */ 4276 __kmp_initialize_info(new_thr, team, new_tid, 4277 new_thr->th.th_info.ds.ds_gtid); 4278 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4279 4280 TCW_4(__kmp_nth, __kmp_nth + 1); 4281 4282 new_thr->th.th_task_state = 0; 4283 new_thr->th.th_task_state_top = 0; 4284 new_thr->th.th_task_state_stack_sz = 4; 4285 4286 #ifdef KMP_ADJUST_BLOCKTIME 4287 /* Adjust blocktime back to zero if necessary */ 4288 /* Middle initialization might not have occurred yet */ 4289 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4290 if (__kmp_nth > __kmp_avail_proc) { 4291 __kmp_zero_bt = TRUE; 4292 } 4293 } 4294 #endif /* KMP_ADJUST_BLOCKTIME */ 4295 4296 #if KMP_DEBUG 4297 // If thread entered pool via __kmp_free_thread, wait_flag should != 4298 // KMP_BARRIER_PARENT_FLAG. 
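    // The loop below checks every barrier type: a thread taken from the pool
    // must no longer carry KMP_BARRIER_PARENT_FLAG in wait_flag, i.e. it has
    // fully left its previous team's barrier machinery.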
4299 int b; 4300 kmp_balign_t *balign = new_thr->th.th_bar; 4301 for (b = 0; b < bs_last_barrier; ++b) 4302 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4303 #endif 4304 4305 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4306 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4307 4308 KMP_MB(); 4309 return new_thr; 4310 } 4311 4312 /* no, well fork a new one */ 4313 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4314 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4315 4316 #if KMP_USE_MONITOR 4317 // If this is the first worker thread the RTL is creating, then also 4318 // launch the monitor thread. We try to do this as early as possible. 4319 if (!TCR_4(__kmp_init_monitor)) { 4320 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4321 if (!TCR_4(__kmp_init_monitor)) { 4322 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4323 TCW_4(__kmp_init_monitor, 1); 4324 __kmp_create_monitor(&__kmp_monitor); 4325 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4326 #if KMP_OS_WINDOWS 4327 // AC: wait until monitor has started. This is a fix for CQ232808. 4328 // The reason is that if the library is loaded/unloaded in a loop with 4329 // small (parallel) work in between, then there is high probability that 4330 // monitor thread started after the library shutdown. At shutdown it is 4331 // too late to cope with the problem, because when the primary thread is 4332 // in DllMain (process detach) the monitor has no chances to start (it is 4333 // blocked), and primary thread has no means to inform the monitor that 4334 // the library has gone, because all the memory which the monitor can 4335 // access is going to be released/reset. 4336 while (TCR_4(__kmp_init_monitor) < 2) { 4337 KMP_YIELD(TRUE); 4338 } 4339 KF_TRACE(10, ("after monitor thread has started\n")); 4340 #endif 4341 } 4342 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4343 } 4344 #endif 4345 4346 KMP_MB(); 4347 4348 { 4349 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4350 ? 1 4351 : __kmp_hidden_helper_threads_num + 1; 4352 4353 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4354 ++new_gtid) { 4355 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4356 } 4357 4358 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4359 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4360 } 4361 } 4362 4363 /* allocate space for it. 
*/ 4364 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4365 4366 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4367 4368 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4369 // suppress race conditions detection on synchronization flags in debug mode 4370 // this helps to analyze library internals eliminating false positives 4371 __itt_suppress_mark_range( 4372 __itt_suppress_range, __itt_suppress_threading_errors, 4373 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4374 __itt_suppress_mark_range( 4375 __itt_suppress_range, __itt_suppress_threading_errors, 4376 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4377 #if KMP_OS_WINDOWS 4378 __itt_suppress_mark_range( 4379 __itt_suppress_range, __itt_suppress_threading_errors, 4380 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4381 #else 4382 __itt_suppress_mark_range(__itt_suppress_range, 4383 __itt_suppress_threading_errors, 4384 &new_thr->th.th_suspend_init_count, 4385 sizeof(new_thr->th.th_suspend_init_count)); 4386 #endif 4387 // TODO: check if we need to also suppress b_arrived flags 4388 __itt_suppress_mark_range(__itt_suppress_range, 4389 __itt_suppress_threading_errors, 4390 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4391 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4392 __itt_suppress_mark_range(__itt_suppress_range, 4393 __itt_suppress_threading_errors, 4394 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4395 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4396 __itt_suppress_mark_range(__itt_suppress_range, 4397 __itt_suppress_threading_errors, 4398 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4399 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4400 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4401 if (__kmp_storage_map) { 4402 __kmp_print_thread_storage_map(new_thr, new_gtid); 4403 } 4404 4405 // add the reserve serialized team, initialized from the team's primary thread 4406 { 4407 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4408 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4409 new_thr->th.th_serial_team = serial_team = 4410 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4411 #if OMPT_SUPPORT 4412 ompt_data_none, // root parallel id 4413 #endif 4414 proc_bind_default, &r_icvs, 4415 0 USE_NESTED_HOT_ARG(NULL)); 4416 } 4417 KMP_ASSERT(serial_team); 4418 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4419 // execution (it is unused for now). 
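  // The reserve serial team mirrors the one __kmp_register_root builds for a
  // root thread: a one-slot team whose ICVs were copied from the parent
  // team's primary thread, kept on hand so a later serialized parallel region
  // does not need to allocate one.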
4420 serial_team->t.t_threads[0] = new_thr; 4421 KF_TRACE(10, 4422 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4423 new_thr)); 4424 4425 /* setup the thread structures */ 4426 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4427 4428 #if USE_FAST_MEMORY 4429 __kmp_initialize_fast_memory(new_thr); 4430 #endif /* USE_FAST_MEMORY */ 4431 4432 #if KMP_USE_BGET 4433 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4434 __kmp_initialize_bget(new_thr); 4435 #endif 4436 4437 __kmp_init_random(new_thr); // Initialize random number generator 4438 4439 /* Initialize these only once when thread is grabbed for a team allocation */ 4440 KA_TRACE(20, 4441 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4442 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4443 4444 int b; 4445 kmp_balign_t *balign = new_thr->th.th_bar; 4446 for (b = 0; b < bs_last_barrier; ++b) { 4447 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4448 balign[b].bb.team = NULL; 4449 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4450 balign[b].bb.use_oncore_barrier = 0; 4451 } 4452 4453 new_thr->th.th_spin_here = FALSE; 4454 new_thr->th.th_next_waiting = 0; 4455 #if KMP_OS_UNIX 4456 new_thr->th.th_blocking = false; 4457 #endif 4458 4459 #if KMP_AFFINITY_SUPPORTED 4460 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4461 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4462 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4463 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4464 #endif 4465 new_thr->th.th_def_allocator = __kmp_def_allocator; 4466 new_thr->th.th_prev_level = 0; 4467 new_thr->th.th_prev_num_threads = 1; 4468 4469 TCW_4(new_thr->th.th_in_pool, FALSE); 4470 new_thr->th.th_active_in_pool = FALSE; 4471 TCW_4(new_thr->th.th_active, TRUE); 4472 4473 /* adjust the global counters */ 4474 __kmp_all_nth++; 4475 __kmp_nth++; 4476 4477 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4478 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4479 if (__kmp_adjust_gtid_mode) { 4480 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4481 if (TCR_4(__kmp_gtid_mode) != 2) { 4482 TCW_4(__kmp_gtid_mode, 2); 4483 } 4484 } else { 4485 if (TCR_4(__kmp_gtid_mode) != 1) { 4486 TCW_4(__kmp_gtid_mode, 1); 4487 } 4488 } 4489 } 4490 4491 #ifdef KMP_ADJUST_BLOCKTIME 4492 /* Adjust blocktime back to zero if necessary */ 4493 /* Middle initialization might not have occurred yet */ 4494 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4495 if (__kmp_nth > __kmp_avail_proc) { 4496 __kmp_zero_bt = TRUE; 4497 } 4498 } 4499 #endif /* KMP_ADJUST_BLOCKTIME */ 4500 4501 /* actually fork it and create the new worker thread */ 4502 KF_TRACE( 4503 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4504 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4505 KF_TRACE(10, 4506 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4507 4508 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4509 new_gtid)); 4510 KMP_MB(); 4511 return new_thr; 4512 } 4513 4514 /* Reinitialize team for reuse. 4515 The hot team code calls this case at every fork barrier, so EPCC barrier 4516 test are extremely sensitive to changes in it, esp. writes to the team 4517 struct, which cause a cache invalidation in all threads. 4518 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
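   KMP_CHECK_UPDATE below is part of that: it skips the store entirely when
   the new value matches the old one, so re-running this on an unchanged hot
   team does not dirty the team's cache lines.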
*/ 4519 static void __kmp_reinitialize_team(kmp_team_t *team, 4520 kmp_internal_control_t *new_icvs, 4521 ident_t *loc) { 4522 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4523 team->t.t_threads[0], team)); 4524 KMP_DEBUG_ASSERT(team && new_icvs); 4525 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4526 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4527 4528 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4529 // Copy ICVs to the primary thread's implicit taskdata 4530 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4531 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4532 4533 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4534 team->t.t_threads[0], team)); 4535 } 4536 4537 /* Initialize the team data structure. 4538 This assumes the t_threads and t_max_nproc are already set. 4539 Also, we don't touch the arguments */ 4540 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4541 kmp_internal_control_t *new_icvs, 4542 ident_t *loc) { 4543 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4544 4545 /* verify */ 4546 KMP_DEBUG_ASSERT(team); 4547 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4548 KMP_DEBUG_ASSERT(team->t.t_threads); 4549 KMP_MB(); 4550 4551 team->t.t_master_tid = 0; /* not needed */ 4552 /* team->t.t_master_bar; not needed */ 4553 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4554 team->t.t_nproc = new_nproc; 4555 4556 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4557 team->t.t_next_pool = NULL; 4558 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4559 * up hot team */ 4560 4561 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4562 team->t.t_invoke = NULL; /* not needed */ 4563 4564 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4565 team->t.t_sched.sched = new_icvs->sched.sched; 4566 4567 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4568 team->t.t_fp_control_saved = FALSE; /* not needed */ 4569 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4570 team->t.t_mxcsr = 0; /* not needed */ 4571 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4572 4573 team->t.t_construct = 0; 4574 4575 team->t.t_ordered.dt.t_value = 0; 4576 team->t.t_master_active = FALSE; 4577 4578 #ifdef KMP_DEBUG 4579 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4580 #endif 4581 #if KMP_OS_WINDOWS 4582 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4583 #endif 4584 4585 team->t.t_control_stack_top = NULL; 4586 4587 __kmp_reinitialize_team(team, new_icvs, loc); 4588 4589 KMP_MB(); 4590 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4591 } 4592 4593 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4594 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4595 static void 4596 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4597 if (KMP_AFFINITY_CAPABLE()) { 4598 int status; 4599 if (old_mask != NULL) { 4600 status = __kmp_get_system_affinity(old_mask, TRUE); 4601 int error = errno; 4602 if (status != 0) { 4603 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4604 __kmp_msg_null); 4605 } 4606 } 4607 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4608 } 4609 } 4610 #endif 4611 4612 #if KMP_AFFINITY_SUPPORTED 4613 4614 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4615 // It calculates the worker + primary thread's partition based upon the parent 4616 // thread's partition, and binds each worker to a thread in their partition. 4617 // The primary thread's partition should already include its current binding. 4618 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4619 // Do not partition places for the hidden helper team 4620 if (KMP_HIDDEN_HELPER_TEAM(team)) 4621 return; 4622 // Copy the primary thread's place partition to the team struct 4623 kmp_info_t *master_th = team->t.t_threads[0]; 4624 KMP_DEBUG_ASSERT(master_th != NULL); 4625 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4626 int first_place = master_th->th.th_first_place; 4627 int last_place = master_th->th.th_last_place; 4628 int masters_place = master_th->th.th_current_place; 4629 team->t.t_first_place = first_place; 4630 team->t.t_last_place = last_place; 4631 4632 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4633 "bound to place %d partition = [%d,%d]\n", 4634 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4635 team->t.t_id, masters_place, first_place, last_place)); 4636 4637 switch (proc_bind) { 4638 4639 case proc_bind_default: 4640 // Serial teams might have the proc_bind policy set to proc_bind_default. 4641 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4642 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4643 break; 4644 4645 case proc_bind_primary: { 4646 int f; 4647 int n_th = team->t.t_nproc; 4648 for (f = 1; f < n_th; f++) { 4649 kmp_info_t *th = team->t.t_threads[f]; 4650 KMP_DEBUG_ASSERT(th != NULL); 4651 th->th.th_first_place = first_place; 4652 th->th.th_last_place = last_place; 4653 th->th.th_new_place = masters_place; 4654 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4655 team->t.t_display_affinity != 1) { 4656 team->t.t_display_affinity = 1; 4657 } 4658 4659 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4660 "partition = [%d,%d]\n", 4661 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4662 f, masters_place, first_place, last_place)); 4663 } 4664 } break; 4665 4666 case proc_bind_close: { 4667 int f; 4668 int n_th = team->t.t_nproc; 4669 int n_places; 4670 if (first_place <= last_place) { 4671 n_places = last_place - first_place + 1; 4672 } else { 4673 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4674 } 4675 if (n_th <= n_places) { 4676 int place = masters_place; 4677 for (f = 1; f < n_th; f++) { 4678 kmp_info_t *th = team->t.t_threads[f]; 4679 KMP_DEBUG_ASSERT(th != NULL); 4680 4681 if (place == last_place) { 4682 place = first_place; 4683 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4684 place = 0; 4685 } else { 4686 place++; 4687 } 4688 th->th.th_first_place = first_place; 4689 th->th.th_last_place = last_place; 4690 th->th.th_new_place = place; 4691 if (__kmp_display_affinity && place != th->th.th_current_place && 4692 team->t.t_display_affinity != 1) { 4693 team->t.t_display_affinity = 1; 4694 } 4695 4696 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4697 "partition = [%d,%d]\n", 4698 __kmp_gtid_from_thread(team->t.t_threads[f]), 4699 team->t.t_id, f, place, first_place, last_place)); 4700 } 4701 } else { 4702 int S, rem, gap, s_count; 4703 S = n_th / n_places; 4704 s_count = 0; 4705 rem = n_th - (S * n_places); 4706 gap = rem > 0 ? 
n_places / rem : n_places; 4707 int place = masters_place; 4708 int gap_ct = gap; 4709 for (f = 0; f < n_th; f++) { 4710 kmp_info_t *th = team->t.t_threads[f]; 4711 KMP_DEBUG_ASSERT(th != NULL); 4712 4713 th->th.th_first_place = first_place; 4714 th->th.th_last_place = last_place; 4715 th->th.th_new_place = place; 4716 if (__kmp_display_affinity && place != th->th.th_current_place && 4717 team->t.t_display_affinity != 1) { 4718 team->t.t_display_affinity = 1; 4719 } 4720 s_count++; 4721 4722 if ((s_count == S) && rem && (gap_ct == gap)) { 4723 // do nothing, add an extra thread to place on next iteration 4724 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4725 // we added an extra thread to this place; move to next place 4726 if (place == last_place) { 4727 place = first_place; 4728 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4729 place = 0; 4730 } else { 4731 place++; 4732 } 4733 s_count = 0; 4734 gap_ct = 1; 4735 rem--; 4736 } else if (s_count == S) { // place full; don't add extra 4737 if (place == last_place) { 4738 place = first_place; 4739 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4740 place = 0; 4741 } else { 4742 place++; 4743 } 4744 gap_ct++; 4745 s_count = 0; 4746 } 4747 4748 KA_TRACE(100, 4749 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4750 "partition = [%d,%d]\n", 4751 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4752 th->th.th_new_place, first_place, last_place)); 4753 } 4754 KMP_DEBUG_ASSERT(place == masters_place); 4755 } 4756 } break; 4757 4758 case proc_bind_spread: { 4759 int f; 4760 int n_th = team->t.t_nproc; 4761 int n_places; 4762 int thidx; 4763 if (first_place <= last_place) { 4764 n_places = last_place - first_place + 1; 4765 } else { 4766 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4767 } 4768 if (n_th <= n_places) { 4769 int place = -1; 4770 4771 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4772 int S = n_places / n_th; 4773 int s_count, rem, gap, gap_ct; 4774 4775 place = masters_place; 4776 rem = n_places - n_th * S; 4777 gap = rem ? 
n_th / rem : 1; 4778 gap_ct = gap; 4779 thidx = n_th; 4780 if (update_master_only == 1) 4781 thidx = 1; 4782 for (f = 0; f < thidx; f++) { 4783 kmp_info_t *th = team->t.t_threads[f]; 4784 KMP_DEBUG_ASSERT(th != NULL); 4785 4786 th->th.th_first_place = place; 4787 th->th.th_new_place = place; 4788 if (__kmp_display_affinity && place != th->th.th_current_place && 4789 team->t.t_display_affinity != 1) { 4790 team->t.t_display_affinity = 1; 4791 } 4792 s_count = 1; 4793 while (s_count < S) { 4794 if (place == last_place) { 4795 place = first_place; 4796 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4797 place = 0; 4798 } else { 4799 place++; 4800 } 4801 s_count++; 4802 } 4803 if (rem && (gap_ct == gap)) { 4804 if (place == last_place) { 4805 place = first_place; 4806 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4807 place = 0; 4808 } else { 4809 place++; 4810 } 4811 rem--; 4812 gap_ct = 0; 4813 } 4814 th->th.th_last_place = place; 4815 gap_ct++; 4816 4817 if (place == last_place) { 4818 place = first_place; 4819 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4820 place = 0; 4821 } else { 4822 place++; 4823 } 4824 4825 KA_TRACE(100, 4826 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4827 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4828 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4829 f, th->th.th_new_place, th->th.th_first_place, 4830 th->th.th_last_place, __kmp_affinity_num_masks)); 4831 } 4832 } else { 4833 /* Having uniform space of available computation places I can create 4834 T partitions of round(P/T) size and put threads into the first 4835 place of each partition. */ 4836 double current = static_cast<double>(masters_place); 4837 double spacing = 4838 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4839 int first, last; 4840 kmp_info_t *th; 4841 4842 thidx = n_th + 1; 4843 if (update_master_only == 1) 4844 thidx = 1; 4845 for (f = 0; f < thidx; f++) { 4846 first = static_cast<int>(current); 4847 last = static_cast<int>(current + spacing) - 1; 4848 KMP_DEBUG_ASSERT(last >= first); 4849 if (first >= n_places) { 4850 if (masters_place) { 4851 first -= n_places; 4852 last -= n_places; 4853 if (first == (masters_place + 1)) { 4854 KMP_DEBUG_ASSERT(f == n_th); 4855 first--; 4856 } 4857 if (last == masters_place) { 4858 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4859 last--; 4860 } 4861 } else { 4862 KMP_DEBUG_ASSERT(f == n_th); 4863 first = 0; 4864 last = 0; 4865 } 4866 } 4867 if (last >= n_places) { 4868 last = (n_places - 1); 4869 } 4870 place = first; 4871 current += spacing; 4872 if (f < n_th) { 4873 KMP_DEBUG_ASSERT(0 <= first); 4874 KMP_DEBUG_ASSERT(n_places > first); 4875 KMP_DEBUG_ASSERT(0 <= last); 4876 KMP_DEBUG_ASSERT(n_places > last); 4877 KMP_DEBUG_ASSERT(last_place >= first_place); 4878 th = team->t.t_threads[f]; 4879 KMP_DEBUG_ASSERT(th); 4880 th->th.th_first_place = first; 4881 th->th.th_new_place = place; 4882 th->th.th_last_place = last; 4883 if (__kmp_display_affinity && place != th->th.th_current_place && 4884 team->t.t_display_affinity != 1) { 4885 team->t.t_display_affinity = 1; 4886 } 4887 KA_TRACE(100, 4888 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4889 "partition = [%d,%d], spacing = %.4f\n", 4890 __kmp_gtid_from_thread(team->t.t_threads[f]), 4891 team->t.t_id, f, th->th.th_new_place, 4892 th->th.th_first_place, th->th.th_last_place, spacing)); 4893 } 4894 } 4895 } 4896 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4897 } else { 4898 int S, rem, gap, 
s_count; 4899 S = n_th / n_places; 4900 s_count = 0; 4901 rem = n_th - (S * n_places); 4902 gap = rem > 0 ? n_places / rem : n_places; 4903 int place = masters_place; 4904 int gap_ct = gap; 4905 thidx = n_th; 4906 if (update_master_only == 1) 4907 thidx = 1; 4908 for (f = 0; f < thidx; f++) { 4909 kmp_info_t *th = team->t.t_threads[f]; 4910 KMP_DEBUG_ASSERT(th != NULL); 4911 4912 th->th.th_first_place = place; 4913 th->th.th_last_place = place; 4914 th->th.th_new_place = place; 4915 if (__kmp_display_affinity && place != th->th.th_current_place && 4916 team->t.t_display_affinity != 1) { 4917 team->t.t_display_affinity = 1; 4918 } 4919 s_count++; 4920 4921 if ((s_count == S) && rem && (gap_ct == gap)) { 4922 // do nothing, add an extra thread to place on next iteration 4923 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4924 // we added an extra thread to this place; move on to next place 4925 if (place == last_place) { 4926 place = first_place; 4927 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4928 place = 0; 4929 } else { 4930 place++; 4931 } 4932 s_count = 0; 4933 gap_ct = 1; 4934 rem--; 4935 } else if (s_count == S) { // place is full; don't add extra thread 4936 if (place == last_place) { 4937 place = first_place; 4938 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4939 place = 0; 4940 } else { 4941 place++; 4942 } 4943 gap_ct++; 4944 s_count = 0; 4945 } 4946 4947 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4948 "partition = [%d,%d]\n", 4949 __kmp_gtid_from_thread(team->t.t_threads[f]), 4950 team->t.t_id, f, th->th.th_new_place, 4951 th->th.th_first_place, th->th.th_last_place)); 4952 } 4953 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4954 } 4955 } break; 4956 4957 default: 4958 break; 4959 } 4960 4961 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4962 } 4963 4964 #endif // KMP_AFFINITY_SUPPORTED 4965 4966 /* allocate a new team data structure to use. take one off of the free pool if 4967 available */ 4968 kmp_team_t * 4969 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4970 #if OMPT_SUPPORT 4971 ompt_data_t ompt_parallel_data, 4972 #endif 4973 kmp_proc_bind_t new_proc_bind, 4974 kmp_internal_control_t *new_icvs, 4975 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4976 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4977 int f; 4978 kmp_team_t *team; 4979 int use_hot_team = !root->r.r_active; 4980 int level = 0; 4981 4982 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4983 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4984 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4985 KMP_MB(); 4986 4987 #if KMP_NESTED_HOT_TEAMS 4988 kmp_hot_team_ptr_t *hot_teams; 4989 if (master) { 4990 team = master->th.th_team; 4991 level = team->t.t_active_level; 4992 if (master->th.th_teams_microtask) { // in teams construct? 
4993 if (master->th.th_teams_size.nteams > 1 && 4994 ( // #teams > 1 4995 team->t.t_pkfn == 4996 (microtask_t)__kmp_teams_master || // inner fork of the teams 4997 master->th.th_teams_level < 4998 team->t.t_level)) { // or nested parallel inside the teams 4999 ++level; // not increment if #teams==1, or for outer fork of the teams; 5000 // increment otherwise 5001 } 5002 } 5003 hot_teams = master->th.th_hot_teams; 5004 if (level < __kmp_hot_teams_max_level && hot_teams && 5005 hot_teams[level].hot_team) { 5006 // hot team has already been allocated for given level 5007 use_hot_team = 1; 5008 } else { 5009 use_hot_team = 0; 5010 } 5011 } else { 5012 // check we won't access uninitialized hot_teams, just in case 5013 KMP_DEBUG_ASSERT(new_nproc == 1); 5014 } 5015 #endif 5016 // Optimization to use a "hot" team 5017 if (use_hot_team && new_nproc > 1) { 5018 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5019 #if KMP_NESTED_HOT_TEAMS 5020 team = hot_teams[level].hot_team; 5021 #else 5022 team = root->r.r_hot_team; 5023 #endif 5024 #if KMP_DEBUG 5025 if (__kmp_tasking_mode != tskm_immediate_exec) { 5026 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5027 "task_team[1] = %p before reinit\n", 5028 team->t.t_task_team[0], team->t.t_task_team[1])); 5029 } 5030 #endif 5031 5032 // Has the number of threads changed? 5033 /* Let's assume the most common case is that the number of threads is 5034 unchanged, and put that case first. */ 5035 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5036 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5037 // This case can mean that omp_set_num_threads() was called and the hot 5038 // team size was already reduced, so we check the special flag 5039 if (team->t.t_size_changed == -1) { 5040 team->t.t_size_changed = 1; 5041 } else { 5042 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5043 } 5044 5045 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5046 kmp_r_sched_t new_sched = new_icvs->sched; 5047 // set primary thread's schedule as new run-time schedule 5048 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5049 5050 __kmp_reinitialize_team(team, new_icvs, 5051 root->r.r_uber_thread->th.th_ident); 5052 5053 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5054 team->t.t_threads[0], team)); 5055 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5056 5057 #if KMP_AFFINITY_SUPPORTED 5058 if ((team->t.t_size_changed == 0) && 5059 (team->t.t_proc_bind == new_proc_bind)) { 5060 if (new_proc_bind == proc_bind_spread) { 5061 __kmp_partition_places( 5062 team, 1); // add flag to update only master for spread 5063 } 5064 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5065 "proc_bind = %d, partition = [%d,%d]\n", 5066 team->t.t_id, new_proc_bind, team->t.t_first_place, 5067 team->t.t_last_place)); 5068 } else { 5069 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5070 __kmp_partition_places(team); 5071 } 5072 #else 5073 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5074 #endif /* KMP_AFFINITY_SUPPORTED */ 5075 } else if (team->t.t_nproc > new_nproc) { 5076 KA_TRACE(20, 5077 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5078 new_nproc)); 5079 5080 team->t.t_size_changed = 1; 5081 #if KMP_NESTED_HOT_TEAMS 5082 if (__kmp_hot_teams_mode == 0) { 5083 // AC: saved number of threads should correspond to team's value in this 5084 // mode, can be bigger in mode 1, when hot team has threads in reserve 5085 
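// For example (hypothetical sizes): if a hot team built with 8 threads
// shrinks to 4 after omp_set_num_threads(4), mode 0 releases the 4 extra
// workers back to the thread pool here, while mode 1 (the else branch
// below) parks them on their own b_go flag so that a later request for 8
// threads can reuse them without allocating new workers.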
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5086 hot_teams[level].hot_team_nth = new_nproc; 5087 #endif // KMP_NESTED_HOT_TEAMS 5088 /* release the extra threads we don't need any more */ 5089 for (f = new_nproc; f < team->t.t_nproc; f++) { 5090 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5091 if (__kmp_tasking_mode != tskm_immediate_exec) { 5092 // When decreasing team size, threads no longer in the team should 5093 // unref task team. 5094 team->t.t_threads[f]->th.th_task_team = NULL; 5095 } 5096 __kmp_free_thread(team->t.t_threads[f]); 5097 team->t.t_threads[f] = NULL; 5098 } 5099 #if KMP_NESTED_HOT_TEAMS 5100 } // (__kmp_hot_teams_mode == 0) 5101 else { 5102 // When keeping extra threads in team, switch threads to wait on own 5103 // b_go flag 5104 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5105 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5106 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5107 for (int b = 0; b < bs_last_barrier; ++b) { 5108 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5109 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5110 } 5111 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5112 } 5113 } 5114 } 5115 #endif // KMP_NESTED_HOT_TEAMS 5116 team->t.t_nproc = new_nproc; 5117 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5118 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5119 __kmp_reinitialize_team(team, new_icvs, 5120 root->r.r_uber_thread->th.th_ident); 5121 5122 // Update remaining threads 5123 for (f = 0; f < new_nproc; ++f) { 5124 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5125 } 5126 5127 // restore the current task state of the primary thread: should be the 5128 // implicit task 5129 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5130 team->t.t_threads[0], team)); 5131 5132 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5133 5134 #ifdef KMP_DEBUG 5135 for (f = 0; f < team->t.t_nproc; f++) { 5136 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5137 team->t.t_threads[f]->th.th_team_nproc == 5138 team->t.t_nproc); 5139 } 5140 #endif 5141 5142 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5143 #if KMP_AFFINITY_SUPPORTED 5144 __kmp_partition_places(team); 5145 #endif 5146 } else { // team->t.t_nproc < new_nproc 5147 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5148 kmp_affin_mask_t *old_mask; 5149 if (KMP_AFFINITY_CAPABLE()) { 5150 KMP_CPU_ALLOC(old_mask); 5151 } 5152 #endif 5153 5154 KA_TRACE(20, 5155 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5156 new_nproc)); 5157 5158 team->t.t_size_changed = 1; 5159 5160 #if KMP_NESTED_HOT_TEAMS 5161 int avail_threads = hot_teams[level].hot_team_nth; 5162 if (new_nproc < avail_threads) 5163 avail_threads = new_nproc; 5164 kmp_info_t **other_threads = team->t.t_threads; 5165 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5166 // Adjust barrier data of reserved threads (if any) of the team 5167 // Other data will be set in __kmp_initialize_info() below. 
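// The reserved threads slept through any barriers the smaller team
// executed while they were parked, so their per-barrier b_arrived counters
// are brought up to the team's current values here; otherwise they would
// rejoin with stale arrival counts.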
5168 int b; 5169 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5170 for (b = 0; b < bs_last_barrier; ++b) { 5171 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5172 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5173 #if USE_DEBUGGER 5174 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5175 #endif 5176 } 5177 } 5178 if (hot_teams[level].hot_team_nth >= new_nproc) { 5179 // we have all needed threads in reserve, no need to allocate any 5180 // this only possible in mode 1, cannot have reserved threads in mode 0 5181 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5182 team->t.t_nproc = new_nproc; // just get reserved threads involved 5183 } else { 5184 // we may have some threads in reserve, but not enough 5185 team->t.t_nproc = 5186 hot_teams[level] 5187 .hot_team_nth; // get reserved threads involved if any 5188 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5189 #endif // KMP_NESTED_HOT_TEAMS 5190 if (team->t.t_max_nproc < new_nproc) { 5191 /* reallocate larger arrays */ 5192 __kmp_reallocate_team_arrays(team, new_nproc); 5193 __kmp_reinitialize_team(team, new_icvs, NULL); 5194 } 5195 5196 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5197 /* Temporarily set full mask for primary thread before creation of 5198 workers. The reason is that workers inherit the affinity from the 5199 primary thread, so if a lot of workers are created on the single 5200 core quickly, they don't get a chance to set their own affinity for 5201 a long time. */ 5202 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5203 #endif 5204 5205 /* allocate new threads for the hot team */ 5206 for (f = team->t.t_nproc; f < new_nproc; f++) { 5207 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5208 KMP_DEBUG_ASSERT(new_worker); 5209 team->t.t_threads[f] = new_worker; 5210 5211 KA_TRACE(20, 5212 ("__kmp_allocate_team: team %d init T#%d arrived: " 5213 "join=%llu, plain=%llu\n", 5214 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5215 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5216 team->t.t_bar[bs_plain_barrier].b_arrived)); 5217 5218 { // Initialize barrier data for new threads. 5219 int b; 5220 kmp_balign_t *balign = new_worker->th.th_bar; 5221 for (b = 0; b < bs_last_barrier; ++b) { 5222 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5223 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5224 KMP_BARRIER_PARENT_FLAG); 5225 #if USE_DEBUGGER 5226 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5227 #endif 5228 } 5229 } 5230 } 5231 5232 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5233 if (KMP_AFFINITY_CAPABLE()) { 5234 /* Restore initial primary thread's affinity mask */ 5235 __kmp_set_system_affinity(old_mask, TRUE); 5236 KMP_CPU_FREE(old_mask); 5237 } 5238 #endif 5239 #if KMP_NESTED_HOT_TEAMS 5240 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5241 #endif // KMP_NESTED_HOT_TEAMS 5242 /* make sure everyone is syncronized */ 5243 int old_nproc = team->t.t_nproc; // save old value and use to update only 5244 // new threads below 5245 __kmp_initialize_team(team, new_nproc, new_icvs, 5246 root->r.r_uber_thread->th.th_ident); 5247 5248 /* reinitialize the threads */ 5249 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5250 for (f = 0; f < team->t.t_nproc; ++f) 5251 __kmp_initialize_info(team->t.t_threads[f], team, f, 5252 __kmp_gtid_from_tid(f, team)); 5253 5254 if (level) { // set th_task_state for new threads in nested hot team 5255 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5256 // only need to set the th_task_state for the new threads. th_task_state 5257 // for primary thread will not be accurate until after this in 5258 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5259 // get the correct value. 5260 for (f = old_nproc; f < team->t.t_nproc; ++f) 5261 team->t.t_threads[f]->th.th_task_state = 5262 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5263 } else { // set th_task_state for new threads in non-nested hot team 5264 // copy primary thread's state 5265 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5266 for (f = old_nproc; f < team->t.t_nproc; ++f) 5267 team->t.t_threads[f]->th.th_task_state = old_state; 5268 } 5269 5270 #ifdef KMP_DEBUG 5271 for (f = 0; f < team->t.t_nproc; ++f) { 5272 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5273 team->t.t_threads[f]->th.th_team_nproc == 5274 team->t.t_nproc); 5275 } 5276 #endif 5277 5278 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5279 #if KMP_AFFINITY_SUPPORTED 5280 __kmp_partition_places(team); 5281 #endif 5282 } // Check changes in number of threads 5283 5284 kmp_info_t *master = team->t.t_threads[0]; 5285 if (master->th.th_teams_microtask) { 5286 for (f = 1; f < new_nproc; ++f) { 5287 // propagate teams construct specific info to workers 5288 kmp_info_t *thr = team->t.t_threads[f]; 5289 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5290 thr->th.th_teams_level = master->th.th_teams_level; 5291 thr->th.th_teams_size = master->th.th_teams_size; 5292 } 5293 } 5294 #if KMP_NESTED_HOT_TEAMS 5295 if (level) { 5296 // Sync barrier state for nested hot teams, not needed for outermost hot 5297 // team. 5298 for (f = 1; f < new_nproc; ++f) { 5299 kmp_info_t *thr = team->t.t_threads[f]; 5300 int b; 5301 kmp_balign_t *balign = thr->th.th_bar; 5302 for (b = 0; b < bs_last_barrier; ++b) { 5303 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5304 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5305 #if USE_DEBUGGER 5306 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5307 #endif 5308 } 5309 } 5310 } 5311 #endif // KMP_NESTED_HOT_TEAMS 5312 5313 /* reallocate space for arguments if necessary */ 5314 __kmp_alloc_argv_entries(argc, team, TRUE); 5315 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5316 // The hot team re-uses the previous task team, 5317 // if untouched during the previous release->gather phase. 
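// Note that the KMP_CHECK_UPDATE(dst, src) calls above only store when the
// value actually changes (roughly "if (dst != src) dst = src"), so reusing a
// hot team with unchanged argc, proc_bind, or schedule avoids needlessly
// dirtying cache lines on this common path.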
5318 5319 KF_TRACE(10, (" hot_team = %p\n", team)); 5320 5321 #if KMP_DEBUG 5322 if (__kmp_tasking_mode != tskm_immediate_exec) { 5323 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5324 "task_team[1] = %p after reinit\n", 5325 team->t.t_task_team[0], team->t.t_task_team[1])); 5326 } 5327 #endif 5328 5329 #if OMPT_SUPPORT 5330 __ompt_team_assign_id(team, ompt_parallel_data); 5331 #endif 5332 5333 KMP_MB(); 5334 5335 return team; 5336 } 5337 5338 /* next, let's try to take one from the team pool */ 5339 KMP_MB(); 5340 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5341 /* TODO: consider resizing undersized teams instead of reaping them, now 5342 that we have a resizing mechanism */ 5343 if (team->t.t_max_nproc >= max_nproc) { 5344 /* take this team from the team pool */ 5345 __kmp_team_pool = team->t.t_next_pool; 5346 5347 /* setup the team for fresh use */ 5348 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5349 5350 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5351 "task_team[1] %p to NULL\n", 5352 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5353 team->t.t_task_team[0] = NULL; 5354 team->t.t_task_team[1] = NULL; 5355 5356 /* reallocate space for arguments if necessary */ 5357 __kmp_alloc_argv_entries(argc, team, TRUE); 5358 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5359 5360 KA_TRACE( 5361 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5362 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5363 { // Initialize barrier data. 5364 int b; 5365 for (b = 0; b < bs_last_barrier; ++b) { 5366 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5367 #if USE_DEBUGGER 5368 team->t.t_bar[b].b_master_arrived = 0; 5369 team->t.t_bar[b].b_team_arrived = 0; 5370 #endif 5371 } 5372 } 5373 5374 team->t.t_proc_bind = new_proc_bind; 5375 5376 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5377 team->t.t_id)); 5378 5379 #if OMPT_SUPPORT 5380 __ompt_team_assign_id(team, ompt_parallel_data); 5381 #endif 5382 5383 KMP_MB(); 5384 5385 return team; 5386 } 5387 5388 /* reap team if it is too small, then loop back and check the next one */ 5389 // not sure if this is wise, but, will be redone during the hot-teams 5390 // rewrite. 5391 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5392 team = __kmp_reap_team(team); 5393 __kmp_team_pool = team; 5394 } 5395 5396 /* nothing available in the pool, no matter, make a new team! 
*/ 5397 KMP_MB(); 5398 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5399 5400 /* and set it up */ 5401 team->t.t_max_nproc = max_nproc; 5402 /* NOTE well, for some reason allocating one big buffer and dividing it up 5403 seems to really hurt performance a lot on the P4, so, let's not use this */ 5404 __kmp_allocate_team_arrays(team, max_nproc); 5405 5406 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5407 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5408 5409 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5410 "%p to NULL\n", 5411 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5412 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5413 // memory, no need to duplicate 5414 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5415 // memory, no need to duplicate 5416 5417 if (__kmp_storage_map) { 5418 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5419 } 5420 5421 /* allocate space for arguments */ 5422 __kmp_alloc_argv_entries(argc, team, FALSE); 5423 team->t.t_argc = argc; 5424 5425 KA_TRACE(20, 5426 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5427 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5428 { // Initialize barrier data. 5429 int b; 5430 for (b = 0; b < bs_last_barrier; ++b) { 5431 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5432 #if USE_DEBUGGER 5433 team->t.t_bar[b].b_master_arrived = 0; 5434 team->t.t_bar[b].b_team_arrived = 0; 5435 #endif 5436 } 5437 } 5438 5439 team->t.t_proc_bind = new_proc_bind; 5440 5441 #if OMPT_SUPPORT 5442 __ompt_team_assign_id(team, ompt_parallel_data); 5443 team->t.ompt_serialized_team_info = NULL; 5444 #endif 5445 5446 KMP_MB(); 5447 5448 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5449 team->t.t_id)); 5450 5451 return team; 5452 } 5453 5454 /* TODO implement hot-teams at all levels */ 5455 /* TODO implement lazy thread release on demand (disband request) */ 5456 5457 /* free the team. return it to the team pool. release all the threads 5458 * associated with it */ 5459 void __kmp_free_team(kmp_root_t *root, 5460 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5461 int f; 5462 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5463 team->t.t_id)); 5464 5465 /* verify state */ 5466 KMP_DEBUG_ASSERT(root); 5467 KMP_DEBUG_ASSERT(team); 5468 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5469 KMP_DEBUG_ASSERT(team->t.t_threads); 5470 5471 int use_hot_team = team == root->r.r_hot_team; 5472 #if KMP_NESTED_HOT_TEAMS 5473 int level; 5474 kmp_hot_team_ptr_t *hot_teams; 5475 if (master) { 5476 level = team->t.t_active_level - 1; 5477 if (master->th.th_teams_microtask) { // in teams construct? 
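// The level adjustments below mirror the computation in __kmp_allocate_team,
// so that freeing looks up the same hot_teams[level] slot that allocation
// used (see the KMP_DEBUG_ASSERT on hot_teams[level].hot_team below).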
5478 if (master->th.th_teams_size.nteams > 1) { 5479 ++level; // level was not increased in teams construct for 5480 // team_of_masters 5481 } 5482 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5483 master->th.th_teams_level == team->t.t_level) { 5484 ++level; // level was not increased in teams construct for 5485 // team_of_workers before the parallel 5486 } // team->t.t_level will be increased inside parallel 5487 } 5488 hot_teams = master->th.th_hot_teams; 5489 if (level < __kmp_hot_teams_max_level) { 5490 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5491 use_hot_team = 1; 5492 } 5493 } 5494 #endif // KMP_NESTED_HOT_TEAMS 5495 5496 /* team is done working */ 5497 TCW_SYNC_PTR(team->t.t_pkfn, 5498 NULL); // Important for Debugging Support Library. 5499 #if KMP_OS_WINDOWS 5500 team->t.t_copyin_counter = 0; // init counter for possible reuse 5501 #endif 5502 // Do not reset pointer to parent team to NULL for hot teams. 5503 5504 /* if we are non-hot team, release our threads */ 5505 if (!use_hot_team) { 5506 if (__kmp_tasking_mode != tskm_immediate_exec) { 5507 // Wait for threads to reach reapable state 5508 for (f = 1; f < team->t.t_nproc; ++f) { 5509 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5510 kmp_info_t *th = team->t.t_threads[f]; 5511 volatile kmp_uint32 *state = &th->th.th_reap_state; 5512 while (*state != KMP_SAFE_TO_REAP) { 5513 #if KMP_OS_WINDOWS 5514 // On Windows a thread can be killed at any time, check this 5515 DWORD ecode; 5516 if (!__kmp_is_thread_alive(th, &ecode)) { 5517 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5518 break; 5519 } 5520 #endif 5521 // first check if thread is sleeping 5522 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5523 if (fl.is_sleeping()) 5524 fl.resume(__kmp_gtid_from_thread(th)); 5525 KMP_CPU_PAUSE(); 5526 } 5527 } 5528 5529 // Delete task teams 5530 int tt_idx; 5531 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5532 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5533 if (task_team != NULL) { 5534 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5535 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5536 team->t.t_threads[f]->th.th_task_team = NULL; 5537 } 5538 KA_TRACE( 5539 20, 5540 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5541 __kmp_get_gtid(), task_team, team->t.t_id)); 5542 #if KMP_NESTED_HOT_TEAMS 5543 __kmp_free_task_team(master, task_team); 5544 #endif 5545 team->t.t_task_team[tt_idx] = NULL; 5546 } 5547 } 5548 } 5549 5550 // Reset pointer to parent team only for non-hot teams. 
5551 team->t.t_parent = NULL; 5552 team->t.t_level = 0; 5553 team->t.t_active_level = 0; 5554 5555 /* free the worker threads */ 5556 for (f = 1; f < team->t.t_nproc; ++f) { 5557 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5558 __kmp_free_thread(team->t.t_threads[f]); 5559 team->t.t_threads[f] = NULL; 5560 } 5561 5562 /* put the team back in the team pool */ 5563 /* TODO limit size of team pool, call reap_team if pool too large */ 5564 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5565 __kmp_team_pool = (volatile kmp_team_t *)team; 5566 } else { // Check if team was created for primary threads in teams construct 5567 // See if first worker is a CG root 5568 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5569 team->t.t_threads[1]->th.th_cg_roots); 5570 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5571 // Clean up the CG root nodes on workers so that this team can be re-used 5572 for (f = 1; f < team->t.t_nproc; ++f) { 5573 kmp_info_t *thr = team->t.t_threads[f]; 5574 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5575 thr->th.th_cg_roots->cg_root == thr); 5576 // Pop current CG root off list 5577 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5578 thr->th.th_cg_roots = tmp->up; 5579 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5580 " up to node %p. cg_nthreads was %d\n", 5581 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5582 int i = tmp->cg_nthreads--; 5583 if (i == 1) { 5584 __kmp_free(tmp); // free CG if we are the last thread in it 5585 } 5586 // Restore current task's thread_limit from CG root 5587 if (thr->th.th_cg_roots) 5588 thr->th.th_current_task->td_icvs.thread_limit = 5589 thr->th.th_cg_roots->cg_thread_limit; 5590 } 5591 } 5592 } 5593 5594 KMP_MB(); 5595 } 5596 5597 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5598 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5599 kmp_team_t *next_pool = team->t.t_next_pool; 5600 5601 KMP_DEBUG_ASSERT(team); 5602 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5603 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5604 KMP_DEBUG_ASSERT(team->t.t_threads); 5605 KMP_DEBUG_ASSERT(team->t.t_argv); 5606 5607 /* TODO clean the threads that are a part of this? */ 5608 5609 /* free stuff */ 5610 __kmp_free_team_arrays(team); 5611 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5612 __kmp_free((void *)team->t.t_argv); 5613 __kmp_free(team); 5614 5615 KMP_MB(); 5616 return next_pool; 5617 } 5618 5619 // Free the thread. Don't reap it, just place it on the pool of available 5620 // threads. 5621 // 5622 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5623 // binding for the affinity mechanism to be useful. 5624 // 5625 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5626 // However, we want to avoid a potential performance problem by always 5627 // scanning through the list to find the correct point at which to insert 5628 // the thread (potential N**2 behavior). To do this we keep track of the 5629 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5630 // With single-level parallelism, threads will always be added to the tail 5631 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5632 // parallelism, all bets are off and we may need to scan through the entire 5633 // free list. 5634 // 5635 // This change also has a potentially large performance benefit, for some 5636 // applications. 
Previously, as threads were freed from the hot team, they 5637 // would be placed back on the free list in inverse order. If the hot team 5638 // grew back to it's original size, then the freed thread would be placed 5639 // back on the hot team in reverse order. This could cause bad cache 5640 // locality problems on programs where the size of the hot team regularly 5641 // grew and shrunk. 5642 // 5643 // Now, for single-level parallelism, the OMP tid is always == gtid. 5644 void __kmp_free_thread(kmp_info_t *this_th) { 5645 int gtid; 5646 kmp_info_t **scan; 5647 5648 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5649 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5650 5651 KMP_DEBUG_ASSERT(this_th); 5652 5653 // When moving thread to pool, switch thread to wait on own b_go flag, and 5654 // uninitialized (NULL team). 5655 int b; 5656 kmp_balign_t *balign = this_th->th.th_bar; 5657 for (b = 0; b < bs_last_barrier; ++b) { 5658 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5659 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5660 balign[b].bb.team = NULL; 5661 balign[b].bb.leaf_kids = 0; 5662 } 5663 this_th->th.th_task_state = 0; 5664 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5665 5666 /* put thread back on the free pool */ 5667 TCW_PTR(this_th->th.th_team, NULL); 5668 TCW_PTR(this_th->th.th_root, NULL); 5669 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5670 5671 while (this_th->th.th_cg_roots) { 5672 this_th->th.th_cg_roots->cg_nthreads--; 5673 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5674 " %p of thread %p to %d\n", 5675 this_th, this_th->th.th_cg_roots, 5676 this_th->th.th_cg_roots->cg_root, 5677 this_th->th.th_cg_roots->cg_nthreads)); 5678 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5679 if (tmp->cg_root == this_th) { // Thread is a cg_root 5680 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5681 KA_TRACE( 5682 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5683 this_th->th.th_cg_roots = tmp->up; 5684 __kmp_free(tmp); 5685 } else { // Worker thread 5686 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5687 __kmp_free(tmp); 5688 } 5689 this_th->th.th_cg_roots = NULL; 5690 break; 5691 } 5692 } 5693 5694 /* If the implicit task assigned to this thread can be used by other threads 5695 * -> multiple threads can share the data and try to free the task at 5696 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5697 * with higher probability when hot team is disabled but can occurs even when 5698 * the hot team is enabled */ 5699 __kmp_free_implicit_task(this_th); 5700 this_th->th.th_current_task = NULL; 5701 5702 // If the __kmp_thread_pool_insert_pt is already past the new insert 5703 // point, then we need to re-scan the entire list. 5704 gtid = this_th->th.th_info.ds.ds_gtid; 5705 if (__kmp_thread_pool_insert_pt != NULL) { 5706 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5707 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5708 __kmp_thread_pool_insert_pt = NULL; 5709 } 5710 } 5711 5712 // Scan down the list to find the place to insert the thread. 5713 // scan is the address of a link in the list, possibly the address of 5714 // __kmp_thread_pool itself. 5715 // 5716 // In the absence of nested parallelism, the for loop will have 0 iterations. 
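// For example (hypothetical gtids): if the pool holds gtids 2 -> 3 -> 5 and
// __kmp_thread_pool_insert_pt points at 5, freeing gtid 7 starts the scan at
// 5's th_next_pool link and appends at the tail without walking the list;
// freeing gtid 4 instead clears the insert point above (since 5 > 4), rescans
// from the head, and links the new entry between 3 and 5.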
5717 if (__kmp_thread_pool_insert_pt != NULL) { 5718 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5719 } else { 5720 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5721 } 5722 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5723 scan = &((*scan)->th.th_next_pool)) 5724 ; 5725 5726 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5727 // to its address. 5728 TCW_PTR(this_th->th.th_next_pool, *scan); 5729 __kmp_thread_pool_insert_pt = *scan = this_th; 5730 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5731 (this_th->th.th_info.ds.ds_gtid < 5732 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5733 TCW_4(this_th->th.th_in_pool, TRUE); 5734 __kmp_suspend_initialize_thread(this_th); 5735 __kmp_lock_suspend_mx(this_th); 5736 if (this_th->th.th_active == TRUE) { 5737 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5738 this_th->th.th_active_in_pool = TRUE; 5739 } 5740 #if KMP_DEBUG 5741 else { 5742 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5743 } 5744 #endif 5745 __kmp_unlock_suspend_mx(this_th); 5746 5747 TCW_4(__kmp_nth, __kmp_nth - 1); 5748 5749 #ifdef KMP_ADJUST_BLOCKTIME 5750 /* Adjust blocktime back to user setting or default if necessary */ 5751 /* Middle initialization might never have occurred */ 5752 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5753 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5754 if (__kmp_nth <= __kmp_avail_proc) { 5755 __kmp_zero_bt = FALSE; 5756 } 5757 } 5758 #endif /* KMP_ADJUST_BLOCKTIME */ 5759 5760 KMP_MB(); 5761 } 5762 5763 /* ------------------------------------------------------------------------ */ 5764 5765 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5766 #if OMP_PROFILING_SUPPORT 5767 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5768 // TODO: add a configuration option for time granularity 5769 if (ProfileTraceFile) 5770 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5771 #endif 5772 5773 int gtid = this_thr->th.th_info.ds.ds_gtid; 5774 /* void *stack_data;*/ 5775 kmp_team_t **volatile pteam; 5776 5777 KMP_MB(); 5778 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5779 5780 if (__kmp_env_consistency_check) { 5781 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5782 } 5783 5784 #if OMPD_SUPPORT 5785 if (ompd_state & OMPD_ENABLE_BP) 5786 ompd_bp_thread_begin(); 5787 #endif 5788 5789 #if OMPT_SUPPORT 5790 ompt_data_t *thread_data = nullptr; 5791 if (ompt_enabled.enabled) { 5792 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5793 *thread_data = ompt_data_none; 5794 5795 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5796 this_thr->th.ompt_thread_info.wait_id = 0; 5797 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5798 this_thr->th.ompt_thread_info.parallel_flags = 0; 5799 if (ompt_enabled.ompt_callback_thread_begin) { 5800 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5801 ompt_thread_worker, thread_data); 5802 } 5803 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5804 } 5805 #endif 5806 5807 /* This is the place where threads wait for work */ 5808 while (!TCR_4(__kmp_global.g.g_done)) { 5809 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5810 KMP_MB(); 5811 5812 /* wait for work to do */ 5813 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5814 5815 /* No tid yet since not part of a team */ 5816 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5817 5818 #if OMPT_SUPPORT 5819 if (ompt_enabled.enabled) { 5820 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5821 } 5822 #endif 5823 5824 pteam = &this_thr->th.th_team; 5825 5826 /* have we been allocated? */ 5827 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5828 /* we were just woken up, so run our new task */ 5829 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5830 int rc; 5831 KA_TRACE(20, 5832 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5833 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5834 (*pteam)->t.t_pkfn)); 5835 5836 updateHWFPControl(*pteam); 5837 5838 #if OMPT_SUPPORT 5839 if (ompt_enabled.enabled) { 5840 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5841 } 5842 #endif 5843 5844 rc = (*pteam)->t.t_invoke(gtid); 5845 KMP_ASSERT(rc); 5846 5847 KMP_MB(); 5848 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5849 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5850 (*pteam)->t.t_pkfn)); 5851 } 5852 #if OMPT_SUPPORT 5853 if (ompt_enabled.enabled) { 5854 /* no frame set while outside task */ 5855 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5856 5857 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5858 } 5859 #endif 5860 /* join barrier after parallel region */ 5861 __kmp_join_barrier(gtid); 5862 } 5863 } 5864 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5865 5866 #if OMPD_SUPPORT 5867 if (ompd_state & OMPD_ENABLE_BP) 5868 ompd_bp_thread_end(); 5869 #endif 5870 5871 #if OMPT_SUPPORT 5872 if (ompt_enabled.ompt_callback_thread_end) { 5873 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5874 } 5875 #endif 5876 5877 this_thr->th.th_task_team = NULL; 5878 /* run the destructors for the threadprivate data for this thread */ 5879 __kmp_common_destroy_gtid(gtid); 5880 5881 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5882 KMP_MB(); 5883 5884 #if OMP_PROFILING_SUPPORT 5885 llvm::timeTraceProfilerFinishThread(); 5886 #endif 5887 return this_thr; 5888 } 5889 5890 /* ------------------------------------------------------------------------ */ 5891 5892 void __kmp_internal_end_dest(void *specific_gtid) { 5893 // Make sure no significant bits are lost 5894 int gtid; 5895 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5896 5897 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 5898 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5899 * this is because 0 is reserved for the nothing-stored case */ 5900 5901 __kmp_internal_end_thread(gtid); 5902 } 5903 5904 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5905 5906 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5907 __kmp_internal_end_atexit(); 5908 } 5909 5910 #endif 5911 5912 /* [Windows] josh: when the atexit handler is called, there may still be more 5913 than one thread alive */ 5914 void __kmp_internal_end_atexit(void) { 5915 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5916 /* [Windows] 5917 josh: ideally, we want to completely shutdown the library in this atexit 5918 handler, but stat code that depends on thread specific data for gtid fails 5919 because that data becomes unavailable at some point during the shutdown, so 5920 we call __kmp_internal_end_thread instead. We should eventually remove the 5921 dependency on __kmp_get_specific_gtid in the stat code and use 5922 __kmp_internal_end_library to cleanly shutdown the library. 5923 5924 // TODO: Can some of this comment about GVS be removed? 5925 I suspect that the offending stat code is executed when the calling thread 5926 tries to clean up a dead root thread's data structures, resulting in GVS 5927 code trying to close the GVS structures for that thread, but since the stat 5928 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5929 the calling thread is cleaning up itself instead of another thread, it get 5930 confused. This happens because allowing a thread to unregister and cleanup 5931 another thread is a recent modification for addressing an issue. 5932 Based on the current design (20050722), a thread may end up 5933 trying to unregister another thread only if thread death does not trigger 5934 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5935 thread specific data destructor function to detect thread death. For 5936 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5937 is nothing. Thus, the workaround is applicable only for Windows static 5938 stat library. */ 5939 __kmp_internal_end_library(-1); 5940 #if KMP_OS_WINDOWS 5941 __kmp_close_console(); 5942 #endif 5943 } 5944 5945 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5946 // It is assumed __kmp_forkjoin_lock is acquired. 5947 5948 int gtid; 5949 5950 KMP_DEBUG_ASSERT(thread != NULL); 5951 5952 gtid = thread->th.th_info.ds.ds_gtid; 5953 5954 if (!is_root) { 5955 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5956 /* Assume the threads are at the fork barrier here */ 5957 KA_TRACE( 5958 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5959 gtid)); 5960 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5961 * (GEH) */ 5962 ANNOTATE_HAPPENS_BEFORE(thread); 5963 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5964 thread); 5965 __kmp_release_64(&flag); 5966 } 5967 5968 // Terminate OS thread. 5969 __kmp_reap_worker(thread); 5970 5971 // The thread was killed asynchronously. If it was actively 5972 // spinning in the thread pool, decrement the global count. 5973 // 5974 // There is a small timing hole here - if the worker thread was just waking 5975 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5976 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5977 // the global counter might not get updated. 
5978 // 5979 // Currently, this can only happen as the library is unloaded, 5980 // so there are no harmful side effects. 5981 if (thread->th.th_active_in_pool) { 5982 thread->th.th_active_in_pool = FALSE; 5983 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5984 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5985 } 5986 } 5987 5988 __kmp_free_implicit_task(thread); 5989 5990 // Free the fast memory for tasking 5991 #if USE_FAST_MEMORY 5992 __kmp_free_fast_memory(thread); 5993 #endif /* USE_FAST_MEMORY */ 5994 5995 __kmp_suspend_uninitialize_thread(thread); 5996 5997 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5998 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5999 6000 --__kmp_all_nth; 6001 // __kmp_nth was decremented when thread is added to the pool. 6002 6003 #ifdef KMP_ADJUST_BLOCKTIME 6004 /* Adjust blocktime back to user setting or default if necessary */ 6005 /* Middle initialization might never have occurred */ 6006 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6007 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6008 if (__kmp_nth <= __kmp_avail_proc) { 6009 __kmp_zero_bt = FALSE; 6010 } 6011 } 6012 #endif /* KMP_ADJUST_BLOCKTIME */ 6013 6014 /* free the memory being used */ 6015 if (__kmp_env_consistency_check) { 6016 if (thread->th.th_cons) { 6017 __kmp_free_cons_stack(thread->th.th_cons); 6018 thread->th.th_cons = NULL; 6019 } 6020 } 6021 6022 if (thread->th.th_pri_common != NULL) { 6023 __kmp_free(thread->th.th_pri_common); 6024 thread->th.th_pri_common = NULL; 6025 } 6026 6027 if (thread->th.th_task_state_memo_stack != NULL) { 6028 __kmp_free(thread->th.th_task_state_memo_stack); 6029 thread->th.th_task_state_memo_stack = NULL; 6030 } 6031 6032 #if KMP_USE_BGET 6033 if (thread->th.th_local.bget_data != NULL) { 6034 __kmp_finalize_bget(thread); 6035 } 6036 #endif 6037 6038 #if KMP_AFFINITY_SUPPORTED 6039 if (thread->th.th_affin_mask != NULL) { 6040 KMP_CPU_FREE(thread->th.th_affin_mask); 6041 thread->th.th_affin_mask = NULL; 6042 } 6043 #endif /* KMP_AFFINITY_SUPPORTED */ 6044 6045 #if KMP_USE_HIER_SCHED 6046 if (thread->th.th_hier_bar_data != NULL) { 6047 __kmp_free(thread->th.th_hier_bar_data); 6048 thread->th.th_hier_bar_data = NULL; 6049 } 6050 #endif 6051 6052 __kmp_reap_team(thread->th.th_serial_team); 6053 thread->th.th_serial_team = NULL; 6054 __kmp_free(thread); 6055 6056 KMP_MB(); 6057 6058 } // __kmp_reap_thread 6059 6060 static void __kmp_internal_end(void) { 6061 int i; 6062 6063 /* First, unregister the library */ 6064 __kmp_unregister_library(); 6065 6066 #if KMP_OS_WINDOWS 6067 /* In Win static library, we can't tell when a root actually dies, so we 6068 reclaim the data structures for any root threads that have died but not 6069 unregistered themselves, in order to shut down cleanly. 6070 In Win dynamic library we also can't tell when a thread dies. */ 6071 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6072 // dead roots 6073 #endif 6074 6075 for (i = 0; i < __kmp_threads_capacity; i++) 6076 if (__kmp_root[i]) 6077 if (__kmp_root[i]->r.r_active) 6078 break; 6079 KMP_MB(); /* Flush all pending memory write invalidates. */ 6080 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6081 6082 if (i < __kmp_threads_capacity) { 6083 #if KMP_USE_MONITOR 6084 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6085 KMP_MB(); /* Flush all pending memory write invalidates. */ 6086 6087 // Need to check that monitor was initialized before reaping it. 
If we are 6088 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6089 // __kmp_monitor will appear to contain valid data, but it is only valid in 6090 // the parent process, not the child. 6091 // New behavior (201008): instead of keying off of the flag 6092 // __kmp_init_parallel, the monitor thread creation is keyed off 6093 // of the new flag __kmp_init_monitor. 6094 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6095 if (TCR_4(__kmp_init_monitor)) { 6096 __kmp_reap_monitor(&__kmp_monitor); 6097 TCW_4(__kmp_init_monitor, 0); 6098 } 6099 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6100 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6101 #endif // KMP_USE_MONITOR 6102 } else { 6103 /* TODO move this to cleanup code */ 6104 #ifdef KMP_DEBUG 6105 /* make sure that everything has properly ended */ 6106 for (i = 0; i < __kmp_threads_capacity; i++) { 6107 if (__kmp_root[i]) { 6108 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6109 // there can be uber threads alive here 6110 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6111 } 6112 } 6113 #endif 6114 6115 KMP_MB(); 6116 6117 // Reap the worker threads. 6118 // This is valid for now, but be careful if threads are reaped sooner. 6119 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6120 // Get the next thread from the pool. 6121 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6122 __kmp_thread_pool = thread->th.th_next_pool; 6123 // Reap it. 6124 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6125 thread->th.th_next_pool = NULL; 6126 thread->th.th_in_pool = FALSE; 6127 __kmp_reap_thread(thread, 0); 6128 } 6129 __kmp_thread_pool_insert_pt = NULL; 6130 6131 // Reap teams. 6132 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6133 // Get the next team from the pool. 6134 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6135 __kmp_team_pool = team->t.t_next_pool; 6136 // Reap it. 6137 team->t.t_next_pool = NULL; 6138 __kmp_reap_team(team); 6139 } 6140 6141 __kmp_reap_task_teams(); 6142 6143 #if KMP_OS_UNIX 6144 // Threads that are not reaped should not access any resources since they 6145 // are going to be deallocated soon, so the shutdown sequence should wait 6146 // until all threads either exit the final spin-waiting loop or begin 6147 // sleeping after the given blocktime. 6148 for (i = 0; i < __kmp_threads_capacity; i++) { 6149 kmp_info_t *thr = __kmp_threads[i]; 6150 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6151 KMP_CPU_PAUSE(); 6152 } 6153 #endif 6154 6155 for (i = 0; i < __kmp_threads_capacity; ++i) { 6156 // TBD: Add some checking... 6157 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6158 } 6159 6160 /* Make sure all threadprivate destructors get run by joining with all 6161 worker threads before resetting this flag */ 6162 TCW_SYNC_4(__kmp_init_common, FALSE); 6163 6164 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6165 KMP_MB(); 6166 6167 #if KMP_USE_MONITOR 6168 // See note above: One of the possible fixes for CQ138434 / CQ140126 6169 // 6170 // FIXME: push both code fragments down and CSE them? 6171 // push them into __kmp_cleanup() ? 
6172 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6173 if (TCR_4(__kmp_init_monitor)) { 6174 __kmp_reap_monitor(&__kmp_monitor); 6175 TCW_4(__kmp_init_monitor, 0); 6176 } 6177 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6178 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6179 #endif 6180 } /* else !__kmp_global.t_active */ 6181 TCW_4(__kmp_init_gtid, FALSE); 6182 KMP_MB(); /* Flush all pending memory write invalidates. */ 6183 6184 __kmp_cleanup(); 6185 #if OMPT_SUPPORT 6186 ompt_fini(); 6187 #endif 6188 } 6189 6190 void __kmp_internal_end_library(int gtid_req) { 6191 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6192 /* this shouldn't be a race condition because __kmp_internal_end() is the 6193 only place to clear __kmp_serial_init */ 6194 /* we'll check this later too, after we get the lock */ 6195 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6196 // redundant, because the next check will work in any case. 6197 if (__kmp_global.g.g_abort) { 6198 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6199 /* TODO abort? */ 6200 return; 6201 } 6202 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6203 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6204 return; 6205 } 6206 6207 KMP_MB(); /* Flush all pending memory write invalidates. */ 6208 /* find out who we are and what we should do */ 6209 { 6210 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6211 KA_TRACE( 6212 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6213 if (gtid == KMP_GTID_SHUTDOWN) { 6214 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6215 "already shutdown\n")); 6216 return; 6217 } else if (gtid == KMP_GTID_MONITOR) { 6218 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6219 "registered, or system shutdown\n")); 6220 return; 6221 } else if (gtid == KMP_GTID_DNE) { 6222 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6223 "shutdown\n")); 6224 /* we don't know who we are, but we may still shutdown the library */ 6225 } else if (KMP_UBER_GTID(gtid)) { 6226 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6227 if (__kmp_root[gtid]->r.r_active) { 6228 __kmp_global.g.g_abort = -1; 6229 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6230 __kmp_unregister_library(); 6231 KA_TRACE(10, 6232 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6233 gtid)); 6234 return; 6235 } else { 6236 KA_TRACE( 6237 10, 6238 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6239 __kmp_unregister_root_current_thread(gtid); 6240 } 6241 } else { 6242 /* worker threads may call this function through the atexit handler, if they 6243 * call exit() */ 6244 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6245 TODO: do a thorough shutdown instead */ 6246 #ifdef DUMP_DEBUG_ON_EXIT 6247 if (__kmp_debug_buf) 6248 __kmp_dump_debug_buffer(); 6249 #endif 6250 // added unregister library call here when we switch to shm linux 6251 // if we don't, it will leave lots of files in /dev/shm 6252 // cleanup shared memory file before exiting. 
6253 __kmp_unregister_library(); 6254 return; 6255 } 6256 } 6257 /* synchronize the termination process */ 6258 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6259 6260 /* have we already finished */ 6261 if (__kmp_global.g.g_abort) { 6262 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6263 /* TODO abort? */ 6264 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6265 return; 6266 } 6267 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6268 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6269 return; 6270 } 6271 6272 /* We need this lock to enforce mutex between this reading of 6273 __kmp_threads_capacity and the writing by __kmp_register_root. 6274 Alternatively, we can use a counter of roots that is atomically updated by 6275 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6276 __kmp_internal_end_*. */ 6277 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6278 6279 /* now we can safely conduct the actual termination */ 6280 __kmp_internal_end(); 6281 6282 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6283 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6284 6285 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6286 6287 #ifdef DUMP_DEBUG_ON_EXIT 6288 if (__kmp_debug_buf) 6289 __kmp_dump_debug_buffer(); 6290 #endif 6291 6292 #if KMP_OS_WINDOWS 6293 __kmp_close_console(); 6294 #endif 6295 6296 __kmp_fini_allocator(); 6297 6298 } // __kmp_internal_end_library 6299 6300 void __kmp_internal_end_thread(int gtid_req) { 6301 int i; 6302 6303 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6304 /* this shouldn't be a race condition because __kmp_internal_end() is the 6305 * only place to clear __kmp_serial_init */ 6306 /* we'll check this later too, after we get the lock */ 6307 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6308 // redundant, because the next check will work in any case. 6309 if (__kmp_global.g.g_abort) { 6310 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6311 /* TODO abort? */ 6312 return; 6313 } 6314 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6315 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6316 return; 6317 } 6318 6319 // If hidden helper team has been initialized, we need to deinit it 6320 if (TCR_4(__kmp_init_hidden_helper)) { 6321 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6322 // First release the main thread to let it continue its work 6323 __kmp_hidden_helper_main_thread_release(); 6324 // Wait until the hidden helper team has been destroyed 6325 __kmp_hidden_helper_threads_deinitz_wait(); 6326 } 6327 6328 KMP_MB(); /* Flush all pending memory write invalidates. */ 6329 6330 /* find out who we are and what we should do */ 6331 { 6332 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6333 KA_TRACE(10, 6334 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6335 if (gtid == KMP_GTID_SHUTDOWN) { 6336 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6337 "already shutdown\n")); 6338 return; 6339 } else if (gtid == KMP_GTID_MONITOR) { 6340 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6341 "registered, or system shutdown\n")); 6342 return; 6343 } else if (gtid == KMP_GTID_DNE) { 6344 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6345 "shutdown\n")); 6346 return; 6347 /* we don't know who we are */ 6348 } else if (KMP_UBER_GTID(gtid)) { 6349 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6350 if (__kmp_root[gtid]->r.r_active) { 6351 __kmp_global.g.g_abort = -1; 6352 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6353 KA_TRACE(10, 6354 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6355 gtid)); 6356 return; 6357 } else { 6358 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6359 gtid)); 6360 __kmp_unregister_root_current_thread(gtid); 6361 } 6362 } else { 6363 /* just a worker thread, let's leave */ 6364 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6365 6366 if (gtid >= 0) { 6367 __kmp_threads[gtid]->th.th_task_team = NULL; 6368 } 6369 6370 KA_TRACE(10, 6371 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6372 gtid)); 6373 return; 6374 } 6375 } 6376 #if KMP_DYNAMIC_LIB 6377 if (__kmp_pause_status != kmp_hard_paused) 6378 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6379 // because we will better shutdown later in the library destructor. 6380 { 6381 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6382 return; 6383 } 6384 #endif 6385 /* synchronize the termination process */ 6386 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6387 6388 /* have we already finished */ 6389 if (__kmp_global.g.g_abort) { 6390 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6391 /* TODO abort? */ 6392 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6393 return; 6394 } 6395 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6396 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6397 return; 6398 } 6399 6400 /* We need this lock to enforce mutex between this reading of 6401 __kmp_threads_capacity and the writing by __kmp_register_root. 6402 Alternatively, we can use a counter of roots that is atomically updated by 6403 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6404 __kmp_internal_end_*. */ 6405 6406 /* should we finish the run-time? are all siblings done? */ 6407 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6408 6409 for (i = 0; i < __kmp_threads_capacity; ++i) { 6410 if (KMP_UBER_GTID(i)) { 6411 KA_TRACE( 6412 10, 6413 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6414 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6415 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6416 return; 6417 } 6418 } 6419 6420 /* now we can safely conduct the actual termination */ 6421 6422 __kmp_internal_end(); 6423 6424 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6425 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6426 6427 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6428 6429 #ifdef DUMP_DEBUG_ON_EXIT 6430 if (__kmp_debug_buf) 6431 __kmp_dump_debug_buffer(); 6432 #endif 6433 } // __kmp_internal_end_thread 6434 6435 // ----------------------------------------------------------------------------- 6436 // Library registration stuff. 6437 6438 static long __kmp_registration_flag = 0; 6439 // Random value used to indicate library initialization. 6440 static char *__kmp_registration_str = NULL; 6441 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6442 6443 static inline char *__kmp_reg_status_name() { 6444 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6445 each thread. If registration and unregistration go in different threads 6446 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6447 env var can not be found, because the name will contain different pid. 
*/ 6448 // macOS* complains about name being too long with additional getuid() 6449 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6450 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6451 (int)getuid()); 6452 #else 6453 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6454 #endif 6455 } // __kmp_reg_status_get 6456 6457 void __kmp_register_library_startup(void) { 6458 6459 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6460 int done = 0; 6461 union { 6462 double dtime; 6463 long ltime; 6464 } time; 6465 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6466 __kmp_initialize_system_tick(); 6467 #endif 6468 __kmp_read_system_time(&time.dtime); 6469 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6470 __kmp_registration_str = 6471 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6472 __kmp_registration_flag, KMP_LIBRARY_FILE); 6473 6474 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6475 __kmp_registration_str)); 6476 6477 while (!done) { 6478 6479 char *value = NULL; // Actual value of the environment variable. 6480 6481 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6482 char *shm_name = __kmp_str_format("/%s", name); 6483 int shm_preexist = 0; 6484 char *data1; 6485 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6486 if ((fd1 == -1) && (errno == EEXIST)) { 6487 // file didn't open because it already exists. 6488 // try opening existing file 6489 fd1 = shm_open(shm_name, O_RDWR, 0666); 6490 if (fd1 == -1) { // file didn't open 6491 // error out here 6492 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6493 __kmp_msg_null); 6494 } else { 6495 // able to open existing file 6496 shm_preexist = 1; 6497 } 6498 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6499 // already exists. 6500 // error out here. 6501 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6502 __kmp_msg_null); 6503 } 6504 if (shm_preexist == 0) { 6505 // we created SHM now set size 6506 if (ftruncate(fd1, SHM_SIZE) == -1) { 6507 // error occured setting size; 6508 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6509 KMP_ERR(errno), __kmp_msg_null); 6510 } 6511 } 6512 data1 = 6513 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6514 if (data1 == MAP_FAILED) { 6515 // failed to map shared memory 6516 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6517 __kmp_msg_null); 6518 } 6519 if (shm_preexist == 0) { // set data to SHM, set value 6520 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6521 } 6522 // Read value from either what we just wrote or existing file. 6523 value = __kmp_str_format("%s", data1); // read value from SHM 6524 munmap(data1, SHM_SIZE); 6525 close(fd1); 6526 #else // Windows and unix with static library 6527 // Set environment variable, but do not overwrite if it is exist. 6528 __kmp_env_set(name, __kmp_registration_str, 0); 6529 // read value to see if it got set 6530 value = __kmp_env_get(name); 6531 #endif 6532 6533 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6534 done = 1; // Ok, environment variable set successfully, exit the loop. 6535 } else { 6536 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6537 // Check whether it alive or dead. 6538 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 
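      // The stored registration string has the form "%p-%lx-%s": the address
      // of that runtime's __kmp_registration_flag, the flag value in hex, and
      // its library file name, e.g. "0x7f1234560000-cafe1234-libomp.so"
      // (hypothetical values). Split on '-' to recover the three fields.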
6539 char *tail = value; 6540 char *flag_addr_str = NULL; 6541 char *flag_val_str = NULL; 6542 char const *file_name = NULL; 6543 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6544 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6545 file_name = tail; 6546 if (tail != NULL) { 6547 long *flag_addr = 0; 6548 unsigned long flag_val = 0; 6549 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6550 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6551 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6552 // First, check whether environment-encoded address is mapped into 6553 // addr space. 6554 // If so, dereference it to see if it still has the right value. 6555 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6556 neighbor = 1; 6557 } else { 6558 // If not, then we know the other copy of the library is no longer 6559 // running. 6560 neighbor = 2; 6561 } 6562 } 6563 } 6564 switch (neighbor) { 6565 case 0: // Cannot parse environment variable -- neighbor status unknown. 6566 // Assume it is the incompatible format of future version of the 6567 // library. Assume the other library is alive. 6568 // WARN( ... ); // TODO: Issue a warning. 6569 file_name = "unknown library"; 6570 KMP_FALLTHROUGH(); 6571 // Attention! Falling to the next case. That's intentional. 6572 case 1: { // Neighbor is alive. 6573 // Check it is allowed. 6574 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6575 if (!__kmp_str_match_true(duplicate_ok)) { 6576 // That's not allowed. Issue fatal error. 6577 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6578 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6579 } 6580 KMP_INTERNAL_FREE(duplicate_ok); 6581 __kmp_duplicate_library_ok = 1; 6582 done = 1; // Exit the loop. 6583 } break; 6584 case 2: { // Neighbor is dead. 6585 6586 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6587 // close shared memory. 6588 shm_unlink(shm_name); // this removes file in /dev/shm 6589 #else 6590 // Clear the variable and try to register library again. 6591 __kmp_env_unset(name); 6592 #endif 6593 } break; 6594 default: { 6595 KMP_DEBUG_ASSERT(0); 6596 } break; 6597 } 6598 } 6599 KMP_INTERNAL_FREE((void *)value); 6600 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6601 KMP_INTERNAL_FREE((void *)shm_name); 6602 #endif 6603 } // while 6604 KMP_INTERNAL_FREE((void *)name); 6605 6606 } // func __kmp_register_library_startup 6607 6608 void __kmp_unregister_library(void) { 6609 6610 char *name = __kmp_reg_status_name(); 6611 char *value = NULL; 6612 6613 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6614 char *shm_name = __kmp_str_format("/%s", name); 6615 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6616 if (fd1 == -1) { 6617 // file did not open. return. 6618 return; 6619 } 6620 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6621 if (data1 != MAP_FAILED) { 6622 value = __kmp_str_format("%s", data1); // read value from SHM 6623 munmap(data1, SHM_SIZE); 6624 } 6625 close(fd1); 6626 #else 6627 value = __kmp_env_get(name); 6628 #endif 6629 6630 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6631 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6632 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6633 // Ok, this is our variable. Delete it. 
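    // (A value that did not match would belong to another running copy of the
    // runtime, so the registration record is removed only when it is ours.)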
6634 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6635 shm_unlink(shm_name); // this removes file in /dev/shm 6636 #else 6637 __kmp_env_unset(name); 6638 #endif 6639 } 6640 6641 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6642 KMP_INTERNAL_FREE(shm_name); 6643 #endif 6644 6645 KMP_INTERNAL_FREE(__kmp_registration_str); 6646 KMP_INTERNAL_FREE(value); 6647 KMP_INTERNAL_FREE(name); 6648 6649 __kmp_registration_flag = 0; 6650 __kmp_registration_str = NULL; 6651 6652 } // __kmp_unregister_library 6653 6654 // End of Library registration stuff. 6655 // ----------------------------------------------------------------------------- 6656 6657 #if KMP_MIC_SUPPORTED 6658 6659 static void __kmp_check_mic_type() { 6660 kmp_cpuid_t cpuid_state = {0}; 6661 kmp_cpuid_t *cs_p = &cpuid_state; 6662 __kmp_x86_cpuid(1, 0, cs_p); 6663 // We don't support mic1 at the moment 6664 if ((cs_p->eax & 0xff0) == 0xB10) { 6665 __kmp_mic_type = mic2; 6666 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6667 __kmp_mic_type = mic3; 6668 } else { 6669 __kmp_mic_type = non_mic; 6670 } 6671 } 6672 6673 #endif /* KMP_MIC_SUPPORTED */ 6674 6675 #if KMP_HAVE_UMWAIT 6676 static void __kmp_user_level_mwait_init() { 6677 struct kmp_cpuid buf; 6678 __kmp_x86_cpuid(7, 0, &buf); 6679 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6680 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6681 __kmp_umwait_enabled)); 6682 } 6683 #elif KMP_HAVE_MWAIT 6684 #ifndef AT_INTELPHIUSERMWAIT 6685 // Spurious, non-existent value that should always fail to return anything. 6686 // Will be replaced with the correct value when we know that. 6687 #define AT_INTELPHIUSERMWAIT 10000 6688 #endif 6689 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6690 // earlier OS is used to build the RTL, we'll use the following internal 6691 // function when the entry is not found. 6692 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6693 unsigned long getauxval(unsigned long) { return 0; } 6694 6695 static void __kmp_user_level_mwait_init() { 6696 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6697 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6698 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6699 // KMP_USER_LEVEL_MWAIT was set to TRUE. 
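  // Bit 0 of the getauxval(AT_INTELPHIUSERMWAIT) result is taken as the
  // "user-level mwait available" flag; KMP_USER_LEVEL_MWAIT=TRUE forces the
  // feature on even when that bit is not set.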
6700 if (__kmp_mic_type == mic3) { 6701 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6702 if ((res & 0x1) || __kmp_user_level_mwait) { 6703 __kmp_mwait_enabled = TRUE; 6704 if (__kmp_user_level_mwait) { 6705 KMP_INFORM(EnvMwaitWarn); 6706 } 6707 } else { 6708 __kmp_mwait_enabled = FALSE; 6709 } 6710 } 6711 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6712 "__kmp_mwait_enabled = %d\n", 6713 __kmp_mic_type, __kmp_mwait_enabled)); 6714 } 6715 #endif /* KMP_HAVE_UMWAIT */ 6716 6717 static void __kmp_do_serial_initialize(void) { 6718 int i, gtid; 6719 size_t size; 6720 6721 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6722 6723 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6724 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6725 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6726 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6727 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6728 6729 #if OMPT_SUPPORT 6730 ompt_pre_init(); 6731 #endif 6732 #if OMPD_SUPPORT 6733 __kmp_env_dump(); 6734 ompd_init(); 6735 #endif 6736 6737 __kmp_validate_locks(); 6738 6739 /* Initialize internal memory allocator */ 6740 __kmp_init_allocator(); 6741 6742 /* Register the library startup via an environment variable and check to see 6743 whether another copy of the library is already registered. */ 6744 6745 __kmp_register_library_startup(); 6746 6747 /* TODO reinitialization of library */ 6748 if (TCR_4(__kmp_global.g.g_done)) { 6749 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6750 } 6751 6752 __kmp_global.g.g_abort = 0; 6753 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6754 6755 /* initialize the locks */ 6756 #if KMP_USE_ADAPTIVE_LOCKS 6757 #if KMP_DEBUG_ADAPTIVE_LOCKS 6758 __kmp_init_speculative_stats(); 6759 #endif 6760 #endif 6761 #if KMP_STATS_ENABLED 6762 __kmp_stats_init(); 6763 #endif 6764 __kmp_init_lock(&__kmp_global_lock); 6765 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6766 __kmp_init_lock(&__kmp_debug_lock); 6767 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6768 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6769 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6770 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6771 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6772 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6773 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6774 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6775 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6776 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6777 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6778 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6779 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6780 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6781 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6782 #if KMP_USE_MONITOR 6783 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6784 #endif 6785 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6786 6787 /* conduct initialization and initial setup of configuration */ 6788 6789 __kmp_runtime_initialize(); 6790 6791 #if KMP_MIC_SUPPORTED 6792 __kmp_check_mic_type(); 6793 #endif 6794 6795 // Some global variable initialization moved here from kmp_env_initialize() 6796 #ifdef KMP_DEBUG 6797 kmp_diag = 0; 6798 #endif 6799 __kmp_abort_delay = 0; 6800 6801 // From __kmp_init_dflt_team_nth() 6802 /* assume the entire machine will be used */ 6803 __kmp_dflt_team_nth_ub = __kmp_xproc; 6804 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6805 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6806 } 6807 if (__kmp_dflt_team_nth_ub > 
__kmp_sys_max_nth) { 6808 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6809 } 6810 __kmp_max_nth = __kmp_sys_max_nth; 6811 __kmp_cg_max_nth = __kmp_sys_max_nth; 6812 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6813 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6814 __kmp_teams_max_nth = __kmp_sys_max_nth; 6815 } 6816 6817 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6818 // part 6819 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6820 #if KMP_USE_MONITOR 6821 __kmp_monitor_wakeups = 6822 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6823 __kmp_bt_intervals = 6824 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6825 #endif 6826 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6827 __kmp_library = library_throughput; 6828 // From KMP_SCHEDULE initialization 6829 __kmp_static = kmp_sch_static_balanced; 6830 // AC: do not use analytical here, because it is non-monotonous 6831 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6832 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6833 // need to repeat assignment 6834 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6835 // bit control and barrier method control parts 6836 #if KMP_FAST_REDUCTION_BARRIER 6837 #define kmp_reduction_barrier_gather_bb ((int)1) 6838 #define kmp_reduction_barrier_release_bb ((int)1) 6839 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6840 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6841 #endif // KMP_FAST_REDUCTION_BARRIER 6842 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6843 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6844 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6845 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6846 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6847 #if KMP_FAST_REDUCTION_BARRIER 6848 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6849 // lin_64 ): hyper,1 6850 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6851 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6852 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6853 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6854 } 6855 #endif // KMP_FAST_REDUCTION_BARRIER 6856 } 6857 #if KMP_FAST_REDUCTION_BARRIER 6858 #undef kmp_reduction_barrier_release_pat 6859 #undef kmp_reduction_barrier_gather_pat 6860 #undef kmp_reduction_barrier_release_bb 6861 #undef kmp_reduction_barrier_gather_bb 6862 #endif // KMP_FAST_REDUCTION_BARRIER 6863 #if KMP_MIC_SUPPORTED 6864 if (__kmp_mic_type == mic2) { // KNC 6865 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6866 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6867 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6868 1; // forkjoin release 6869 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6870 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6871 } 6872 #if KMP_FAST_REDUCTION_BARRIER 6873 if (__kmp_mic_type == mic2) { // KNC 6874 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6875 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6876 } 6877 #endif // KMP_FAST_REDUCTION_BARRIER 6878 #endif // KMP_MIC_SUPPORTED 6879 6880 // From KMP_CHECKS initialization 6881 
#ifdef KMP_DEBUG 6882 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6883 #else 6884 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6885 #endif 6886 6887 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6888 __kmp_foreign_tp = TRUE; 6889 6890 __kmp_global.g.g_dynamic = FALSE; 6891 __kmp_global.g.g_dynamic_mode = dynamic_default; 6892 6893 __kmp_init_nesting_mode(); 6894 6895 __kmp_env_initialize(NULL); 6896 6897 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6898 __kmp_user_level_mwait_init(); 6899 #endif 6900 // Print all messages in message catalog for testing purposes. 6901 #ifdef KMP_DEBUG 6902 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6903 if (__kmp_str_match_true(val)) { 6904 kmp_str_buf_t buffer; 6905 __kmp_str_buf_init(&buffer); 6906 __kmp_i18n_dump_catalog(&buffer); 6907 __kmp_printf("%s", buffer.str); 6908 __kmp_str_buf_free(&buffer); 6909 } 6910 __kmp_env_free(&val); 6911 #endif 6912 6913 __kmp_threads_capacity = 6914 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6915 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6916 __kmp_tp_capacity = __kmp_default_tp_capacity( 6917 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6918 6919 // If the library is shut down properly, both pools must be NULL. Just in 6920 // case, set them to NULL -- some memory may leak, but subsequent code will 6921 // work even if pools are not freed. 6922 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6923 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6924 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6925 __kmp_thread_pool = NULL; 6926 __kmp_thread_pool_insert_pt = NULL; 6927 __kmp_team_pool = NULL; 6928 6929 /* Allocate all of the variable sized records */ 6930 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6931 * expandable */ 6932 /* Since allocation is cache-aligned, just add extra padding at the end */ 6933 size = 6934 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6935 CACHE_LINE; 6936 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6937 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6938 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6939 6940 /* init thread counts */ 6941 KMP_DEBUG_ASSERT(__kmp_all_nth == 6942 0); // Asserts fail if the library is reinitializing and 6943 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6944 __kmp_all_nth = 0; 6945 __kmp_nth = 0; 6946 6947 /* setup the uber master thread and hierarchy */ 6948 gtid = __kmp_register_root(TRUE); 6949 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6950 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6951 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6952 6953 KMP_MB(); /* Flush all pending memory write invalidates. */ 6954 6955 __kmp_common_initialize(); 6956 6957 #if KMP_OS_UNIX 6958 /* invoke the child fork handler */ 6959 __kmp_register_atfork(); 6960 #endif 6961 6962 #if !KMP_DYNAMIC_LIB 6963 { 6964 /* Invoke the exit handler when the program finishes, only for static 6965 library. For dynamic library, we already have _fini and DllMain. */ 6966 int rc = atexit(__kmp_internal_end_atexit); 6967 if (rc != 0) { 6968 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6969 __kmp_msg_null); 6970 } 6971 } 6972 #endif 6973 6974 #if KMP_HANDLE_SIGNALS 6975 #if KMP_OS_UNIX 6976 /* NOTE: make sure that this is called before the user installs their own 6977 signal handlers so that the user handlers are called first. 
this way they 6978 can return false, not call our handler, avoid terminating the library, and 6979 continue execution where they left off. */ 6980 __kmp_install_signals(FALSE); 6981 #endif /* KMP_OS_UNIX */ 6982 #if KMP_OS_WINDOWS 6983 __kmp_install_signals(TRUE); 6984 #endif /* KMP_OS_WINDOWS */ 6985 #endif 6986 6987 /* we have finished the serial initialization */ 6988 __kmp_init_counter++; 6989 6990 __kmp_init_serial = TRUE; 6991 6992 if (__kmp_settings) { 6993 __kmp_env_print(); 6994 } 6995 6996 if (__kmp_display_env || __kmp_display_env_verbose) { 6997 __kmp_env_print_2(); 6998 } 6999 7000 #if OMPT_SUPPORT 7001 ompt_post_init(); 7002 #endif 7003 7004 KMP_MB(); 7005 7006 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7007 } 7008 7009 void __kmp_serial_initialize(void) { 7010 if (__kmp_init_serial) { 7011 return; 7012 } 7013 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7014 if (__kmp_init_serial) { 7015 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7016 return; 7017 } 7018 __kmp_do_serial_initialize(); 7019 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7020 } 7021 7022 static void __kmp_do_middle_initialize(void) { 7023 int i, j; 7024 int prev_dflt_team_nth; 7025 7026 if (!__kmp_init_serial) { 7027 __kmp_do_serial_initialize(); 7028 } 7029 7030 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7031 7032 // Save the previous value for the __kmp_dflt_team_nth so that 7033 // we can avoid some reinitialization if it hasn't changed. 7034 prev_dflt_team_nth = __kmp_dflt_team_nth; 7035 7036 #if KMP_AFFINITY_SUPPORTED 7037 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7038 // number of cores on the machine. 7039 __kmp_affinity_initialize(); 7040 7041 #endif /* KMP_AFFINITY_SUPPORTED */ 7042 7043 KMP_ASSERT(__kmp_xproc > 0); 7044 if (__kmp_avail_proc == 0) { 7045 __kmp_avail_proc = __kmp_xproc; 7046 } 7047 7048 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7049 // correct them now 7050 j = 0; 7051 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7052 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7053 __kmp_avail_proc; 7054 j++; 7055 } 7056 7057 if (__kmp_dflt_team_nth == 0) { 7058 #ifdef KMP_DFLT_NTH_CORES 7059 // Default #threads = #cores 7060 __kmp_dflt_team_nth = __kmp_ncores; 7061 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7062 "__kmp_ncores (%d)\n", 7063 __kmp_dflt_team_nth)); 7064 #else 7065 // Default #threads = #available OS procs 7066 __kmp_dflt_team_nth = __kmp_avail_proc; 7067 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7068 "__kmp_avail_proc(%d)\n", 7069 __kmp_dflt_team_nth)); 7070 #endif /* KMP_DFLT_NTH_CORES */ 7071 } 7072 7073 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7074 __kmp_dflt_team_nth = KMP_MIN_NTH; 7075 } 7076 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7077 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7078 } 7079 7080 if (__kmp_nesting_mode > 0) 7081 __kmp_set_nesting_mode_threads(); 7082 7083 // There's no harm in continuing if the following check fails, 7084 // but it indicates an error in the previous logic. 7085 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7086 7087 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7088 // Run through the __kmp_threads array and set the num threads icv for each 7089 // root thread that is currently registered with the RTL (which has not 7090 // already explicitly set its nthreads-var with a call to 7091 // omp_set_num_threads()). 
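    // A zero td_icvs.nproc marks a root that is still on the default value; a
    // nonzero value was set explicitly and is preserved by the loop below.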
7092 for (i = 0; i < __kmp_threads_capacity; i++) { 7093 kmp_info_t *thread = __kmp_threads[i]; 7094 if (thread == NULL) 7095 continue; 7096 if (thread->th.th_current_task->td_icvs.nproc != 0) 7097 continue; 7098 7099 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7100 } 7101 } 7102 KA_TRACE( 7103 20, 7104 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7105 __kmp_dflt_team_nth)); 7106 7107 #ifdef KMP_ADJUST_BLOCKTIME 7108 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7109 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7110 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7111 if (__kmp_nth > __kmp_avail_proc) { 7112 __kmp_zero_bt = TRUE; 7113 } 7114 } 7115 #endif /* KMP_ADJUST_BLOCKTIME */ 7116 7117 /* we have finished middle initialization */ 7118 TCW_SYNC_4(__kmp_init_middle, TRUE); 7119 7120 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7121 } 7122 7123 void __kmp_middle_initialize(void) { 7124 if (__kmp_init_middle) { 7125 return; 7126 } 7127 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7128 if (__kmp_init_middle) { 7129 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7130 return; 7131 } 7132 __kmp_do_middle_initialize(); 7133 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7134 } 7135 7136 void __kmp_parallel_initialize(void) { 7137 int gtid = __kmp_entry_gtid(); // this might be a new root 7138 7139 /* synchronize parallel initialization (for sibling) */ 7140 if (TCR_4(__kmp_init_parallel)) 7141 return; 7142 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7143 if (TCR_4(__kmp_init_parallel)) { 7144 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7145 return; 7146 } 7147 7148 /* TODO reinitialization after we have already shut down */ 7149 if (TCR_4(__kmp_global.g.g_done)) { 7150 KA_TRACE( 7151 10, 7152 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7153 __kmp_infinite_loop(); 7154 } 7155 7156 /* jc: The lock __kmp_initz_lock is already held, so calling 7157 __kmp_serial_initialize would cause a deadlock. So we call 7158 __kmp_do_serial_initialize directly. */ 7159 if (!__kmp_init_middle) { 7160 __kmp_do_middle_initialize(); 7161 } 7162 __kmp_assign_root_init_mask(); 7163 __kmp_resume_if_hard_paused(); 7164 7165 /* begin initialization */ 7166 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7167 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7168 7169 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7170 // Save the FP control regs. 7171 // Worker threads will set theirs to these values at thread startup. 
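  // The primary thread's x87 control word and MXCSR (masked with
  // KMP_X86_MXCSR_MASK) become the reference values that workers copy,
  // keeping floating-point behavior consistent across the team.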
7172 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7173 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7174 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7175 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7176 7177 #if KMP_OS_UNIX 7178 #if KMP_HANDLE_SIGNALS 7179 /* must be after __kmp_serial_initialize */ 7180 __kmp_install_signals(TRUE); 7181 #endif 7182 #endif 7183 7184 __kmp_suspend_initialize(); 7185 7186 #if defined(USE_LOAD_BALANCE) 7187 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7188 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7189 } 7190 #else 7191 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7192 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7193 } 7194 #endif 7195 7196 if (__kmp_version) { 7197 __kmp_print_version_2(); 7198 } 7199 7200 /* we have finished parallel initialization */ 7201 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7202 7203 KMP_MB(); 7204 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7205 7206 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7207 } 7208 7209 void __kmp_hidden_helper_initialize() { 7210 if (TCR_4(__kmp_init_hidden_helper)) 7211 return; 7212 7213 // __kmp_parallel_initialize is required before we initialize hidden helper 7214 if (!TCR_4(__kmp_init_parallel)) 7215 __kmp_parallel_initialize(); 7216 7217 // Double check. Note that this double check should not be placed before 7218 // __kmp_parallel_initialize as it will cause dead lock. 7219 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7220 if (TCR_4(__kmp_init_hidden_helper)) { 7221 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7222 return; 7223 } 7224 7225 // Set the count of hidden helper tasks to be executed to zero 7226 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7227 7228 // Set the global variable indicating that we're initializing hidden helper 7229 // team/threads 7230 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7231 7232 // Platform independent initialization 7233 __kmp_do_initialize_hidden_helper_threads(); 7234 7235 // Wait here for the finish of initialization of hidden helper teams 7236 __kmp_hidden_helper_threads_initz_wait(); 7237 7238 // We have finished hidden helper initialization 7239 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7240 7241 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7242 } 7243 7244 /* ------------------------------------------------------------------------ */ 7245 7246 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7247 kmp_team_t *team) { 7248 kmp_disp_t *dispatch; 7249 7250 KMP_MB(); 7251 7252 /* none of the threads have encountered any constructs, yet. */ 7253 this_thr->th.th_local.this_construct = 0; 7254 #if KMP_CACHE_MANAGE 7255 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7256 #endif /* KMP_CACHE_MANAGE */ 7257 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7258 KMP_DEBUG_ASSERT(dispatch); 7259 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7260 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7261 // this_thr->th.th_info.ds.ds_tid ] ); 7262 7263 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7264 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7265 if (__kmp_env_consistency_check) 7266 __kmp_push_parallel(gtid, team->t.t_ident); 7267 7268 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7269 } 7270 7271 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7272 kmp_team_t *team) { 7273 if (__kmp_env_consistency_check) 7274 __kmp_pop_parallel(gtid, team->t.t_ident); 7275 7276 __kmp_finish_implicit_task(this_thr); 7277 } 7278 7279 int __kmp_invoke_task_func(int gtid) { 7280 int rc; 7281 int tid = __kmp_tid_from_gtid(gtid); 7282 kmp_info_t *this_thr = __kmp_threads[gtid]; 7283 kmp_team_t *team = this_thr->th.th_team; 7284 7285 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7286 #if USE_ITT_BUILD 7287 if (__itt_stack_caller_create_ptr) { 7288 // inform ittnotify about entering user's code 7289 if (team->t.t_stack_id != NULL) { 7290 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7291 } else { 7292 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7293 __kmp_itt_stack_callee_enter( 7294 (__itt_caller)team->t.t_parent->t.t_stack_id); 7295 } 7296 } 7297 #endif /* USE_ITT_BUILD */ 7298 #if INCLUDE_SSC_MARKS 7299 SSC_MARK_INVOKING(); 7300 #endif 7301 7302 #if OMPT_SUPPORT 7303 void *dummy; 7304 void **exit_frame_p; 7305 ompt_data_t *my_task_data; 7306 ompt_data_t *my_parallel_data; 7307 int ompt_team_size; 7308 7309 if (ompt_enabled.enabled) { 7310 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7311 .ompt_task_info.frame.exit_frame.ptr); 7312 } else { 7313 exit_frame_p = &dummy; 7314 } 7315 7316 my_task_data = 7317 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7318 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7319 if (ompt_enabled.ompt_callback_implicit_task) { 7320 ompt_team_size = team->t.t_nproc; 7321 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7322 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7323 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7324 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7325 } 7326 #endif 7327 7328 #if KMP_STATS_ENABLED 7329 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7330 if (previous_state == stats_state_e::TEAMS_REGION) { 7331 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7332 } else { 7333 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7334 } 7335 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7336 #endif 7337 7338 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7339 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7340 #if OMPT_SUPPORT 7341 , 7342 exit_frame_p 7343 #endif 7344 ); 7345 #if OMPT_SUPPORT 7346 *exit_frame_p = NULL; 7347 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7348 #endif 7349 7350 #if KMP_STATS_ENABLED 7351 if (previous_state == stats_state_e::TEAMS_REGION) { 7352 KMP_SET_THREAD_STATE(previous_state); 7353 } 7354 KMP_POP_PARTITIONED_TIMER(); 7355 #endif 7356 7357 #if USE_ITT_BUILD 7358 if (__itt_stack_caller_create_ptr) { 7359 // inform ittnotify about leaving user's code 7360 if (team->t.t_stack_id != NULL) { 7361 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7362 } else { 7363 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7364 __kmp_itt_stack_callee_leave( 7365 (__itt_caller)team->t.t_parent->t.t_stack_id); 7366 } 7367 } 7368 #endif /* USE_ITT_BUILD */ 7369 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7370 7371 return rc; 7372 } 7373 7374 void __kmp_teams_master(int gtid) { 7375 // This routine is called by all primary threads in teams construct 7376 kmp_info_t *thr = __kmp_threads[gtid]; 7377 kmp_team_t *team = thr->th.th_team; 7378 ident_t *loc = team->t.t_ident; 7379 
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7380 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7381 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7382 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7383 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7384 7385 // This thread is a new CG root. Set up the proper variables. 7386 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7387 tmp->cg_root = thr; // Make thr the CG root 7388 // Init to thread limit stored when league primary threads were forked 7389 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7390 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7391 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7392 " cg_nthreads to 1\n", 7393 thr, tmp)); 7394 tmp->up = thr->th.th_cg_roots; 7395 thr->th.th_cg_roots = tmp; 7396 7397 // Launch league of teams now, but not let workers execute 7398 // (they hang on fork barrier until next parallel) 7399 #if INCLUDE_SSC_MARKS 7400 SSC_MARK_FORKING(); 7401 #endif 7402 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7403 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7404 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7405 #if INCLUDE_SSC_MARKS 7406 SSC_MARK_JOINING(); 7407 #endif 7408 // If the team size was reduced from the limit, set it to the new size 7409 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7410 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7411 // AC: last parameter "1" eliminates join barrier which won't work because 7412 // worker threads are in a fork barrier waiting for more parallel regions 7413 __kmp_join_call(loc, gtid 7414 #if OMPT_SUPPORT 7415 , 7416 fork_context_intel 7417 #endif 7418 , 7419 1); 7420 } 7421 7422 int __kmp_invoke_teams_master(int gtid) { 7423 kmp_info_t *this_thr = __kmp_threads[gtid]; 7424 kmp_team_t *team = this_thr->th.th_team; 7425 #if KMP_DEBUG 7426 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7427 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7428 (void *)__kmp_teams_master); 7429 #endif 7430 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7431 #if OMPT_SUPPORT 7432 int tid = __kmp_tid_from_gtid(gtid); 7433 ompt_data_t *task_data = 7434 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7435 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7436 if (ompt_enabled.ompt_callback_implicit_task) { 7437 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7438 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7439 ompt_task_initial); 7440 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7441 } 7442 #endif 7443 __kmp_teams_master(gtid); 7444 #if OMPT_SUPPORT 7445 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7446 #endif 7447 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7448 return 1; 7449 } 7450 7451 /* this sets the requested number of threads for the next parallel region 7452 encountered by this team. 
since this should be enclosed in the forkjoin 7453 critical section it should avoid race conditions with asymmetrical nested 7454 parallelism */ 7455 7456 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7457 kmp_info_t *thr = __kmp_threads[gtid]; 7458 7459 if (num_threads > 0) 7460 thr->th.th_set_nproc = num_threads; 7461 } 7462 7463 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7464 int num_threads) { 7465 KMP_DEBUG_ASSERT(thr); 7466 // Remember the number of threads for inner parallel regions 7467 if (!TCR_4(__kmp_init_middle)) 7468 __kmp_middle_initialize(); // get internal globals calculated 7469 __kmp_assign_root_init_mask(); 7470 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7471 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7472 7473 if (num_threads == 0) { 7474 if (__kmp_teams_thread_limit > 0) { 7475 num_threads = __kmp_teams_thread_limit; 7476 } else { 7477 num_threads = __kmp_avail_proc / num_teams; 7478 } 7479 // adjust num_threads w/o warning as it is not user setting 7480 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7481 // no thread_limit clause specified - do not change thread-limit-var ICV 7482 if (num_threads > __kmp_dflt_team_nth) { 7483 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7484 } 7485 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7486 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7487 } // prevent team size to exceed thread-limit-var 7488 if (num_teams * num_threads > __kmp_teams_max_nth) { 7489 num_threads = __kmp_teams_max_nth / num_teams; 7490 } 7491 if (num_threads == 0) { 7492 num_threads = 1; 7493 } 7494 } else { 7495 // This thread will be the primary thread of the league primary threads 7496 // Store new thread limit; old limit is saved in th_cg_roots list 7497 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7498 // num_threads = min(num_threads, nthreads-var) 7499 if (num_threads > __kmp_dflt_team_nth) { 7500 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7501 } 7502 if (num_teams * num_threads > __kmp_teams_max_nth) { 7503 int new_threads = __kmp_teams_max_nth / num_teams; 7504 if (new_threads == 0) { 7505 new_threads = 1; 7506 } 7507 if (new_threads != num_threads) { 7508 if (!__kmp_reserve_warn) { // user asked for too many threads 7509 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7510 __kmp_msg(kmp_ms_warning, 7511 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7512 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7513 } 7514 } 7515 num_threads = new_threads; 7516 } 7517 } 7518 thr->th.th_teams_size.nth = num_threads; 7519 } 7520 7521 /* this sets the requested number of teams for the teams region and/or 7522 the number of threads for the next parallel region encountered */ 7523 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7524 int num_threads) { 7525 kmp_info_t *thr = __kmp_threads[gtid]; 7526 KMP_DEBUG_ASSERT(num_teams >= 0); 7527 KMP_DEBUG_ASSERT(num_threads >= 0); 7528 7529 if (num_teams == 0) { 7530 if (__kmp_nteams > 0) { 7531 num_teams = __kmp_nteams; 7532 } else { 7533 num_teams = 1; // default number of teams is 1. 7534 } 7535 } 7536 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
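    // Warn once, then clamp the request to __kmp_teams_max_nth (itself capped
    // at __kmp_sys_max_nth during serial initialization).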
7537 if (!__kmp_reserve_warn) { 7538 __kmp_reserve_warn = 1; 7539 __kmp_msg(kmp_ms_warning, 7540 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7541 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7542 } 7543 num_teams = __kmp_teams_max_nth; 7544 } 7545 // Set number of teams (number of threads in the outer "parallel" of the 7546 // teams) 7547 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7548 7549 __kmp_push_thread_limit(thr, num_teams, num_threads); 7550 } 7551 7552 /* This sets the requested number of teams for the teams region and/or 7553 the number of threads for the next parallel region encountered */ 7554 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7555 int num_teams_ub, int num_threads) { 7556 kmp_info_t *thr = __kmp_threads[gtid]; 7557 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7558 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7559 KMP_DEBUG_ASSERT(num_threads >= 0); 7560 7561 if (num_teams_lb > num_teams_ub) { 7562 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7563 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7564 } 7565 7566 int num_teams = 1; // defalt number of teams is 1. 7567 7568 if (num_teams_lb == 0 && num_teams_ub > 0) 7569 num_teams_lb = num_teams_ub; 7570 7571 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7572 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7573 if (num_teams > __kmp_teams_max_nth) { 7574 if (!__kmp_reserve_warn) { 7575 __kmp_reserve_warn = 1; 7576 __kmp_msg(kmp_ms_warning, 7577 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7578 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7579 } 7580 num_teams = __kmp_teams_max_nth; 7581 } 7582 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7583 num_teams = num_teams_ub; 7584 } else { // num_teams_lb <= num_teams <= num_teams_ub 7585 if (num_threads == 0) { 7586 if (num_teams_ub > __kmp_teams_max_nth) { 7587 num_teams = num_teams_lb; 7588 } else { 7589 num_teams = num_teams_ub; 7590 } 7591 } else { 7592 num_teams = (num_threads > __kmp_teams_max_nth) 7593 ? num_teams 7594 : __kmp_teams_max_nth / num_threads; 7595 if (num_teams < num_teams_lb) { 7596 num_teams = num_teams_lb; 7597 } else if (num_teams > num_teams_ub) { 7598 num_teams = num_teams_ub; 7599 } 7600 } 7601 } 7602 // Set number of teams (number of threads in the outer "parallel" of the 7603 // teams) 7604 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7605 7606 __kmp_push_thread_limit(thr, num_teams, num_threads); 7607 } 7608 7609 // Set the proc_bind var to use in the following parallel region. 7610 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7611 kmp_info_t *thr = __kmp_threads[gtid]; 7612 thr->th.th_set_proc_bind = proc_bind; 7613 } 7614 7615 /* Launch the worker threads into the microtask. */ 7616 7617 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7618 kmp_info_t *this_thr = __kmp_threads[gtid]; 7619 7620 #ifdef KMP_DEBUG 7621 int f; 7622 #endif /* KMP_DEBUG */ 7623 7624 KMP_DEBUG_ASSERT(team); 7625 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7626 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7627 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7628 7629 team->t.t_construct = 0; /* no single directives seen yet */ 7630 team->t.t_ordered.dt.t_value = 7631 0; /* thread 0 enters the ordered section first */ 7632 7633 /* Reset the identifiers on the dispatch buffer */ 7634 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7635 if (team->t.t_max_nproc > 1) { 7636 int i; 7637 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7638 team->t.t_disp_buffer[i].buffer_index = i; 7639 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7640 } 7641 } else { 7642 team->t.t_disp_buffer[0].buffer_index = 0; 7643 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7644 } 7645 7646 KMP_MB(); /* Flush all pending memory write invalidates. */ 7647 KMP_ASSERT(this_thr->th.th_team == team); 7648 7649 #ifdef KMP_DEBUG 7650 for (f = 0; f < team->t.t_nproc; f++) { 7651 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7652 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7653 } 7654 #endif /* KMP_DEBUG */ 7655 7656 /* release the worker threads so they may begin working */ 7657 __kmp_fork_barrier(gtid, 0); 7658 } 7659 7660 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7661 kmp_info_t *this_thr = __kmp_threads[gtid]; 7662 7663 KMP_DEBUG_ASSERT(team); 7664 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7665 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7666 KMP_MB(); /* Flush all pending memory write invalidates. */ 7667 7668 /* Join barrier after fork */ 7669 7670 #ifdef KMP_DEBUG 7671 if (__kmp_threads[gtid] && 7672 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7673 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7674 __kmp_threads[gtid]); 7675 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7676 "team->t.t_nproc=%d\n", 7677 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7678 team->t.t_nproc); 7679 __kmp_print_structure(); 7680 } 7681 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7682 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7683 #endif /* KMP_DEBUG */ 7684 7685 __kmp_join_barrier(gtid); /* wait for everyone */ 7686 #if OMPT_SUPPORT 7687 if (ompt_enabled.enabled && 7688 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7689 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7690 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7691 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7692 #if OMPT_OPTIONAL 7693 void *codeptr = NULL; 7694 if (KMP_MASTER_TID(ds_tid) && 7695 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7696 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7697 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7698 7699 if (ompt_enabled.ompt_callback_sync_region_wait) { 7700 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7701 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7702 codeptr); 7703 } 7704 if (ompt_enabled.ompt_callback_sync_region) { 7705 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7706 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7707 codeptr); 7708 } 7709 #endif 7710 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7711 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7712 ompt_scope_end, NULL, task_data, 0, ds_tid, 7713 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7714 } 7715 } 7716 #endif 7717 7718 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7719 KMP_ASSERT(this_thr->th.th_team == team); 7720 } 7721 7722 /* ------------------------------------------------------------------------ */ 7723 7724 #ifdef USE_LOAD_BALANCE 7725 7726 // Return the worker threads actively spinning in the hot team, if we 7727 // are at the outermost level of parallelism. Otherwise, return 0. 7728 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7729 int i; 7730 int retval; 7731 kmp_team_t *hot_team; 7732 7733 if (root->r.r_active) { 7734 return 0; 7735 } 7736 hot_team = root->r.r_hot_team; 7737 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7738 return hot_team->t.t_nproc - 1; // Don't count primary thread 7739 } 7740 7741 // Skip the primary thread - it is accounted for elsewhere. 7742 retval = 0; 7743 for (i = 1; i < hot_team->t.t_nproc; i++) { 7744 if (hot_team->t.t_threads[i]->th.th_active) { 7745 retval++; 7746 } 7747 } 7748 return retval; 7749 } 7750 7751 // Perform an automatic adjustment to the number of 7752 // threads used by the next parallel region. 7753 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7754 int retval; 7755 int pool_active; 7756 int hot_team_active; 7757 int team_curr_active; 7758 int system_active; 7759 7760 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7761 set_nproc)); 7762 KMP_DEBUG_ASSERT(root); 7763 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7764 ->th.th_current_task->td_icvs.dynamic == TRUE); 7765 KMP_DEBUG_ASSERT(set_nproc > 1); 7766 7767 if (set_nproc == 1) { 7768 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7769 return 1; 7770 } 7771 7772 // Threads that are active in the thread pool, active in the hot team for this 7773 // particular root (if we are at the outer par level), and the currently 7774 // executing thread (to become the primary thread) are available to add to the 7775 // new team, but are currently contributing to the system load, and must be 7776 // accounted for. 7777 pool_active = __kmp_thread_pool_active_nth; 7778 hot_team_active = __kmp_active_hot_team_nproc(root); 7779 team_curr_active = pool_active + hot_team_active + 1; 7780 7781 // Check the system load. 7782 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7783 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7784 "hot team active = %d\n", 7785 system_active, pool_active, hot_team_active)); 7786 7787 if (system_active < 0) { 7788 // There was an error reading the necessary info from /proc, so use the 7789 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7790 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7791 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7792 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7793 7794 // Make this call behave like the thread limit algorithm. 7795 retval = __kmp_avail_proc - __kmp_nth + 7796 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7797 if (retval > set_nproc) { 7798 retval = set_nproc; 7799 } 7800 if (retval < KMP_MIN_NTH) { 7801 retval = KMP_MIN_NTH; 7802 } 7803 7804 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7805 retval)); 7806 return retval; 7807 } 7808 7809 // There is a slight delay in the load balance algorithm in detecting new 7810 // running procs. The real system load at this instant should be at least as 7811 // large as the #active omp thread that are available to add to the team. 
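  // After system_active is floored at team_curr_active below, the new team
  // may use the currently idle procs plus the ones this root already accounts
  // for:
  //   retval = __kmp_avail_proc - system_active + team_curr_active
  // e.g. (hypothetical) avail_proc=16, system_active=10, team_curr_active=4
  // gives 16 - 10 + 4 = 10, which is then clamped to [KMP_MIN_NTH, set_nproc].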
7812 if (system_active < team_curr_active) { 7813 system_active = team_curr_active; 7814 } 7815 retval = __kmp_avail_proc - system_active + team_curr_active; 7816 if (retval > set_nproc) { 7817 retval = set_nproc; 7818 } 7819 if (retval < KMP_MIN_NTH) { 7820 retval = KMP_MIN_NTH; 7821 } 7822 7823 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7824 return retval; 7825 } // __kmp_load_balance_nproc() 7826 7827 #endif /* USE_LOAD_BALANCE */ 7828 7829 /* ------------------------------------------------------------------------ */ 7830 7831 /* NOTE: this is called with the __kmp_init_lock held */ 7832 void __kmp_cleanup(void) { 7833 int f; 7834 7835 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7836 7837 if (TCR_4(__kmp_init_parallel)) { 7838 #if KMP_HANDLE_SIGNALS 7839 __kmp_remove_signals(); 7840 #endif 7841 TCW_4(__kmp_init_parallel, FALSE); 7842 } 7843 7844 if (TCR_4(__kmp_init_middle)) { 7845 #if KMP_AFFINITY_SUPPORTED 7846 __kmp_affinity_uninitialize(); 7847 #endif /* KMP_AFFINITY_SUPPORTED */ 7848 __kmp_cleanup_hierarchy(); 7849 TCW_4(__kmp_init_middle, FALSE); 7850 } 7851 7852 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7853 7854 if (__kmp_init_serial) { 7855 __kmp_runtime_destroy(); 7856 __kmp_init_serial = FALSE; 7857 } 7858 7859 __kmp_cleanup_threadprivate_caches(); 7860 7861 for (f = 0; f < __kmp_threads_capacity; f++) { 7862 if (__kmp_root[f] != NULL) { 7863 __kmp_free(__kmp_root[f]); 7864 __kmp_root[f] = NULL; 7865 } 7866 } 7867 __kmp_free(__kmp_threads); 7868 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7869 // there is no need in freeing __kmp_root. 7870 __kmp_threads = NULL; 7871 __kmp_root = NULL; 7872 __kmp_threads_capacity = 0; 7873 7874 #if KMP_USE_DYNAMIC_LOCK 7875 __kmp_cleanup_indirect_user_locks(); 7876 #else 7877 __kmp_cleanup_user_locks(); 7878 #endif 7879 #if OMPD_SUPPORT 7880 if (ompd_state) { 7881 __kmp_free(ompd_env_block); 7882 ompd_env_block = NULL; 7883 ompd_env_block_size = 0; 7884 } 7885 #endif 7886 7887 #if KMP_AFFINITY_SUPPORTED 7888 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7889 __kmp_cpuinfo_file = NULL; 7890 #endif /* KMP_AFFINITY_SUPPORTED */ 7891 7892 #if KMP_USE_ADAPTIVE_LOCKS 7893 #if KMP_DEBUG_ADAPTIVE_LOCKS 7894 __kmp_print_speculative_stats(); 7895 #endif 7896 #endif 7897 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7898 __kmp_nested_nth.nth = NULL; 7899 __kmp_nested_nth.size = 0; 7900 __kmp_nested_nth.used = 0; 7901 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7902 __kmp_nested_proc_bind.bind_types = NULL; 7903 __kmp_nested_proc_bind.size = 0; 7904 __kmp_nested_proc_bind.used = 0; 7905 if (__kmp_affinity_format) { 7906 KMP_INTERNAL_FREE(__kmp_affinity_format); 7907 __kmp_affinity_format = NULL; 7908 } 7909 7910 __kmp_i18n_catclose(); 7911 7912 #if KMP_USE_HIER_SCHED 7913 __kmp_hier_scheds.deallocate(); 7914 #endif 7915 7916 #if KMP_STATS_ENABLED 7917 __kmp_stats_fini(); 7918 #endif 7919 7920 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7921 } 7922 7923 /* ------------------------------------------------------------------------ */ 7924 7925 int __kmp_ignore_mppbeg(void) { 7926 char *env; 7927 7928 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7929 if (__kmp_str_match_false(env)) 7930 return FALSE; 7931 } 7932 // By default __kmpc_begin() is no-op. 
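  // Returning TRUE keeps the ignore behavior; only an explicit "false"-style
  // value of KMP_IGNORE_MPPBEG makes __kmpc_begin() do real work.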
7933 return TRUE; 7934 } 7935 7936 int __kmp_ignore_mppend(void) { 7937 char *env; 7938 7939 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7940 if (__kmp_str_match_false(env)) 7941 return FALSE; 7942 } 7943 // By default __kmpc_end() is no-op. 7944 return TRUE; 7945 } 7946 7947 void __kmp_internal_begin(void) { 7948 int gtid; 7949 kmp_root_t *root; 7950 7951 /* this is a very important step as it will register new sibling threads 7952 and assign these new uber threads a new gtid */ 7953 gtid = __kmp_entry_gtid(); 7954 root = __kmp_threads[gtid]->th.th_root; 7955 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7956 7957 if (root->r.r_begin) 7958 return; 7959 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7960 if (root->r.r_begin) { 7961 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7962 return; 7963 } 7964 7965 root->r.r_begin = TRUE; 7966 7967 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7968 } 7969 7970 /* ------------------------------------------------------------------------ */ 7971 7972 void __kmp_user_set_library(enum library_type arg) { 7973 int gtid; 7974 kmp_root_t *root; 7975 kmp_info_t *thread; 7976 7977 /* first, make sure we are initialized so we can get our gtid */ 7978 7979 gtid = __kmp_entry_gtid(); 7980 thread = __kmp_threads[gtid]; 7981 7982 root = thread->th.th_root; 7983 7984 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7985 library_serial)); 7986 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7987 thread */ 7988 KMP_WARNING(SetLibraryIncorrectCall); 7989 return; 7990 } 7991 7992 switch (arg) { 7993 case library_serial: 7994 thread->th.th_set_nproc = 0; 7995 set__nproc(thread, 1); 7996 break; 7997 case library_turnaround: 7998 thread->th.th_set_nproc = 0; 7999 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8000 : __kmp_dflt_team_nth_ub); 8001 break; 8002 case library_throughput: 8003 thread->th.th_set_nproc = 0; 8004 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8005 : __kmp_dflt_team_nth_ub); 8006 break; 8007 default: 8008 KMP_FATAL(UnknownLibraryType, arg); 8009 } 8010 8011 __kmp_aux_set_library(arg); 8012 } 8013 8014 void __kmp_aux_set_stacksize(size_t arg) { 8015 if (!__kmp_init_serial) 8016 __kmp_serial_initialize(); 8017 8018 #if KMP_OS_DARWIN 8019 if (arg & (0x1000 - 1)) { 8020 arg &= ~(0x1000 - 1); 8021 if (arg + 0x1000) /* check for overflow if we round up */ 8022 arg += 0x1000; 8023 } 8024 #endif 8025 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8026 8027 /* only change the default stacksize before the first parallel region */ 8028 if (!TCR_4(__kmp_init_parallel)) { 8029 size_t value = arg; /* argument is in bytes */ 8030 8031 if (value < __kmp_sys_min_stksize) 8032 value = __kmp_sys_min_stksize; 8033 else if (value > KMP_MAX_STKSIZE) 8034 value = KMP_MAX_STKSIZE; 8035 8036 __kmp_stksize = value; 8037 8038 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8039 } 8040 8041 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8042 } 8043 8044 /* set the behaviour of the runtime library */ 8045 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8046 void __kmp_aux_set_library(enum library_type arg) { 8047 __kmp_library = arg; 8048 8049 switch (__kmp_library) { 8050 case library_serial: { 8051 KMP_INFORM(LibraryIsSerial); 8052 } break; 8053 case library_turnaround: 8054 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8055 __kmp_use_yield = 2; // only yield when oversubscribed 8056 break; 8057 case library_throughput: 8058 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8059 __kmp_dflt_blocktime = 200; 8060 break; 8061 default: 8062 KMP_FATAL(UnknownLibraryType, arg); 8063 } 8064 } 8065 8066 /* Getting team information common for all team API */ 8067 // Returns NULL if not in teams construct 8068 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8069 kmp_info_t *thr = __kmp_entry_thread(); 8070 teams_serialized = 0; 8071 if (thr->th.th_teams_microtask) { 8072 kmp_team_t *team = thr->th.th_team; 8073 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8074 int ii = team->t.t_level; 8075 teams_serialized = team->t.t_serialized; 8076 int level = tlevel + 1; 8077 KMP_DEBUG_ASSERT(ii >= tlevel); 8078 while (ii > level) { 8079 for (teams_serialized = team->t.t_serialized; 8080 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8081 } 8082 if (team->t.t_serialized && (!teams_serialized)) { 8083 team = team->t.t_parent; 8084 continue; 8085 } 8086 if (ii > level) { 8087 team = team->t.t_parent; 8088 ii--; 8089 } 8090 } 8091 return team; 8092 } 8093 return NULL; 8094 } 8095 8096 int __kmp_aux_get_team_num() { 8097 int serialized; 8098 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8099 if (team) { 8100 if (serialized > 1) { 8101 return 0; // teams region is serialized ( 1 team of 1 thread ). 8102 } else { 8103 return team->t.t_master_tid; 8104 } 8105 } 8106 return 0; 8107 } 8108 8109 int __kmp_aux_get_num_teams() { 8110 int serialized; 8111 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8112 if (team) { 8113 if (serialized > 1) { 8114 return 1; 8115 } else { 8116 return team->t.t_parent->t.t_nproc; 8117 } 8118 } 8119 return 1; 8120 } 8121 8122 /* ------------------------------------------------------------------------ */ 8123 8124 /* 8125 * Affinity Format Parser 8126 * 8127 * Field is in form of: %[[[0].]size]type 8128 * % and type are required (%% means print a literal '%') 8129 * type is either single char or long name surrounded by {}, 8130 * e.g., N or {num_threads} 8131 * 0 => leading zeros 8132 * . => right justified when size is specified 8133 * by default output is left justified 8134 * size is the *minimum* field length 8135 * All other characters are printed as is 8136 * 8137 * Available field types: 8138 * L {thread_level} - omp_get_level() 8139 * n {thread_num} - omp_get_thread_num() 8140 * h {host} - name of host machine 8141 * P {process_id} - process id (integer) 8142 * T {thread_identifier} - native thread identifier (integer) 8143 * N {num_threads} - omp_get_num_threads() 8144 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8145 * a {thread_affinity} - comma separated list of integers or integer ranges 8146 * (values of affinity mask) 8147 * 8148 * Implementation-specific field types can be added 8149 * If a type is unknown, print "undefined" 8150 */ 8151 8152 // Structure holding the short name, long name, and corresponding data type 8153 // for snprintf. A table of these will represent the entire valid keyword 8154 // field types. 
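// For illustration (hypothetical values): with OMP_DISPLAY_AFFINITY=TRUE and
//   OMP_AFFINITY_FORMAT="host=%H pid=%P tid=%0.3n aff=%A"
// each thread would print something like
//   host=node01 pid=12345 tid=002 aff=0,2
// using the short names from the table below.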
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword
// field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec e.g., L -> thread level
  const char *long_name; // from spec thread_level -> thread level
  char field_format; // data type for snprintf (typically 'd' or 's'
  // for integer or string)
} kmp_affinity_format_field_t;

static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};

// Return the number of characters it takes to hold field
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0;

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to spec, if an implementation does not have info for a field
    // type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}
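// Minimal worked example (not compiled with the runtime) of what the modifier
// parsing above produces: "%0.4n" becomes the printf-style format "%04d",
// while "%4n" stays left justified ("%-4d") because '.' was not given.
#if 0
#include <stdio.h>

int main(void) {
  printf("[%04d]\n", 7);  // "%0.4n" with thread number 7 -> "[0007]"
  printf("[%-4d]\n", 7);  // "%4n"   with thread number 7 -> "[7   ]"
  return 0;
}
#endif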
/*
 * Return number of characters needed to hold the affinity string
 * (not including null byte character)
 * The resultant string is printed to buffer, which the caller can then
 * handle afterwards
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or zero-length string, then we use
  // affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Set whether blocktime has been set to "TRUE" */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
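// Illustrative sketch (not compiled with the runtime): __kmp_aux_set_blocktime
// and __kmp_aux_set_defaults back the kmp_set_blocktime() and
// kmp_set_defaults() extensions declared in the LLVM <omp.h>, i.e. the same
// knobs as KMP_BLOCKTIME and the other KMP_* environment variables.
// Availability of these extension calls should be verified against the
// installed omp.h.
#if 0
#include <omp.h>

int main(void) {
  kmp_set_defaults("KMP_AFFINITY=compact"); // same syntax as the env variable
  kmp_set_blocktime(0); // idle workers go to sleep immediately

#pragma omp parallel
  { /* ... */ }
  return 0;
}
#endif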
/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to OpenMP RTL to make a decision on which method to
  // select among those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
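// Illustrative sketch (not compiled with the runtime): the selection logic
// above is exercised by an ordinary reduction clause. The compiler supplies
// the atomic/tree hints; the runtime then picks the critical, atomic, tree,
// or empty method, and KMP_FORCE_REDUCTION (referenced above) can override
// the choice. Which method is chosen for this program depends on the target
// and team size, so the comments below describe typical, not guaranteed,
// behavior.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  double sum = 0.0;
  // A single scalar reduction: small teams typically land on the atomic
  // method, larger teams on the tree method with a reduction barrier.
#pragma omp parallel for reduction(+ : sum)
  for (int i = 0; i < 1000; ++i)
    sum += i * 0.5;
  printf("sum = %f\n", sum);
  return 0;
}
#endif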
// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}
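// Illustrative sketch (not compiled with the runtime): __kmp_pause_resource is
// reached from the OpenMP 5.0 omp_pause_resource / omp_pause_resource_all API
// via __kmpc_pause_resource; both user calls return zero on success, matching
// the convention above.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
#pragma omp parallel
  { /* warm up the thread pool */ }

  // Soft pause: workers ignore blocktime and go to sleep until the next
  // parallel region resumes them.
  if (omp_pause_resource_all(omp_pause_soft) != 0)
    printf("soft pause rejected (already paused?)\n");

#pragma omp parallel
  { /* threads are resumed transparently */ }
  return 0;
}
#endif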
void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
kmp_int32 __kmp_hidden_helper_threads_num = 8;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads in case
  // that when a regular thread pushes a hidden helper task to one hidden
  // helper thread, that thread has not yet been awakened since being released
  // by the main thread after creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If main thread, then wait for signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}
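// Illustrative sketch (not compiled with the runtime): in this runtime the
// hidden helper team typically services deferred target tasks, so a program
// like the following exercises it on targets where the feature is enabled
// (see __kmp_enable_hidden_helper above). Which threads actually execute the
// task is an assumption about typical behavior, not a guarantee.
#if 0
#include <omp.h>
#include <stdio.h>

int main(void) {
  int x = 0;
  // A deferred target task ("target nowait") can be handed to the hidden
  // helper team instead of the encountering thread's team.
#pragma omp target map(tofrom : x) nowait
  { x = 42; }
#pragma omp taskwait
  printf("x = %d\n", x);
  return 0;
}
#endif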
/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   option to this experimental feature, and may change or go away in the
   future. */

// Allocate space to store nesting levels
void __kmp_init_nesting_mode() {
  int levels = KMP_HW_LAST;
  __kmp_nesting_mode_nlevels = levels;
  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
  for (int i = 0; i < levels; ++i)
    __kmp_nesting_nth_level[i] = 0;
  if (__kmp_nested_nth.size < levels) {
    __kmp_nested_nth.nth =
        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
    __kmp_nested_nth.size = levels;
  }
}

// Set # threads for top levels of nesting; must be called after topology set
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(core_level);
      int upper_levels = 1;
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable guesstimation
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // if max levels was set, set nesting mode levels to same
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
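// Illustrative sketch (not compiled with the runtime): with KMP_NESTING_MODE=1
// the routine above turns nesting on and sizes each nesting level from the
// machine topology, which a program can observe with the standard nesting
// queries. The printed team sizes depend on the host topology, so the example
// only shows the shape of the output, not specific numbers.
#if 0
#include <omp.h>
#include <stdio.h>

// Run with e.g. KMP_NESTING_MODE=1.
int main(void) {
#pragma omp parallel // level 1: team sized from the first topology level
  {
#pragma omp parallel // level 2: team sized from the next topology level
    {
#pragma omp single
      printf("nesting level %d, team of %d threads\n", omp_get_level(),
             omp_get_num_threads());
    }
  }
  return 0;
}
#endif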