/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior
     to a parallel region, made it return KMP_GTID_DNE to force
     serial_initialize by caller. Had to handle KMP_GTID_DNE at all call-sites,
     or else guarantee __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
"initial" : "actual"); 291 } 292 } 293 294 /* No point in checking ubermaster threads since they use refinement and 295 * cannot overlap */ 296 gtid = __kmp_gtid_from_thread(th); 297 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 298 KA_TRACE(10, 299 ("__kmp_check_stack_overlap: performing extensive checking\n")); 300 if (stack_beg == NULL) { 301 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 302 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 303 } 304 305 for (f = 0; f < __kmp_threads_capacity; f++) { 306 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 307 308 if (f_th && f_th != th) { 309 char *other_stack_end = 310 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 311 char *other_stack_beg = 312 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 313 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 314 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 315 316 /* Print the other stack values before the abort */ 317 if (__kmp_storage_map) 318 __kmp_print_storage_map_gtid( 319 -1, other_stack_beg, other_stack_end, 320 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 321 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 322 323 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 324 __kmp_msg_null); 325 } 326 } 327 } 328 } 329 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 330 } 331 332 /* ------------------------------------------------------------------------ */ 333 334 void __kmp_infinite_loop(void) { 335 static int done = FALSE; 336 337 while (!done) { 338 KMP_YIELD(TRUE); 339 } 340 } 341 342 #define MAX_MESSAGE 512 343 344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 345 char const *format, ...) { 346 char buffer[MAX_MESSAGE]; 347 va_list ap; 348 349 va_start(ap, format); 350 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 351 p2, (unsigned long)size, format); 352 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 353 __kmp_vprintf(kmp_err, buffer, ap); 354 #if KMP_PRINT_DATA_PLACEMENT 355 int node; 356 if (gtid >= 0) { 357 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 358 if (__kmp_storage_map_verbose) { 359 node = __kmp_get_host_node(p1); 360 if (node < 0) /* doesn't work, so don't try this next time */ 361 __kmp_storage_map_verbose = FALSE; 362 else { 363 char *last; 364 int lastNode; 365 int localProc = __kmp_get_cpu_from_gtid(gtid); 366 367 const int page_size = KMP_GET_PAGE_SIZE(); 368 369 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 370 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 371 if (localProc >= 0) 372 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 373 localProc >> 1); 374 else 375 __kmp_printf_no_lock(" GTID %d\n", gtid); 376 #if KMP_USE_PRCTL 377 /* The more elaborate format is disabled for now because of the prctl 378 * hanging bug. */ 379 do { 380 last = p1; 381 lastNode = node; 382 /* This loop collates adjacent pages with the same host node. 
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS, abort() causes a pop-up error box by default, which
       stalls nightly testing. Unfortunately, we cannot reliably suppress the
       pop-up error boxes. _set_abort_behavior() works well, but this function
       is not available in VS7 (this is not a problem for the DLL, but it is a
       problem for the static OpenMP RTL). SetErrorMode (and so, the timelimit
       utility) does not help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    __kmp_unregister_library();
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    // lpReserved == NULL when FreeLibrary() is called,
    // lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
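  // Note: the recurring expression
  //   __kmp_nth + new_nthreads - (root->r.r_active ? 1 : hot_team->t.t_nproc)
  // used in the limit checks above (and in the capacity check below) is the
  // projected total thread count after the fork: the forking primary thread
  // (when the root is active) or the existing hot team members (when it is
  // not) are already counted in __kmp_nth and will be reused, so they are
  // subtracted to avoid double counting against the limits.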
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team. We
   are assured that there are enough threads available, because we checked on
   that earlier while holding the forkjoin lock. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the primary thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

  /* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the primary thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the
// team. We try to avoid unnecessary writes to the relevant cache line in the
// team structure, so we don't make changes unless they are needed.
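// Note: these helpers are only compiled for x86/x86_64 (see the guard above);
// on other architectures the #else branch below defines propagateFPControl()
// and updateHWFPControl() as no-op macros, so callers need no extra guards.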
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of
   the single primary thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  ompt_data_t *implicit_task_data;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1186 global_tid, serial_team)); 1187 1188 /* TODO the above breaks the requirement that if we run out of resources, 1189 then we can still guarantee that serialized teams are ok, since we may 1190 need to allocate a new one */ 1191 } else { 1192 KF_TRACE( 1193 10, 1194 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1195 global_tid, serial_team)); 1196 } 1197 1198 /* we have to initialize this serial team */ 1199 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1200 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1201 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1202 serial_team->t.t_ident = loc; 1203 serial_team->t.t_serialized = 1; 1204 serial_team->t.t_nproc = 1; 1205 serial_team->t.t_parent = this_thr->th.th_team; 1206 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1207 this_thr->th.th_team = serial_team; 1208 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1209 1210 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1211 this_thr->th.th_current_task)); 1212 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1213 this_thr->th.th_current_task->td_flags.executing = 0; 1214 1215 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1216 1217 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1218 implicit task for each serialized task represented by 1219 team->t.t_serialized? */ 1220 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1221 &this_thr->th.th_current_task->td_parent->td_icvs); 1222 1223 // Thread value exists in the nested nthreads array for the next nested 1224 // level 1225 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1226 this_thr->th.th_current_task->td_icvs.nproc = 1227 __kmp_nested_nth.nth[level + 1]; 1228 } 1229 1230 if (__kmp_nested_proc_bind.used && 1231 (level + 1 < __kmp_nested_proc_bind.used)) { 1232 this_thr->th.th_current_task->td_icvs.proc_bind = 1233 __kmp_nested_proc_bind.bind_types[level + 1]; 1234 } 1235 1236 #if USE_DEBUGGER 1237 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. Content was swapped.

    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Assign affinity to root thread if it hasn't happened yet
    __kmp_assign_root_init_mask();

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        int flags = OMPT_INVOKER(call_context) |
                    ((microtask == (microtask_t)__kmp_teams_master)
                         ? ompt_parallel_league
                         : ompt_parallel_team);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
            return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is start of parallel that is nested inside teams construct.
      // The team is actual (hot), all workers are ready at the fork barrier.
      // No lock needed to initialize the team a bit, then free workers.
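      // What follows reuses the existing hot parent team: copy the outlined
      // body's arguments into it, handle the serialized case separately,
      // optionally trim t_nproc if num_threads asked for fewer threads, then
      // release the workers via __kmp_internal_fork() and have the primary
      // thread invoke the microtask itself.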
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth levels, but not increase the serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

        if (call_context == fork_context_gnu) {
          // AC: need to decrement t_serialized for enquiry functions to work
          // correctly, will restore at join time
          parent_team->t.t_serialized--;
          return TRUE;
        }

#if OMPD_SUPPORT
        parent_team->t.t_pkfn = microtask;
#endif

#if OMPT_SUPPORT
        void *dummy;
        void **exit_frame_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. Content was swapped.

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_frame_p = &dummy;
        }
#endif
        // AC: need to decrement t_serialized for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_frame_p
#endif
          );
        }

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          *exit_frame_p = NULL;
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }
          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
          __ompt_lw_taskteam_unlink(master_th);
          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context) | ompt_parallel_team,
                return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_lw_taskteam_t lw_taskteam;
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                &ompt_parallel_data, return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
      }
#endif

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: only can reduce number of threads dynamically, can't increase
          kmp_info_t **other_threads = parent_team->t.t_threads;
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
            other_threads[i]->th.th_team_nproc = master_set_numthreads;
          }
          // Keep extra threads hot in the team for possible next parallels
        }
        master_th->th.th_set_nproc = 0;
      }

#if USE_DEBUGGER
      if (__kmp_debugging) { // Let debugger override number of threads.
        int nth = __kmp_omp_num_threads(loc);
        if (nth > 0) { // 0 means debugger doesn't want to change num threads
          master_set_numthreads = nth;
        }
      }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
           KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode == 3 &&
          parent_team->t.t_active_level == 1 // only report frames at level 1
          && master_th->th.th_teams_size.nteams == 1) {
        kmp_uint64 tmp_time = __itt_get_timestamp();
        master_th->th.th_frame_time = tmp_time;
        parent_team->t.t_region_time = tmp_time;
      }
      if (__itt_stack_caller_create_ptr) {
        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
        // create new stack stitching id before entering fork barrier
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));
      __kmp_internal_fork(loc, gtid, parent_team);
      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));

      if (call_context == fork_context_gnu)
        return TRUE;

      /* Invoke microtask for PRIMARY thread */
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));

      if (!parent_team->t.t_invoke(gtid)) {
        KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
      }
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));
      KMP_MB(); /* Flush all pending memory write invalidates. */

      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));

      return TRUE;
    } // Parallel closely nested in teams construct

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
    }
#endif

    int enter_teams = 0;
    if (parent_team->t.t_active_level >=
        master_th->th.th_current_task->td_icvs.max_active_levels) {
      nthreads = 1;
    } else {
      enter_teams = ((ap == NULL && active_level == 0) ||
                     (ap && teams_level > 0 && teams_level == level));
      nthreads =
          master_set_numthreads
              ? master_set_numthreads
              : get__nproc_2(
                    parent_team,
                    master_tid); // TODO: get nproc directly from current task
master_set_numthreads 1652 : get__nproc_2( 1653 parent_team, 1654 master_tid); // TODO: get nproc directly from current task 1655 1656 // Check if we need to take forkjoin lock? (no need for serialized 1657 // parallel out of teams construct). This code moved here from 1658 // __kmp_reserve_threads() to speedup nested serialized parallels. 1659 if (nthreads > 1) { 1660 if ((get__max_active_levels(master_th) == 1 && 1661 (root->r.r_in_parallel && !enter_teams)) || 1662 (__kmp_library == library_serial)) { 1663 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1664 " threads\n", 1665 gtid, nthreads)); 1666 nthreads = 1; 1667 } 1668 } 1669 if (nthreads > 1) { 1670 /* determine how many new threads we can use */ 1671 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1672 /* AC: If we execute teams from parallel region (on host), then teams 1673 should be created but each can only have 1 thread if nesting is 1674 disabled. If teams called from serial region, then teams and their 1675 threads should be created regardless of the nesting setting. */ 1676 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1677 nthreads, enter_teams); 1678 if (nthreads == 1) { 1679 // Free lock for single thread execution here; for multi-thread 1680 // execution it will be freed later after team of threads created 1681 // and initialized 1682 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1683 } 1684 } 1685 } 1686 KMP_DEBUG_ASSERT(nthreads > 0); 1687 1688 // If we temporarily changed the set number of threads then restore it now 1689 master_th->th.th_set_nproc = 0; 1690 1691 /* create a serialized parallel region? */ 1692 if (nthreads == 1) { 1693 /* josh todo: hypothetical question: what do we do for OS X*? */ 1694 #if KMP_OS_LINUX && \ 1695 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1696 void *args[argc]; 1697 #else 1698 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1699 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1700 KMP_ARCH_AARCH64) */ 1701 1702 KA_TRACE(20, 1703 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1704 1705 __kmpc_serialized_parallel(loc, gtid); 1706 1707 #if OMPD_SUPPORT 1708 master_th->th.th_serial_team->t.t_pkfn = microtask; 1709 #endif 1710 1711 if (call_context == fork_context_intel) { 1712 /* TODO this sucks, use the compiler itself to pass args! :) */ 1713 master_th->th.th_serial_team->t.t_ident = loc; 1714 if (!ap) { 1715 // revert change made in __kmpc_serialized_parallel() 1716 master_th->th.th_serial_team->t.t_level--; 1717 // Get args from parent team for teams construct 1718 1719 #if OMPT_SUPPORT 1720 void *dummy; 1721 void **exit_frame_p; 1722 ompt_task_info_t *task_info; 1723 1724 ompt_lw_taskteam_t lw_taskteam; 1725 1726 if (ompt_enabled.enabled) { 1727 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1728 &ompt_parallel_data, return_address); 1729 1730 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1731 // don't use lw_taskteam after linking. 
content was swaped 1732 1733 task_info = OMPT_CUR_TASK_INFO(master_th); 1734 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1735 if (ompt_enabled.ompt_callback_implicit_task) { 1736 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1737 __kmp_tid_from_gtid(gtid); 1738 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1739 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1740 &(task_info->task_data), 1, 1741 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1742 ompt_task_implicit); 1743 } 1744 1745 /* OMPT state */ 1746 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1747 } else { 1748 exit_frame_p = &dummy; 1749 } 1750 #endif 1751 1752 { 1753 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1754 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1755 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1756 parent_team->t.t_argv 1757 #if OMPT_SUPPORT 1758 , 1759 exit_frame_p 1760 #endif 1761 ); 1762 } 1763 1764 #if OMPT_SUPPORT 1765 if (ompt_enabled.enabled) { 1766 *exit_frame_p = NULL; 1767 if (ompt_enabled.ompt_callback_implicit_task) { 1768 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1769 ompt_scope_end, NULL, &(task_info->task_data), 1, 1770 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1771 ompt_task_implicit); 1772 } 1773 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1774 __ompt_lw_taskteam_unlink(master_th); 1775 if (ompt_enabled.ompt_callback_parallel_end) { 1776 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1777 &ompt_parallel_data, parent_task_data, 1778 OMPT_INVOKER(call_context) | ompt_parallel_team, 1779 return_address); 1780 } 1781 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1782 } 1783 #endif 1784 } else if (microtask == (microtask_t)__kmp_teams_master) { 1785 KMP_DEBUG_ASSERT(master_th->th.th_team == 1786 master_th->th.th_serial_team); 1787 team = master_th->th.th_team; 1788 // team->t.t_pkfn = microtask; 1789 team->t.t_invoke = invoker; 1790 __kmp_alloc_argv_entries(argc, team, TRUE); 1791 team->t.t_argc = argc; 1792 argv = (void **)team->t.t_argv; 1793 if (ap) { 1794 for (i = argc - 1; i >= 0; --i) 1795 *argv++ = va_arg(kmp_va_deref(ap), void *); 1796 } else { 1797 for (i = 0; i < argc; ++i) 1798 // Get args from parent team for teams construct 1799 argv[i] = parent_team->t.t_argv[i]; 1800 } 1801 // AC: revert change made in __kmpc_serialized_parallel() 1802 // because initial code in teams should have level=0 1803 team->t.t_level--; 1804 // AC: call special invoker for outer "parallel" of teams construct 1805 invoker(gtid); 1806 #if OMPT_SUPPORT 1807 if (ompt_enabled.enabled) { 1808 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1809 if (ompt_enabled.ompt_callback_implicit_task) { 1810 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1811 ompt_scope_end, NULL, &(task_info->task_data), 0, 1812 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1813 } 1814 if (ompt_enabled.ompt_callback_parallel_end) { 1815 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1816 &ompt_parallel_data, parent_task_data, 1817 OMPT_INVOKER(call_context) | ompt_parallel_league, 1818 return_address); 1819 } 1820 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1821 } 1822 #endif 1823 } else { 1824 argv = args; 1825 for (i = argc - 1; i >= 0; --i) 1826 *argv++ = va_arg(kmp_va_deref(ap), void *); 1827 KMP_MB(); 1828 1829 #if OMPT_SUPPORT 1830 void *dummy; 1831 void **exit_frame_p; 1832 ompt_task_info_t *task_info; 1833 1834 ompt_lw_taskteam_t lw_taskteam; 1835 1836 if (ompt_enabled.enabled) { 1837 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1838 &ompt_parallel_data, return_address); 1839 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1840 // don't use lw_taskteam after linking. content was swapped 1841 task_info = OMPT_CUR_TASK_INFO(master_th); 1842 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1843 1844 /* OMPT implicit task begin */ 1845 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1846 if (ompt_enabled.ompt_callback_implicit_task) { 1847 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1848 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1849 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1850 ompt_task_implicit); 1851 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1852 __kmp_tid_from_gtid(gtid); 1853 } 1854 1855 /* OMPT state */ 1856 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1857 } else { 1858 exit_frame_p = &dummy; 1859 } 1860 #endif 1861 1862 { 1863 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1864 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1865 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1866 #if OMPT_SUPPORT 1867 , 1868 exit_frame_p 1869 #endif 1870 ); 1871 } 1872 1873 #if OMPT_SUPPORT 1874 if (ompt_enabled.enabled) { 1875 *exit_frame_p = NULL; 1876 if (ompt_enabled.ompt_callback_implicit_task) { 1877 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1878 ompt_scope_end, NULL, &(task_info->task_data), 1, 1879 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1880 ompt_task_implicit); 1881 } 1882 1883 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1884 __ompt_lw_taskteam_unlink(master_th); 1885 if (ompt_enabled.ompt_callback_parallel_end) { 1886 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1887 &ompt_parallel_data, parent_task_data, 1888 OMPT_INVOKER(call_context) | ompt_parallel_team, 1889 return_address); 1890 } 1891 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1892 } 1893 #endif 1894 } 1895 } else if (call_context == fork_context_gnu) { 1896 #if OMPT_SUPPORT 1897 ompt_lw_taskteam_t lwt; 1898 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1899 return_address); 1900 1901 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1902 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1903 // don't use lw_taskteam after linking. 
content was swapped 1904 #endif 1905 1906 // we were called from GNU native code 1907 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1908 return FALSE; 1909 } else { 1910 KMP_ASSERT2(call_context < fork_context_last, 1911 "__kmp_fork_call: unknown fork_context parameter"); 1912 } 1913 1914 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1915 KMP_MB(); 1916 return FALSE; 1917 } // if (nthreads == 1) 1918 1919 // GEH: only modify the executing flag in the case when not serialized; 1920 // the serialized case is handled in __kmpc_serialized_parallel 1921 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1922 "curtask=%p, curtask_max_aclevel=%d\n", 1923 parent_team->t.t_active_level, master_th, 1924 master_th->th.th_current_task, 1925 master_th->th.th_current_task->td_icvs.max_active_levels)); 1926 // TODO: GEH - cannot do this assertion because root thread not set up as 1927 // executing 1928 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1929 master_th->th.th_current_task->td_flags.executing = 0; 1930 1931 if (!master_th->th.th_teams_microtask || level > teams_level) { 1932 /* Increment our nested depth level */ 1933 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1934 } 1935 1936 // See if we need to make a copy of the ICVs. 1937 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1938 if ((level + 1 < __kmp_nested_nth.used) && 1939 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1940 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1941 } else { 1942 nthreads_icv = 0; // don't update 1943 } 1944 1945 // Figure out the proc_bind policy for the new team. 1946 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1947 kmp_proc_bind_t proc_bind_icv = 1948 proc_bind_default; // proc_bind_default means don't update 1949 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1950 proc_bind = proc_bind_false; 1951 } else { 1952 if (proc_bind == proc_bind_default) { 1953 // No proc_bind clause specified; use current proc-bind-var for this 1954 // parallel region 1955 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1956 } 1957 /* else: The proc_bind policy was specified explicitly on the parallel 1958 clause. This overrides proc-bind-var for this parallel region, but does 1959 not change proc-bind-var. */ 1960 // Figure out the value of proc-bind-var for the child threads. 
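// For example (illustrative only): with OMP_PROC_BIND="spread,close" the
// nested bind table holds {spread, close}; a top-level parallel region binds
// spread, and the check below selects close as proc-bind-var for the
// implicit tasks of the child threads.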
1961 if ((level + 1 < __kmp_nested_proc_bind.used) && 1962 (__kmp_nested_proc_bind.bind_types[level + 1] != 1963 master_th->th.th_current_task->td_icvs.proc_bind)) { 1964 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1965 } 1966 } 1967 1968 // Reset for next parallel region 1969 master_th->th.th_set_proc_bind = proc_bind_default; 1970 1971 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1972 kmp_internal_control_t new_icvs; 1973 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1974 new_icvs.next = NULL; 1975 if (nthreads_icv > 0) { 1976 new_icvs.nproc = nthreads_icv; 1977 } 1978 if (proc_bind_icv != proc_bind_default) { 1979 new_icvs.proc_bind = proc_bind_icv; 1980 } 1981 1982 /* allocate a new parallel team */ 1983 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1984 team = __kmp_allocate_team(root, nthreads, nthreads, 1985 #if OMPT_SUPPORT 1986 ompt_parallel_data, 1987 #endif 1988 proc_bind, &new_icvs, 1989 argc USE_NESTED_HOT_ARG(master_th)); 1990 } else { 1991 /* allocate a new parallel team */ 1992 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1993 team = __kmp_allocate_team(root, nthreads, nthreads, 1994 #if OMPT_SUPPORT 1995 ompt_parallel_data, 1996 #endif 1997 proc_bind, 1998 &master_th->th.th_current_task->td_icvs, 1999 argc USE_NESTED_HOT_ARG(master_th)); 2000 } 2001 KF_TRACE( 2002 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2003 2004 /* setup the new team */ 2005 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2006 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2007 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2008 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2009 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2010 #if OMPT_SUPPORT 2011 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2012 return_address); 2013 #endif 2014 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2015 // TODO: parent_team->t.t_level == INT_MAX ??? 2016 if (!master_th->th.th_teams_microtask || level > teams_level) { 2017 int new_level = parent_team->t.t_level + 1; 2018 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2019 new_level = parent_team->t.t_active_level + 1; 2020 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2021 } else { 2022 // AC: Do not increase parallel level at start of the teams construct 2023 int new_level = parent_team->t.t_level; 2024 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2025 new_level = parent_team->t.t_active_level; 2026 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2027 } 2028 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2029 // set primary thread's schedule as new run-time schedule 2030 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2031 2032 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2033 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2034 2035 // Update the floating point rounding in the team if required. 2036 propagateFPControl(team); 2037 #if OMPD_SUPPORT 2038 if (ompd_state & OMPD_ENABLE_BP) 2039 ompd_bp_parallel_begin(); 2040 #endif 2041 2042 if (__kmp_tasking_mode != tskm_immediate_exec) { 2043 // Set primary thread's task team to team's task team. Unless this is hot 2044 // team, it should be NULL. 
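// In the tasking block further below, the primary thread's th_task_state is
// pushed onto its th_task_state_memo_stack (the stack is doubled with a
// copy-and-zero-fill when it is full) so that the state can be restored by
// the matching pop in __kmp_join_call.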
2045 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2046 parent_team->t.t_task_team[master_th->th.th_task_state]); 2047 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2048 "%p, new task_team %p / team %p\n", 2049 __kmp_gtid_from_thread(master_th), 2050 master_th->th.th_task_team, parent_team, 2051 team->t.t_task_team[master_th->th.th_task_state], team)); 2052 2053 if (active_level || master_th->th.th_task_team) { 2054 // Take a memo of primary thread's task_state 2055 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2056 if (master_th->th.th_task_state_top >= 2057 master_th->th.th_task_state_stack_sz) { // increase size 2058 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2059 kmp_uint8 *old_stack, *new_stack; 2060 kmp_uint32 i; 2061 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2062 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2063 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2064 } 2065 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2066 ++i) { // zero-init rest of stack 2067 new_stack[i] = 0; 2068 } 2069 old_stack = master_th->th.th_task_state_memo_stack; 2070 master_th->th.th_task_state_memo_stack = new_stack; 2071 master_th->th.th_task_state_stack_sz = new_size; 2072 __kmp_free(old_stack); 2073 } 2074 // Store primary thread's task_state on stack 2075 master_th->th 2076 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2077 master_th->th.th_task_state; 2078 master_th->th.th_task_state_top++; 2079 #if KMP_NESTED_HOT_TEAMS 2080 if (master_th->th.th_hot_teams && 2081 active_level < __kmp_hot_teams_max_level && 2082 team == master_th->th.th_hot_teams[active_level].hot_team) { 2083 // Restore primary thread's nested state if nested hot team 2084 master_th->th.th_task_state = 2085 master_th->th 2086 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2087 } else { 2088 #endif 2089 master_th->th.th_task_state = 0; 2090 #if KMP_NESTED_HOT_TEAMS 2091 } 2092 #endif 2093 } 2094 #if !KMP_NESTED_HOT_TEAMS 2095 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2096 (team == root->r.r_hot_team)); 2097 #endif 2098 } 2099 2100 KA_TRACE( 2101 20, 2102 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2103 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2104 team->t.t_nproc)); 2105 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2106 (team->t.t_master_tid == 0 && 2107 (team->t.t_parent == root->r.r_root_team || 2108 team->t.t_parent->t.t_serialized))); 2109 KMP_MB(); 2110 2111 /* now, setup the arguments */ 2112 argv = (void **)team->t.t_argv; 2113 if (ap) { 2114 for (i = argc - 1; i >= 0; --i) { 2115 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2116 KMP_CHECK_UPDATE(*argv, new_argv); 2117 argv++; 2118 } 2119 } else { 2120 for (i = 0; i < argc; ++i) { 2121 // Get args from parent team for teams construct 2122 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2123 } 2124 } 2125 2126 /* now actually fork the threads */ 2127 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2128 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2129 root->r.r_active = TRUE; 2130 2131 __kmp_fork_team_threads(root, team, master_th, gtid); 2132 __kmp_setup_icv_copy(team, nthreads, 2133 &master_th->th.th_current_task->td_icvs, loc); 2134 2135 #if OMPT_SUPPORT 2136 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2137 #endif 2138 2139 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2140 2141 #if 
USE_ITT_BUILD 2142 if (team->t.t_active_level == 1 // only report frames at level 1 2143 && !master_th->th.th_teams_microtask) { // not in teams construct 2144 #if USE_ITT_NOTIFY 2145 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2146 (__kmp_forkjoin_frames_mode == 3 || 2147 __kmp_forkjoin_frames_mode == 1)) { 2148 kmp_uint64 tmp_time = 0; 2149 if (__itt_get_timestamp_ptr) 2150 tmp_time = __itt_get_timestamp(); 2151 // Internal fork - report frame begin 2152 master_th->th.th_frame_time = tmp_time; 2153 if (__kmp_forkjoin_frames_mode == 3) 2154 team->t.t_region_time = tmp_time; 2155 } else 2156 // only one notification scheme (either "submit" or "forking/joined", not both) 2157 #endif /* USE_ITT_NOTIFY */ 2158 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2159 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2160 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2161 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2162 } 2163 } 2164 #endif /* USE_ITT_BUILD */ 2165 2166 /* now go on and do the work */ 2167 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2168 KMP_MB(); 2169 KF_TRACE(10, 2170 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2171 root, team, master_th, gtid)); 2172 2173 #if USE_ITT_BUILD 2174 if (__itt_stack_caller_create_ptr) { 2175 // create new stack stitching id before entering fork barrier 2176 if (!enter_teams) { 2177 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2178 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2179 } else if (parent_team->t.t_serialized) { 2180 // keep stack stitching id in the serialized parent_team; 2181 // current team will be used for parallel inside the teams; 2182 // if parent_team is active, then it already keeps stack stitching id 2183 // for the league of teams 2184 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2185 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2186 } 2187 } 2188 #endif /* USE_ITT_BUILD */ 2189 2190 // AC: skip __kmp_internal_fork at teams construct, let only primary 2191 // threads execute 2192 if (ap) { 2193 __kmp_internal_fork(loc, gtid, team); 2194 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2195 "master_th=%p, gtid=%d\n", 2196 root, team, master_th, gtid)); 2197 } 2198 2199 if (call_context == fork_context_gnu) { 2200 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2201 return TRUE; 2202 } 2203 2204 /* Invoke microtask for PRIMARY thread */ 2205 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2206 team->t.t_id, team->t.t_pkfn)); 2207 } // END of timer KMP_fork_call block 2208 2209 #if KMP_STATS_ENABLED 2210 // If beginning a teams construct, then change thread state 2211 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2212 if (!ap) { 2213 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2214 } 2215 #endif 2216 2217 if (!team->t.t_invoke(gtid)) { 2218 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2219 } 2220 2221 #if KMP_STATS_ENABLED 2222 // If was beginning of a teams construct, then reset thread state 2223 if (!ap) { 2224 KMP_SET_THREAD_STATE(previous_state); 2225 } 2226 #endif 2227 2228 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2229 team->t.t_id, team->t.t_pkfn)); 2230 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2231 2232 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2233 #if OMPT_SUPPORT 2234 if (ompt_enabled.enabled) { 2235 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2236 } 2237 #endif 2238 2239 return TRUE; 2240 } 2241 2242 #if OMPT_SUPPORT 2243 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2244 kmp_team_t *team) { 2245 // restore state outside the region 2246 thread->th.ompt_thread_info.state = 2247 ((team->t.t_serialized) ? ompt_state_work_serial 2248 : ompt_state_work_parallel); 2249 } 2250 2251 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2252 kmp_team_t *team, ompt_data_t *parallel_data, 2253 int flags, void *codeptr) { 2254 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2255 if (ompt_enabled.ompt_callback_parallel_end) { 2256 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2257 parallel_data, &(task_info->task_data), flags, codeptr); 2258 } 2259 2260 task_info->frame.enter_frame = ompt_data_none; 2261 __kmp_join_restore_state(thread, team); 2262 } 2263 #endif 2264 2265 void __kmp_join_call(ident_t *loc, int gtid 2266 #if OMPT_SUPPORT 2267 , 2268 enum fork_context_e fork_context 2269 #endif 2270 , 2271 int exit_teams) { 2272 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2273 kmp_team_t *team; 2274 kmp_team_t *parent_team; 2275 kmp_info_t *master_th; 2276 kmp_root_t *root; 2277 int master_active; 2278 2279 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2280 2281 /* setup current data */ 2282 master_th = __kmp_threads[gtid]; 2283 root = master_th->th.th_root; 2284 team = master_th->th.th_team; 2285 parent_team = team->t.t_parent; 2286 2287 master_th->th.th_ident = loc; 2288 2289 #if OMPT_SUPPORT 2290 void *team_microtask = (void *)team->t.t_pkfn; 2291 // For GOMP interface with serialized parallel, need the 2292 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2293 // and end-parallel events. 
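// In that serialized GOMP case the thread state is therefore left untouched
// here; __kmpc_end_serialized_parallel performs the OMPT bookkeeping instead.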
2294 if (ompt_enabled.enabled && 2295 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2296 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2297 } 2298 #endif 2299 2300 #if KMP_DEBUG 2301 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2302 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2303 "th_task_team = %p\n", 2304 __kmp_gtid_from_thread(master_th), team, 2305 team->t.t_task_team[master_th->th.th_task_state], 2306 master_th->th.th_task_team)); 2307 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2308 team->t.t_task_team[master_th->th.th_task_state]); 2309 } 2310 #endif 2311 2312 if (team->t.t_serialized) { 2313 if (master_th->th.th_teams_microtask) { 2314 // We are in teams construct 2315 int level = team->t.t_level; 2316 int tlevel = master_th->th.th_teams_level; 2317 if (level == tlevel) { 2318 // AC: we haven't incremented it earlier at start of teams construct, 2319 // so do it here - at the end of teams construct 2320 team->t.t_level++; 2321 } else if (level == tlevel + 1) { 2322 // AC: we are exiting parallel inside teams, need to increment 2323 // serialization in order to restore it in the next call to 2324 // __kmpc_end_serialized_parallel 2325 team->t.t_serialized++; 2326 } 2327 } 2328 __kmpc_end_serialized_parallel(loc, gtid); 2329 2330 #if OMPT_SUPPORT 2331 if (ompt_enabled.enabled) { 2332 __kmp_join_restore_state(master_th, parent_team); 2333 } 2334 #endif 2335 2336 return; 2337 } 2338 2339 master_active = team->t.t_master_active; 2340 2341 if (!exit_teams) { 2342 // AC: No barrier for internal teams at exit from teams construct. 2343 // But there is barrier for external team (league). 2344 __kmp_internal_join(loc, gtid, team); 2345 #if USE_ITT_BUILD 2346 if (__itt_stack_caller_create_ptr) { 2347 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2348 // destroy the stack stitching id after join barrier 2349 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2350 team->t.t_stack_id = NULL; 2351 } 2352 #endif 2353 } else { 2354 master_th->th.th_task_state = 2355 0; // AC: no tasking in teams (out of any parallel) 2356 #if USE_ITT_BUILD 2357 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2358 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2359 // destroy the stack stitching id on exit from the teams construct 2360 // if parent_team is active, then the id will be destroyed later on 2361 // by master of the league of teams 2362 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2363 parent_team->t.t_stack_id = NULL; 2364 } 2365 #endif 2366 } 2367 2368 KMP_MB(); 2369 2370 #if OMPT_SUPPORT 2371 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2372 void *codeptr = team->t.ompt_team_info.master_return_address; 2373 #endif 2374 2375 #if USE_ITT_BUILD 2376 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
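// Mirror of the fork-side reporting: frame mode 3 submits a frame built from
// t_region_time and th_frame_time via __kmp_itt_frame_submit, whereas mode 0
// (plain fork/join frames) only emits a region_joined notification.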
2377 if (team->t.t_active_level == 1 && 2378 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2379 master_th->th.th_teams_size.nteams == 1)) { 2380 master_th->th.th_ident = loc; 2381 // only one notification scheme (either "submit" or "forking/joined", not 2382 // both) 2383 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2384 __kmp_forkjoin_frames_mode == 3) 2385 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2386 master_th->th.th_frame_time, 0, loc, 2387 master_th->th.th_team_nproc, 1); 2388 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2389 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2390 __kmp_itt_region_joined(gtid); 2391 } // active_level == 1 2392 #endif /* USE_ITT_BUILD */ 2393 2394 if (master_th->th.th_teams_microtask && !exit_teams && 2395 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2396 team->t.t_level == master_th->th.th_teams_level + 1) { 2397 // AC: We need to leave the team structure intact at the end of parallel 2398 // inside the teams construct, so that at the next parallel same (hot) team 2399 // works, only adjust nesting levels 2400 #if OMPT_SUPPORT 2401 ompt_data_t ompt_parallel_data = ompt_data_none; 2402 if (ompt_enabled.enabled) { 2403 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2404 if (ompt_enabled.ompt_callback_implicit_task) { 2405 int ompt_team_size = team->t.t_nproc; 2406 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2407 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2408 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2409 } 2410 task_info->frame.exit_frame = ompt_data_none; 2411 task_info->task_data = ompt_data_none; 2412 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2413 __ompt_lw_taskteam_unlink(master_th); 2414 } 2415 #endif 2416 /* Decrement our nested depth level */ 2417 team->t.t_level--; 2418 team->t.t_active_level--; 2419 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2420 2421 // Restore number of threads in the team if needed. This code relies on 2422 // the proper adjustment of th_teams_size.nth after the fork in 2423 // __kmp_teams_master on each teams primary thread in the case that 2424 // __kmp_reserve_threads reduced it. 2425 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2426 int old_num = master_th->th.th_team_nproc; 2427 int new_num = master_th->th.th_teams_size.nth; 2428 kmp_info_t **other_threads = team->t.t_threads; 2429 team->t.t_nproc = new_num; 2430 for (int i = 0; i < old_num; ++i) { 2431 other_threads[i]->th.th_team_nproc = new_num; 2432 } 2433 // Adjust states of non-used threads of the team 2434 for (int i = old_num; i < new_num; ++i) { 2435 // Re-initialize thread's barrier data. 
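// Threads that sat idle while the inner parallel ran with fewer workers are
// re-armed here: their per-barrier b_arrived counters are re-seeded from the
// team's current barrier state so they do not trip the next barrier, and
// their task state is synchronized with the primary thread when tasking is
// enabled.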
2436 KMP_DEBUG_ASSERT(other_threads[i]); 2437 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2438 for (int b = 0; b < bs_last_barrier; ++b) { 2439 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2440 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2441 #if USE_DEBUGGER 2442 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2443 #endif 2444 } 2445 if (__kmp_tasking_mode != tskm_immediate_exec) { 2446 // Synchronize thread's task state 2447 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2448 } 2449 } 2450 } 2451 2452 #if OMPT_SUPPORT 2453 if (ompt_enabled.enabled) { 2454 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2455 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2456 } 2457 #endif 2458 2459 return; 2460 } 2461 2462 /* do cleanup and restore the parent team */ 2463 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2464 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2465 2466 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2467 2468 /* jc: The following lock has instructions with REL and ACQ semantics, 2469 separating the parallel user code called in this parallel region 2470 from the serial user code called after this function returns. */ 2471 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2472 2473 if (!master_th->th.th_teams_microtask || 2474 team->t.t_level > master_th->th.th_teams_level) { 2475 /* Decrement our nested depth level */ 2476 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2477 } 2478 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2479 2480 #if OMPT_SUPPORT 2481 if (ompt_enabled.enabled) { 2482 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2483 if (ompt_enabled.ompt_callback_implicit_task) { 2484 int flags = (team_microtask == (void *)__kmp_teams_master) 2485 ? ompt_task_initial 2486 : ompt_task_implicit; 2487 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2488 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2489 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2490 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2491 } 2492 task_info->frame.exit_frame = ompt_data_none; 2493 task_info->task_data = ompt_data_none; 2494 } 2495 #endif 2496 2497 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2498 master_th, team)); 2499 __kmp_pop_current_task_from_thread(master_th); 2500 2501 #if KMP_AFFINITY_SUPPORTED 2502 // Restore master thread's partition. 2503 master_th->th.th_first_place = team->t.t_first_place; 2504 master_th->th.th_last_place = team->t.t_last_place; 2505 #endif // KMP_AFFINITY_SUPPORTED 2506 master_th->th.th_def_allocator = team->t.t_def_allocator; 2507 2508 #if OMPD_SUPPORT 2509 if (ompd_state & OMPD_ENABLE_BP) 2510 ompd_bp_parallel_end(); 2511 #endif 2512 updateHWFPControl(team); 2513 2514 if (root->r.r_active != master_active) 2515 root->r.r_active = master_active; 2516 2517 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2518 master_th)); // this will free worker threads 2519 2520 /* this race was fun to find. make sure the following is in the critical 2521 region otherwise assertions may fail occasionally since the old team may be 2522 reallocated and the hierarchy appears inconsistent. it is actually safe to 2523 run and won't cause any bugs, but will cause those assertion failures. 
it's 2524 only one deref&assign so might as well put this in the critical region */ 2525 master_th->th.th_team = parent_team; 2526 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2527 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2528 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2529 2530 /* restore serialized team, if need be */ 2531 if (parent_team->t.t_serialized && 2532 parent_team != master_th->th.th_serial_team && 2533 parent_team != root->r.r_root_team) { 2534 __kmp_free_team(root, 2535 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2536 master_th->th.th_serial_team = parent_team; 2537 } 2538 2539 if (__kmp_tasking_mode != tskm_immediate_exec) { 2540 if (master_th->th.th_task_state_top > 2541 0) { // Restore task state from memo stack 2542 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2543 // Remember primary thread's state if we re-use this nested hot team 2544 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2545 master_th->th.th_task_state; 2546 --master_th->th.th_task_state_top; // pop 2547 // Now restore state at this level 2548 master_th->th.th_task_state = 2549 master_th->th 2550 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2551 } 2552 // Copy the task team from the parent team to the primary thread 2553 master_th->th.th_task_team = 2554 parent_team->t.t_task_team[master_th->th.th_task_state]; 2555 KA_TRACE(20, 2556 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2557 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2558 parent_team)); 2559 } 2560 2561 // TODO: GEH - cannot do this assertion because root thread not set up as 2562 // executing 2563 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2564 master_th->th.th_current_task->td_flags.executing = 1; 2565 2566 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2567 2568 #if OMPT_SUPPORT 2569 int flags = 2570 OMPT_INVOKER(fork_context) | 2571 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2572 : ompt_parallel_team); 2573 if (ompt_enabled.enabled) { 2574 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2575 codeptr); 2576 } 2577 #endif 2578 2579 KMP_MB(); 2580 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2581 } 2582 2583 /* Check whether we should push an internal control record onto the 2584 serial team stack. If so, do it. 
*/ 2585 void __kmp_save_internal_controls(kmp_info_t *thread) { 2586 2587 if (thread->th.th_team != thread->th.th_serial_team) { 2588 return; 2589 } 2590 if (thread->th.th_team->t.t_serialized > 1) { 2591 int push = 0; 2592 2593 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2594 push = 1; 2595 } else { 2596 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2597 thread->th.th_team->t.t_serialized) { 2598 push = 1; 2599 } 2600 } 2601 if (push) { /* push a record on the serial team's stack */ 2602 kmp_internal_control_t *control = 2603 (kmp_internal_control_t *)__kmp_allocate( 2604 sizeof(kmp_internal_control_t)); 2605 2606 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2607 2608 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2609 2610 control->next = thread->th.th_team->t.t_control_stack_top; 2611 thread->th.th_team->t.t_control_stack_top = control; 2612 } 2613 } 2614 } 2615 2616 /* Changes set_nproc */ 2617 void __kmp_set_num_threads(int new_nth, int gtid) { 2618 kmp_info_t *thread; 2619 kmp_root_t *root; 2620 2621 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2622 KMP_DEBUG_ASSERT(__kmp_init_serial); 2623 2624 if (new_nth < 1) 2625 new_nth = 1; 2626 else if (new_nth > __kmp_max_nth) 2627 new_nth = __kmp_max_nth; 2628 2629 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2630 thread = __kmp_threads[gtid]; 2631 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2632 return; // nothing to do 2633 2634 __kmp_save_internal_controls(thread); 2635 2636 set__nproc(thread, new_nth); 2637 2638 // If this omp_set_num_threads() call will cause the hot team size to be 2639 // reduced (in the absence of a num_threads clause), then reduce it now, 2640 // rather than waiting for the next parallel region. 2641 root = thread->th.th_root; 2642 if (__kmp_init_parallel && (!root->r.r_active) && 2643 (root->r.r_hot_team->t.t_nproc > new_nth) 2644 #if KMP_NESTED_HOT_TEAMS 2645 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2646 #endif 2647 ) { 2648 kmp_team_t *hot_team = root->r.r_hot_team; 2649 int f; 2650 2651 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2652 2653 // Release the extra threads we don't need any more. 2654 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2655 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2656 if (__kmp_tasking_mode != tskm_immediate_exec) { 2657 // When decreasing team size, threads no longer in the team should unref 2658 // task team. 2659 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2660 } 2661 __kmp_free_thread(hot_team->t.t_threads[f]); 2662 hot_team->t.t_threads[f] = NULL; 2663 } 2664 hot_team->t.t_nproc = new_nth; 2665 #if KMP_NESTED_HOT_TEAMS 2666 if (thread->th.th_hot_teams) { 2667 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2668 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2669 } 2670 #endif 2671 2672 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2673 2674 // Update the t_nproc field in the threads that are still active. 
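// Illustrative sketch of when this shrink path triggers (user code, assuming
// no num_threads clause and an inactive root):
//   #pragma omp parallel        // hot team grew to, say, 8 threads
//   { /* ... */ }
//   omp_set_num_threads(2);     // extra hot-team threads are released right
//                               // here rather than at the next fork
// after which the loop below updates the surviving threads.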
2675 for (f = 0; f < new_nth; f++) { 2676 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2677 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2678 } 2679 // Special flag to mark that the resize came from an omp_set_num_threads() call 2680 hot_team->t.t_size_changed = -1; 2681 } 2682 } 2683 2684 /* Changes max_active_levels */ 2685 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2686 kmp_info_t *thread; 2687 2688 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2689 "%d = (%d)\n", 2690 gtid, max_active_levels)); 2691 KMP_DEBUG_ASSERT(__kmp_init_serial); 2692 2693 // validate max_active_levels 2694 if (max_active_levels < 0) { 2695 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2696 // We ignore this call if the user has specified a negative value. 2697 // The current setting won't be changed. The last valid setting will be 2698 // used. A warning will be issued (if warnings are allowed as controlled by 2699 // the KMP_WARNINGS env var). 2700 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2701 "max_active_levels for thread %d = (%d)\n", 2702 gtid, max_active_levels)); 2703 return; 2704 } 2705 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2706 // it's OK: max_active_levels is within the valid range [ 0; 2707 // KMP_MAX_ACTIVE_LEVELS_LIMIT ]. 2708 // We allow a zero value. (implementation defined behavior) 2709 } else { 2710 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2711 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2712 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2713 // Current upper limit is MAX_INT. (implementation defined behavior) 2714 // If the input exceeds the upper limit, we correct the input to be the 2715 // upper limit. (implementation defined behavior) 2716 // In practice the flow should never reach here while the upper limit is MAX_INT. 
2717 } 2718 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2719 "max_active_levels for thread %d = (%d)\n", 2720 gtid, max_active_levels)); 2721 2722 thread = __kmp_threads[gtid]; 2723 2724 __kmp_save_internal_controls(thread); 2725 2726 set__max_active_levels(thread, max_active_levels); 2727 } 2728 2729 /* Gets max_active_levels */ 2730 int __kmp_get_max_active_levels(int gtid) { 2731 kmp_info_t *thread; 2732 2733 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2734 KMP_DEBUG_ASSERT(__kmp_init_serial); 2735 2736 thread = __kmp_threads[gtid]; 2737 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2738 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2739 "curtask_maxaclevel=%d\n", 2740 gtid, thread->th.th_current_task, 2741 thread->th.th_current_task->td_icvs.max_active_levels)); 2742 return thread->th.th_current_task->td_icvs.max_active_levels; 2743 } 2744 2745 // nteams-var per-device ICV 2746 void __kmp_set_num_teams(int num_teams) { 2747 if (num_teams > 0) 2748 __kmp_nteams = num_teams; 2749 } 2750 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2751 // teams-thread-limit-var per-device ICV 2752 void __kmp_set_teams_thread_limit(int limit) { 2753 if (limit > 0) 2754 __kmp_teams_thread_limit = limit; 2755 } 2756 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2757 2758 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2759 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2760 2761 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2762 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2763 kmp_info_t *thread; 2764 kmp_sched_t orig_kind; 2765 // kmp_team_t *team; 2766 2767 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2768 gtid, (int)kind, chunk)); 2769 KMP_DEBUG_ASSERT(__kmp_init_serial); 2770 2771 // Check if the kind parameter is valid, correct if needed. 2772 // Valid parameters should fit in one of two intervals - standard or extended: 2773 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2774 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2775 orig_kind = kind; 2776 kind = __kmp_sched_without_mods(kind); 2777 2778 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2779 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2780 // TODO: Hint needs attention in case we change the default schedule. 2781 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2782 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2783 __kmp_msg_null); 2784 kind = kmp_sched_default; 2785 chunk = 0; // ignore chunk value in case of bad kind 2786 } 2787 2788 thread = __kmp_threads[gtid]; 2789 2790 __kmp_save_internal_controls(thread); 2791 2792 if (kind < kmp_sched_upper_std) { 2793 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2794 // differ static chunked vs. 
unchunked: chunk should be invalid to 2795 // indicate unchunked schedule (which is the default) 2796 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2797 } else { 2798 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2799 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2800 } 2801 } else { 2802 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2803 // kmp_sched_lower - 2 ]; 2804 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2805 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2806 kmp_sched_lower - 2]; 2807 } 2808 __kmp_sched_apply_mods_intkind( 2809 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2810 if (kind == kmp_sched_auto || chunk < 1) { 2811 // ignore parameter chunk for schedule auto 2812 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2813 } else { 2814 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2815 } 2816 } 2817 2818 /* Gets def_sched_var ICV values */ 2819 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2820 kmp_info_t *thread; 2821 enum sched_type th_type; 2822 2823 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2824 KMP_DEBUG_ASSERT(__kmp_init_serial); 2825 2826 thread = __kmp_threads[gtid]; 2827 2828 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2829 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2830 case kmp_sch_static: 2831 case kmp_sch_static_greedy: 2832 case kmp_sch_static_balanced: 2833 *kind = kmp_sched_static; 2834 __kmp_sched_apply_mods_stdkind(kind, th_type); 2835 *chunk = 0; // chunk was not set, try to show this fact via zero value 2836 return; 2837 case kmp_sch_static_chunked: 2838 *kind = kmp_sched_static; 2839 break; 2840 case kmp_sch_dynamic_chunked: 2841 *kind = kmp_sched_dynamic; 2842 break; 2843 case kmp_sch_guided_chunked: 2844 case kmp_sch_guided_iterative_chunked: 2845 case kmp_sch_guided_analytical_chunked: 2846 *kind = kmp_sched_guided; 2847 break; 2848 case kmp_sch_auto: 2849 *kind = kmp_sched_auto; 2850 break; 2851 case kmp_sch_trapezoidal: 2852 *kind = kmp_sched_trapezoidal; 2853 break; 2854 #if KMP_STATIC_STEAL_ENABLED 2855 case kmp_sch_static_steal: 2856 *kind = kmp_sched_static_steal; 2857 break; 2858 #endif 2859 default: 2860 KMP_FATAL(UnknownSchedulingType, th_type); 2861 } 2862 2863 __kmp_sched_apply_mods_stdkind(kind, th_type); 2864 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2865 } 2866 2867 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2868 2869 int ii, dd; 2870 kmp_team_t *team; 2871 kmp_info_t *thr; 2872 2873 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2874 KMP_DEBUG_ASSERT(__kmp_init_serial); 2875 2876 // validate level 2877 if (level == 0) 2878 return 0; 2879 if (level < 0) 2880 return -1; 2881 thr = __kmp_threads[gtid]; 2882 team = thr->th.th_team; 2883 ii = team->t.t_level; 2884 if (level > ii) 2885 return -1; 2886 2887 if (thr->th.th_teams_microtask) { 2888 // AC: we are in teams region where multiple nested teams have same level 2889 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2890 if (level <= 2891 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2892 KMP_DEBUG_ASSERT(ii >= tlevel); 2893 // AC: As we need to pass by the teams league, we need to artificially 2894 // increase ii 2895 if (ii == tlevel) { 2896 ii += 2; // three teams have same level 2897 } else { 2898 ii++; // two teams have same level 2899 } 2900 } 2901 } 2902 2903 if (ii == 
level) 2904 return __kmp_tid_from_gtid(gtid); 2905 2906 dd = team->t.t_serialized; 2907 level++; 2908 while (ii > level) { 2909 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2910 } 2911 if ((team->t.t_serialized) && (!dd)) { 2912 team = team->t.t_parent; 2913 continue; 2914 } 2915 if (ii > level) { 2916 team = team->t.t_parent; 2917 dd = team->t.t_serialized; 2918 ii--; 2919 } 2920 } 2921 2922 return (dd > 1) ? (0) : (team->t.t_master_tid); 2923 } 2924 2925 int __kmp_get_team_size(int gtid, int level) { 2926 2927 int ii, dd; 2928 kmp_team_t *team; 2929 kmp_info_t *thr; 2930 2931 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2932 KMP_DEBUG_ASSERT(__kmp_init_serial); 2933 2934 // validate level 2935 if (level == 0) 2936 return 1; 2937 if (level < 0) 2938 return -1; 2939 thr = __kmp_threads[gtid]; 2940 team = thr->th.th_team; 2941 ii = team->t.t_level; 2942 if (level > ii) 2943 return -1; 2944 2945 if (thr->th.th_teams_microtask) { 2946 // AC: we are in teams region where multiple nested teams have same level 2947 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2948 if (level <= 2949 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2950 KMP_DEBUG_ASSERT(ii >= tlevel); 2951 // AC: As we need to pass by the teams league, we need to artificially 2952 // increase ii 2953 if (ii == tlevel) { 2954 ii += 2; // three teams have same level 2955 } else { 2956 ii++; // two teams have same level 2957 } 2958 } 2959 } 2960 2961 while (ii > level) { 2962 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2963 } 2964 if (team->t.t_serialized && (!dd)) { 2965 team = team->t.t_parent; 2966 continue; 2967 } 2968 if (ii > level) { 2969 team = team->t.t_parent; 2970 ii--; 2971 } 2972 } 2973 2974 return team->t.t_nproc; 2975 } 2976 2977 kmp_r_sched_t __kmp_get_schedule_global() { 2978 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2979 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2980 // independently. So one can get the updated schedule here. 2981 2982 kmp_r_sched_t r_sched; 2983 2984 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2985 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2986 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2987 // different roots (even in OMP 2.5) 2988 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2989 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2990 if (s == kmp_sch_static) { 2991 // replace STATIC with more detailed schedule (balanced or greedy) 2992 r_sched.r_sched_type = __kmp_static; 2993 } else if (s == kmp_sch_guided_chunked) { 2994 // replace GUIDED with more detailed schedule (iterative or analytical) 2995 r_sched.r_sched_type = __kmp_guided; 2996 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2997 r_sched.r_sched_type = __kmp_sched; 2998 } 2999 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3000 3001 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3002 // __kmp_chunk may be wrong here (if it was not ever set) 3003 r_sched.chunk = KMP_DEFAULT_CHUNK; 3004 } else { 3005 r_sched.chunk = __kmp_chunk; 3006 } 3007 3008 return r_sched; 3009 } 3010 3011 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3012 at least argc number of *t_argv entries for the requested team. 
*/ 3013 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3014 3015 KMP_DEBUG_ASSERT(team); 3016 if (!realloc || argc > team->t.t_max_argc) { 3017 3018 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3019 "current entries=%d\n", 3020 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3021 /* if previously allocated heap space for args, free them */ 3022 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3023 __kmp_free((void *)team->t.t_argv); 3024 3025 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3026 /* use unused space in the cache line for arguments */ 3027 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3028 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3029 "argv entries\n", 3030 team->t.t_id, team->t.t_max_argc)); 3031 team->t.t_argv = &team->t.t_inline_argv[0]; 3032 if (__kmp_storage_map) { 3033 __kmp_print_storage_map_gtid( 3034 -1, &team->t.t_inline_argv[0], 3035 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3036 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3037 team->t.t_id); 3038 } 3039 } else { 3040 /* allocate space for arguments in the heap */ 3041 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3042 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3043 : 2 * argc; 3044 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3045 "argv entries\n", 3046 team->t.t_id, team->t.t_max_argc)); 3047 team->t.t_argv = 3048 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3049 if (__kmp_storage_map) { 3050 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3051 &team->t.t_argv[team->t.t_max_argc], 3052 sizeof(void *) * team->t.t_max_argc, 3053 "team_%d.t_argv", team->t.t_id); 3054 } 3055 } 3056 } 3057 } 3058 3059 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3060 int i; 3061 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3062 team->t.t_threads = 3063 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3064 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3065 sizeof(dispatch_shared_info_t) * num_disp_buff); 3066 team->t.t_dispatch = 3067 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3068 team->t.t_implicit_task_taskdata = 3069 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3070 team->t.t_max_nproc = max_nth; 3071 3072 /* setup dispatch buffers */ 3073 for (i = 0; i < num_disp_buff; ++i) { 3074 team->t.t_disp_buffer[i].buffer_index = i; 3075 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3076 } 3077 } 3078 3079 static void __kmp_free_team_arrays(kmp_team_t *team) { 3080 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3081 int i; 3082 for (i = 0; i < team->t.t_max_nproc; ++i) { 3083 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3084 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3085 team->t.t_dispatch[i].th_disp_buffer = NULL; 3086 } 3087 } 3088 #if KMP_USE_HIER_SCHED 3089 __kmp_dispatch_free_hierarchies(team); 3090 #endif 3091 __kmp_free(team->t.t_threads); 3092 __kmp_free(team->t.t_disp_buffer); 3093 __kmp_free(team->t.t_dispatch); 3094 __kmp_free(team->t.t_implicit_task_taskdata); 3095 team->t.t_threads = NULL; 3096 team->t.t_disp_buffer = NULL; 3097 team->t.t_dispatch = NULL; 3098 team->t.t_implicit_task_taskdata = 0; 3099 } 3100 3101 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3102 kmp_info_t **oldThreads = team->t.t_threads; 3103 3104 __kmp_free(team->t.t_disp_buffer); 3105 __kmp_free(team->t.t_dispatch); 3106 __kmp_free(team->t.t_implicit_task_taskdata); 3107 __kmp_allocate_team_arrays(team, max_nth); 3108 3109 KMP_MEMCPY(team->t.t_threads, oldThreads, 3110 team->t.t_nproc * sizeof(kmp_info_t *)); 3111 3112 __kmp_free(oldThreads); 3113 } 3114 3115 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3116 3117 kmp_r_sched_t r_sched = 3118 __kmp_get_schedule_global(); // get current state of scheduling globals 3119 3120 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3121 3122 kmp_internal_control_t g_icvs = { 3123 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3124 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3125 // adjustment of threads (per thread) 3126 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3127 // whether blocktime is explicitly set 3128 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3129 #if KMP_USE_MONITOR 3130 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3131 // intervals 3132 #endif 3133 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3134 // next parallel region (per thread) 3135 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3136 __kmp_cg_max_nth, // int thread_limit; 3137 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3138 // for max_active_levels 3139 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3140 // {sched,chunk} pair 3141 __kmp_nested_proc_bind.bind_types[0], 3142 __kmp_default_device, 3143 NULL // struct kmp_internal_control *next; 3144 }; 3145 3146 return g_icvs; 3147 } 3148 3149 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3150 3151 kmp_internal_control_t gx_icvs; 3152 gx_icvs.serial_nesting_level = 3153 0; // probably =team->t.t_serial 
like in save_inter_controls 3154 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3155 gx_icvs.next = NULL; 3156 3157 return gx_icvs; 3158 } 3159 3160 static void __kmp_initialize_root(kmp_root_t *root) { 3161 int f; 3162 kmp_team_t *root_team; 3163 kmp_team_t *hot_team; 3164 int hot_team_max_nth; 3165 kmp_r_sched_t r_sched = 3166 __kmp_get_schedule_global(); // get current state of scheduling globals 3167 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3168 KMP_DEBUG_ASSERT(root); 3169 KMP_ASSERT(!root->r.r_begin); 3170 3171 /* setup the root state structure */ 3172 __kmp_init_lock(&root->r.r_begin_lock); 3173 root->r.r_begin = FALSE; 3174 root->r.r_active = FALSE; 3175 root->r.r_in_parallel = 0; 3176 root->r.r_blocktime = __kmp_dflt_blocktime; 3177 #if KMP_AFFINITY_SUPPORTED 3178 root->r.r_affinity_assigned = FALSE; 3179 #endif 3180 3181 /* setup the root team for this task */ 3182 /* allocate the root team structure */ 3183 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3184 3185 root_team = 3186 __kmp_allocate_team(root, 3187 1, // new_nproc 3188 1, // max_nproc 3189 #if OMPT_SUPPORT 3190 ompt_data_none, // root parallel id 3191 #endif 3192 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3193 0 // argc 3194 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3195 ); 3196 #if USE_DEBUGGER 3197 // Non-NULL value should be assigned to make the debugger display the root 3198 // team. 3199 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3200 #endif 3201 3202 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3203 3204 root->r.r_root_team = root_team; 3205 root_team->t.t_control_stack_top = NULL; 3206 3207 /* initialize root team */ 3208 root_team->t.t_threads[0] = NULL; 3209 root_team->t.t_nproc = 1; 3210 root_team->t.t_serialized = 1; 3211 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3212 root_team->t.t_sched.sched = r_sched.sched; 3213 KA_TRACE( 3214 20, 3215 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3216 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3217 3218 /* setup the hot team for this task */ 3219 /* allocate the hot team structure */ 3220 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3221 3222 hot_team = 3223 __kmp_allocate_team(root, 3224 1, // new_nproc 3225 __kmp_dflt_team_nth_ub * 2, // max_nproc 3226 #if OMPT_SUPPORT 3227 ompt_data_none, // root parallel id 3228 #endif 3229 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3230 0 // argc 3231 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3232 ); 3233 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3234 3235 root->r.r_hot_team = hot_team; 3236 root_team->t.t_control_stack_top = NULL; 3237 3238 /* first-time initialization */ 3239 hot_team->t.t_parent = root_team; 3240 3241 /* initialize hot team */ 3242 hot_team_max_nth = hot_team->t.t_max_nproc; 3243 for (f = 0; f < hot_team_max_nth; ++f) { 3244 hot_team->t.t_threads[f] = NULL; 3245 } 3246 hot_team->t.t_nproc = 1; 3247 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3248 hot_team->t.t_sched.sched = r_sched.sched; 3249 hot_team->t.t_size_changed = 0; 3250 } 3251 3252 #ifdef KMP_DEBUG 3253 3254 typedef struct kmp_team_list_item { 3255 kmp_team_p const *entry; 3256 struct kmp_team_list_item *next; 3257 } kmp_team_list_item_t; 3258 typedef kmp_team_list_item_t *kmp_team_list_t; 3259 3260 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3261 kmp_team_list_t list, // List of teams. 3262 kmp_team_p const *team // Team to add. 3263 ) { 3264 3265 // List must terminate with item where both entry and next are NULL. 3266 // Team is added to the list only once. 3267 // List is sorted in ascending order by team id. 3268 // Team id is *not* a key. 3269 3270 kmp_team_list_t l; 3271 3272 KMP_DEBUG_ASSERT(list != NULL); 3273 if (team == NULL) { 3274 return; 3275 } 3276 3277 __kmp_print_structure_team_accum(list, team->t.t_parent); 3278 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3279 3280 // Search list for the team. 3281 l = list; 3282 while (l->next != NULL && l->entry != team) { 3283 l = l->next; 3284 } 3285 if (l->next != NULL) { 3286 return; // Team has been added before, exit. 3287 } 3288 3289 // Team is not found. Search list again for insertion point. 3290 l = list; 3291 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3292 l = l->next; 3293 } 3294 3295 // Insert team. 3296 { 3297 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3298 sizeof(kmp_team_list_item_t)); 3299 *item = *l; 3300 l->entry = team; 3301 l->next = item; 3302 } 3303 } 3304 3305 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3306 3307 ) { 3308 __kmp_printf("%s", title); 3309 if (team != NULL) { 3310 __kmp_printf("%2x %p\n", team->t.t_id, team); 3311 } else { 3312 __kmp_printf(" - (nil)\n"); 3313 } 3314 } 3315 3316 static void __kmp_print_structure_thread(char const *title, 3317 kmp_info_p const *thread) { 3318 __kmp_printf("%s", title); 3319 if (thread != NULL) { 3320 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3321 } else { 3322 __kmp_printf(" - (nil)\n"); 3323 } 3324 } 3325 3326 void __kmp_print_structure(void) { 3327 3328 kmp_team_list_t list; 3329 3330 // Initialize list of teams. 3331 list = 3332 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3333 list->entry = NULL; 3334 list->next = NULL; 3335 3336 __kmp_printf("\n------------------------------\nGlobal Thread " 3337 "Table\n------------------------------\n"); 3338 { 3339 int gtid; 3340 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3341 __kmp_printf("%2d", gtid); 3342 if (__kmp_threads != NULL) { 3343 __kmp_printf(" %p", __kmp_threads[gtid]); 3344 } 3345 if (__kmp_root != NULL) { 3346 __kmp_printf(" %p", __kmp_root[gtid]); 3347 } 3348 __kmp_printf("\n"); 3349 } 3350 } 3351 3352 // Print out __kmp_threads array. 
3353 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3354 "----------\n"); 3355 if (__kmp_threads != NULL) { 3356 int gtid; 3357 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3358 kmp_info_t const *thread = __kmp_threads[gtid]; 3359 if (thread != NULL) { 3360 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3361 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3362 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3363 __kmp_print_structure_team(" Serial Team: ", 3364 thread->th.th_serial_team); 3365 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3366 __kmp_print_structure_thread(" Primary: ", 3367 thread->th.th_team_master); 3368 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3369 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3370 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3371 __kmp_print_structure_thread(" Next in pool: ", 3372 thread->th.th_next_pool); 3373 __kmp_printf("\n"); 3374 __kmp_print_structure_team_accum(list, thread->th.th_team); 3375 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3376 } 3377 } 3378 } else { 3379 __kmp_printf("Threads array is not allocated.\n"); 3380 } 3381 3382 // Print out __kmp_root array. 3383 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3384 "--------\n"); 3385 if (__kmp_root != NULL) { 3386 int gtid; 3387 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3388 kmp_root_t const *root = __kmp_root[gtid]; 3389 if (root != NULL) { 3390 __kmp_printf("GTID %2d %p:\n", gtid, root); 3391 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3392 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3393 __kmp_print_structure_thread(" Uber Thread: ", 3394 root->r.r_uber_thread); 3395 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3396 __kmp_printf(" In Parallel: %2d\n", 3397 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3398 __kmp_printf("\n"); 3399 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3400 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3401 } 3402 } 3403 } else { 3404 __kmp_printf("Ubers array is not allocated.\n"); 3405 } 3406 3407 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3408 "--------\n"); 3409 while (list->next != NULL) { 3410 kmp_team_p const *team = list->entry; 3411 int i; 3412 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3413 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3414 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3415 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3416 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3417 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3418 for (i = 0; i < team->t.t_nproc; ++i) { 3419 __kmp_printf(" Thread %2d: ", i); 3420 __kmp_print_structure_thread("", team->t.t_threads[i]); 3421 } 3422 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3423 __kmp_printf("\n"); 3424 list = list->next; 3425 } 3426 3427 // Print out __kmp_thread_pool and __kmp_team_pool. 3428 __kmp_printf("\n------------------------------\nPools\n----------------------" 3429 "--------\n"); 3430 __kmp_print_structure_thread("Thread pool: ", 3431 CCAST(kmp_info_t *, __kmp_thread_pool)); 3432 __kmp_print_structure_team("Team pool: ", 3433 CCAST(kmp_team_t *, __kmp_team_pool)); 3434 __kmp_printf("\n"); 3435 3436 // Free team list. 
3437 while (list != NULL) { 3438 kmp_team_list_item_t *item = list; 3439 list = list->next; 3440 KMP_INTERNAL_FREE(item); 3441 } 3442 } 3443 3444 #endif 3445 3446 //--------------------------------------------------------------------------- 3447 // Stuff for per-thread fast random number generator 3448 // Table of primes 3449 static const unsigned __kmp_primes[] = { 3450 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3451 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3452 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3453 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3454 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3455 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3456 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3457 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3458 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3459 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3460 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3461 3462 //--------------------------------------------------------------------------- 3463 // __kmp_get_random: Get a random number using a linear congruential method. 3464 unsigned short __kmp_get_random(kmp_info_t *thread) { 3465 unsigned x = thread->th.th_x; 3466 unsigned short r = (unsigned short)(x >> 16); 3467 3468 thread->th.th_x = x * thread->th.th_a + 1; 3469 3470 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3471 thread->th.th_info.ds.ds_tid, r)); 3472 3473 return r; 3474 } 3475 //-------------------------------------------------------- 3476 // __kmp_init_random: Initialize a random number generator 3477 void __kmp_init_random(kmp_info_t *thread) { 3478 unsigned seed = thread->th.th_info.ds.ds_tid; 3479 3480 thread->th.th_a = 3481 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3482 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3483 KA_TRACE(30, 3484 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3485 } 3486 3487 #if KMP_OS_WINDOWS 3488 /* reclaim array entries for root threads that are already dead, returns number 3489 * reclaimed */ 3490 static int __kmp_reclaim_dead_roots(void) { 3491 int i, r = 0; 3492 3493 for (i = 0; i < __kmp_threads_capacity; ++i) { 3494 if (KMP_UBER_GTID(i) && 3495 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3496 !__kmp_root[i] 3497 ->r.r_active) { // AC: reclaim only roots died in non-active state 3498 r += __kmp_unregister_root_other_thread(i); 3499 } 3500 } 3501 return r; 3502 } 3503 #endif 3504 3505 /* This function attempts to create free entries in __kmp_threads and 3506 __kmp_root, and returns the number of free entries generated. 3507 3508 For Windows* OS static library, the first mechanism used is to reclaim array 3509 entries for root threads that are already dead. 3510 3511 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3512 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3513 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3514 threadprivate cache array has been created. Synchronization with 3515 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
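   Illustration (not normative): with __kmp_threads_capacity == 64,
   __kmp_sys_max_nth == 1024 and nNeed == 70, minimumRequiredCapacity is 134,
   so the doubling loop below grows the capacity 64 -> 128 -> 256.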
3516 3517 After any dead root reclamation, if the clipping value allows array expansion 3518 to result in the generation of a total of nNeed free slots, the function does 3519 that expansion. If not, nothing is done beyond the possible initial root 3520 thread reclamation. 3521 3522 If any argument is negative, the behavior is undefined. */ 3523 static int __kmp_expand_threads(int nNeed) { 3524 int added = 0; 3525 int minimumRequiredCapacity; 3526 int newCapacity; 3527 kmp_info_t **newThreads; 3528 kmp_root_t **newRoot; 3529 3530 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3531 // resizing __kmp_threads does not need additional protection if foreign 3532 // threads are present 3533 3534 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3535 /* only for Windows static library */ 3536 /* reclaim array entries for root threads that are already dead */ 3537 added = __kmp_reclaim_dead_roots(); 3538 3539 if (nNeed) { 3540 nNeed -= added; 3541 if (nNeed < 0) 3542 nNeed = 0; 3543 } 3544 #endif 3545 if (nNeed <= 0) 3546 return added; 3547 3548 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3549 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3550 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3551 // > __kmp_max_nth in one of two ways: 3552 // 3553 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3554 // may not be reused by another thread, so we may need to increase 3555 // __kmp_threads_capacity to __kmp_max_nth + 1. 3556 // 3557 // 2) New foreign root(s) are encountered. We always register new foreign 3558 // roots. This may cause a smaller # of threads to be allocated at 3559 // subsequent parallel regions, but the worker threads hang around (and 3560 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3561 // 3562 // Anyway, that is the reason for moving the check to see if 3563 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3564 // instead of having it performed here. -BB 3565 3566 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3567 3568 /* compute expansion headroom to check if we can expand */ 3569 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3570 /* possible expansion too small -- give up */ 3571 return added; 3572 } 3573 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3574 3575 newCapacity = __kmp_threads_capacity; 3576 do { 3577 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1)
3578 : __kmp_sys_max_nth;
3579 } while (newCapacity < minimumRequiredCapacity);
3580 newThreads = (kmp_info_t **)__kmp_allocate(
3581 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3582 newRoot =
3583 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3584 KMP_MEMCPY(newThreads, __kmp_threads,
3585 __kmp_threads_capacity * sizeof(kmp_info_t *));
3586 KMP_MEMCPY(newRoot, __kmp_root,
3587 __kmp_threads_capacity * sizeof(kmp_root_t *));
3588
3589 kmp_info_t **temp_threads = __kmp_threads;
3590 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3591 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3592 __kmp_free(temp_threads);
3593 added += newCapacity - __kmp_threads_capacity;
3594 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3595
3596 if (newCapacity > __kmp_tp_capacity) {
3597 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3598 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3599 __kmp_threadprivate_resize_cache(newCapacity);
3600 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3601 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3602 }
3603 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3604 }
3605
3606 return added;
3607 }
3608
3609 /* Register the current thread as a root thread and obtain our gtid. We must
3610 have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3611 the thread that calls from __kmp_do_serial_initialize() */
3612 int __kmp_register_root(int initial_thread) {
3613 kmp_info_t *root_thread;
3614 kmp_root_t *root;
3615 int gtid;
3616 int capacity;
3617 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3618 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3619 KMP_MB();
3620
3621 /* 2007-03-02:
3622 If the initial thread has not invoked the OpenMP RTL yet, and this thread
3623 is not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity"
3624 condition does not work as expected -- it may return false (meaning there
3625 is at least one empty slot in the __kmp_threads array), but it is possible
3626 the only free slot is #0, which is reserved for the initial thread and so
3627 cannot be used for this one. The following code works around this bug.
3628
3629 However, the right solution seems to be not reserving slot #0 for the
3630 initial thread, because:
3631 (1) there is no magic in slot #0, and
3632 (2) we cannot detect the initial thread reliably (the first thread that
3633 does serial initialization may not be the real initial thread).
3634 */
3635 capacity = __kmp_threads_capacity;
3636 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3637 --capacity;
3638 }
3639
3640 // If it is not for initializing the hidden helper team, we need to take
3641 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3642 // in __kmp_threads_capacity.
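// (Illustrative example only: with __kmp_threads_capacity == 16 and
// __kmp_hidden_helper_threads_num == 8, a regular root registering here sees
// an effective capacity of 8, less one more if slot #0 is still reserved for
// the initial thread above.)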
3643 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3644 capacity -= __kmp_hidden_helper_threads_num; 3645 } 3646 3647 /* see if there are too many threads */ 3648 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3649 if (__kmp_tp_cached) { 3650 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3651 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3652 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3653 } else { 3654 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3655 __kmp_msg_null); 3656 } 3657 } 3658 3659 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3660 // 0: initial thread, also a regular OpenMP thread. 3661 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3662 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3663 // regular OpenMP threads. 3664 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3665 // Find an available thread slot for hidden helper thread. Slots for hidden 3666 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3667 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3668 gtid <= __kmp_hidden_helper_threads_num; 3669 gtid++) 3670 ; 3671 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3672 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3673 "hidden helper thread: T#%d\n", 3674 gtid)); 3675 } else { 3676 /* find an available thread slot */ 3677 // Don't reassign the zero slot since we need that to only be used by 3678 // initial thread. Slots for hidden helper threads should also be skipped. 3679 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3680 gtid = 0; 3681 } else { 3682 for (gtid = __kmp_hidden_helper_threads_num + 1; 3683 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3684 ; 3685 } 3686 KA_TRACE( 3687 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3688 KMP_ASSERT(gtid < __kmp_threads_capacity); 3689 } 3690 3691 /* update global accounting */ 3692 __kmp_all_nth++; 3693 TCW_4(__kmp_nth, __kmp_nth + 1); 3694 3695 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3696 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3697 if (__kmp_adjust_gtid_mode) { 3698 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3699 if (TCR_4(__kmp_gtid_mode) != 2) { 3700 TCW_4(__kmp_gtid_mode, 2); 3701 } 3702 } else { 3703 if (TCR_4(__kmp_gtid_mode) != 1) { 3704 TCW_4(__kmp_gtid_mode, 1); 3705 } 3706 } 3707 } 3708 3709 #ifdef KMP_ADJUST_BLOCKTIME 3710 /* Adjust blocktime to zero if necessary */ 3711 /* Middle initialization might not have occurred yet */ 3712 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3713 if (__kmp_nth > __kmp_avail_proc) { 3714 __kmp_zero_bt = TRUE; 3715 } 3716 } 3717 #endif /* KMP_ADJUST_BLOCKTIME */ 3718 3719 /* setup this new hierarchy */ 3720 if (!(root = __kmp_root[gtid])) { 3721 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3722 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3723 } 3724 3725 #if KMP_STATS_ENABLED 3726 // Initialize stats as soon as possible (right after gtid assignment). 
3727 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3728 __kmp_stats_thread_ptr->startLife(); 3729 KMP_SET_THREAD_STATE(SERIAL_REGION); 3730 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3731 #endif 3732 __kmp_initialize_root(root); 3733 3734 /* setup new root thread structure */ 3735 if (root->r.r_uber_thread) { 3736 root_thread = root->r.r_uber_thread; 3737 } else { 3738 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3739 if (__kmp_storage_map) { 3740 __kmp_print_thread_storage_map(root_thread, gtid); 3741 } 3742 root_thread->th.th_info.ds.ds_gtid = gtid; 3743 #if OMPT_SUPPORT 3744 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3745 #endif 3746 root_thread->th.th_root = root; 3747 if (__kmp_env_consistency_check) { 3748 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3749 } 3750 #if USE_FAST_MEMORY 3751 __kmp_initialize_fast_memory(root_thread); 3752 #endif /* USE_FAST_MEMORY */ 3753 3754 #if KMP_USE_BGET 3755 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3756 __kmp_initialize_bget(root_thread); 3757 #endif 3758 __kmp_init_random(root_thread); // Initialize random number generator 3759 } 3760 3761 /* setup the serial team held in reserve by the root thread */ 3762 if (!root_thread->th.th_serial_team) { 3763 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3764 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3765 root_thread->th.th_serial_team = __kmp_allocate_team( 3766 root, 1, 1, 3767 #if OMPT_SUPPORT 3768 ompt_data_none, // root parallel id 3769 #endif 3770 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3771 } 3772 KMP_ASSERT(root_thread->th.th_serial_team); 3773 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3774 root_thread->th.th_serial_team)); 3775 3776 /* drop root_thread into place */ 3777 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3778 3779 root->r.r_root_team->t.t_threads[0] = root_thread; 3780 root->r.r_hot_team->t.t_threads[0] = root_thread; 3781 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3782 // AC: the team created in reserve, not for execution (it is unused for now). 3783 root_thread->th.th_serial_team->t.t_serialized = 0; 3784 root->r.r_uber_thread = root_thread; 3785 3786 /* initialize the thread, get it ready to go */ 3787 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3788 TCW_4(__kmp_init_gtid, TRUE); 3789 3790 /* prepare the primary thread for get_gtid() */ 3791 __kmp_gtid_set_specific(gtid); 3792 3793 #if USE_ITT_BUILD 3794 __kmp_itt_thread_name(gtid); 3795 #endif /* USE_ITT_BUILD */ 3796 3797 #ifdef KMP_TDATA_GTID 3798 __kmp_gtid = gtid; 3799 #endif 3800 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3801 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3802 3803 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3804 "plain=%u\n", 3805 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3806 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3807 KMP_INIT_BARRIER_STATE)); 3808 { // Initialize barrier data. 
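    // (Descriptive note: this loop covers every barrier type up to
    // bs_last_barrier -- the plain and fork/join barriers, plus the reduction
    // barrier when it is compiled in -- all starting in KMP_INIT_BARRIER_STATE.)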
int b;
3810 for (b = 0; b < bs_last_barrier; ++b) {
3811 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3812 #if USE_DEBUGGER
3813 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3814 #endif
3815 }
3816 }
3817 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3818 KMP_INIT_BARRIER_STATE);
3819
3820 #if KMP_AFFINITY_SUPPORTED
3821 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3822 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3823 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3824 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3825 #endif /* KMP_AFFINITY_SUPPORTED */
3826 root_thread->th.th_def_allocator = __kmp_def_allocator;
3827 root_thread->th.th_prev_level = 0;
3828 root_thread->th.th_prev_num_threads = 1;
3829
3830 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3831 tmp->cg_root = root_thread;
3832 tmp->cg_thread_limit = __kmp_cg_max_nth;
3833 tmp->cg_nthreads = 1;
3834 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3835 " cg_nthreads init to 1\n",
3836 root_thread, tmp));
3837 tmp->up = NULL;
3838 root_thread->th.th_cg_roots = tmp;
3839
3840 __kmp_root_counter++;
3841
3842 #if OMPT_SUPPORT
3843 if (!initial_thread && ompt_enabled.enabled) {
3844
3845 kmp_info_t *root_thread = ompt_get_thread();
3846
3847 ompt_set_thread_state(root_thread, ompt_state_overhead);
3848
3849 if (ompt_enabled.ompt_callback_thread_begin) {
3850 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3851 ompt_thread_initial, __ompt_get_thread_data_internal());
3852 }
3853 ompt_data_t *task_data;
3854 ompt_data_t *parallel_data;
3855 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3856 NULL);
3857 if (ompt_enabled.ompt_callback_implicit_task) {
3858 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3859 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3860 }
3861
3862 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3863 }
3864 #endif
3865 #if OMPD_SUPPORT
3866 if (ompd_state & OMPD_ENABLE_BP)
3867 ompd_bp_thread_begin();
3868 #endif
3869
3870 KMP_MB();
3871 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3872
3873 return gtid;
3874 }
3875
3876 #if KMP_NESTED_HOT_TEAMS
3877 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3878 const int max_level) {
3879 int i, n, nth;
3880 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3881 if (!hot_teams || !hot_teams[level].hot_team) {
3882 return 0;
3883 }
3884 KMP_DEBUG_ASSERT(level < max_level);
3885 kmp_team_t *team = hot_teams[level].hot_team;
3886 nth = hot_teams[level].hot_team_nth;
3887 n = nth - 1; // primary thread is not freed
3888 if (level < max_level - 1) {
3889 for (i = 0; i < nth; ++i) {
3890 kmp_info_t *th = team->t.t_threads[i];
3891 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3892 if (i > 0 && th->th.th_hot_teams) {
3893 __kmp_free(th->th.th_hot_teams);
3894 th->th.th_hot_teams = NULL;
3895 }
3896 }
3897 }
3898 __kmp_free_team(root, team, NULL);
3899 return n;
3900 }
3901 #endif
3902
3903 // Resets a root thread and clears its root and hot teams.
3904 // Returns the number of __kmp_threads entries directly and indirectly freed.
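// (Descriptive note: the count includes workers of nested hot teams released
// through __kmp_free_hot_teams() when KMP_NESTED_HOT_TEAMS is enabled.)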
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3906 kmp_team_t *root_team = root->r.r_root_team;
3907 kmp_team_t *hot_team = root->r.r_hot_team;
3908 int n = hot_team->t.t_nproc;
3909 int i;
3910
3911 KMP_DEBUG_ASSERT(!root->r.r_active);
3912
3913 root->r.r_root_team = NULL;
3914 root->r.r_hot_team = NULL;
3915 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3916 // before call to __kmp_free_team().
3917 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3918 #if KMP_NESTED_HOT_TEAMS
3919 if (__kmp_hot_teams_max_level >
3920 0) { // need to free nested hot teams and their threads if any
3921 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3922 kmp_info_t *th = hot_team->t.t_threads[i];
3923 if (__kmp_hot_teams_max_level > 1) {
3924 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3925 }
3926 if (th->th.th_hot_teams) {
3927 __kmp_free(th->th.th_hot_teams);
3928 th->th.th_hot_teams = NULL;
3929 }
3930 }
3931 }
3932 #endif
3933 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3934
3935 // Before we can reap the thread, we need to make certain that all other
3936 // threads in the teams that had this root as ancestor have stopped trying to
3937 // steal tasks.
3938 if (__kmp_tasking_mode != tskm_immediate_exec) {
3939 __kmp_wait_to_unref_task_teams();
3940 }
3941
3942 #if KMP_OS_WINDOWS
3943 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3944 KA_TRACE(
3945 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3946 "\n",
3947 (LPVOID) & (root->r.r_uber_thread->th),
3948 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3949 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3950 #endif /* KMP_OS_WINDOWS */
3951
3952 #if OMPD_SUPPORT
3953 if (ompd_state & OMPD_ENABLE_BP)
3954 ompd_bp_thread_end();
3955 #endif
3956
3957 #if OMPT_SUPPORT
3958 ompt_data_t *task_data;
3959 ompt_data_t *parallel_data;
3960 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3961 NULL);
3962 if (ompt_enabled.ompt_callback_implicit_task) {
3963 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3964 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3965 }
3966 if (ompt_enabled.ompt_callback_thread_end) {
3967 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3968 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3969 }
3970 #endif
3971
3972 TCW_4(__kmp_nth,
3973 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3974 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3975 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3976 " to %d\n",
3977 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3978 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3979 if (i == 1) {
3980 // need to free contention group structure
3981 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3982 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3983 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3984 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3985 root->r.r_uber_thread->th.th_cg_roots = NULL;
3986 }
3987 __kmp_reap_thread(root->r.r_uber_thread, 1);
3988
3989 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3990 // it instead of freeing.
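// (Descriptive note: the post-decrement of cg_nthreads above returns its old
// value, so i == 1 means this root was the last member of its contention
// group and its kmp_cg_root_t node has been freed. The lines below then clear
// the uber-thread pointer and mark the root slot as no longer in use.)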
3991 root->r.r_uber_thread = NULL; 3992 /* mark root as no longer in use */ 3993 root->r.r_begin = FALSE; 3994 3995 return n; 3996 } 3997 3998 void __kmp_unregister_root_current_thread(int gtid) { 3999 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4000 /* this lock should be ok, since unregister_root_current_thread is never 4001 called during an abort, only during a normal close. furthermore, if you 4002 have the forkjoin lock, you should never try to get the initz lock */ 4003 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4004 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4005 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4006 "exiting T#%d\n", 4007 gtid)); 4008 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4009 return; 4010 } 4011 kmp_root_t *root = __kmp_root[gtid]; 4012 4013 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4014 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4015 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4016 KMP_ASSERT(root->r.r_active == FALSE); 4017 4018 KMP_MB(); 4019 4020 kmp_info_t *thread = __kmp_threads[gtid]; 4021 kmp_team_t *team = thread->th.th_team; 4022 kmp_task_team_t *task_team = thread->th.th_task_team; 4023 4024 // we need to wait for the proxy tasks before finishing the thread 4025 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4026 #if OMPT_SUPPORT 4027 // the runtime is shutting down so we won't report any events 4028 thread->th.ompt_thread_info.state = ompt_state_undefined; 4029 #endif 4030 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4031 } 4032 4033 __kmp_reset_root(gtid, root); 4034 4035 KMP_MB(); 4036 KC_TRACE(10, 4037 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4038 4039 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4040 } 4041 4042 #if KMP_OS_WINDOWS 4043 /* __kmp_forkjoin_lock must be already held 4044 Unregisters a root thread that is not the current thread. Returns the number 4045 of __kmp_threads entries freed as a result. 
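   (For reference: the caller seen earlier in this file is
   __kmp_reclaim_dead_roots(), which recycles slots of dead foreign roots for
   the Windows static library.)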
*/ 4046 static int __kmp_unregister_root_other_thread(int gtid) { 4047 kmp_root_t *root = __kmp_root[gtid]; 4048 int r; 4049 4050 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4051 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4052 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4053 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4054 KMP_ASSERT(root->r.r_active == FALSE); 4055 4056 r = __kmp_reset_root(gtid, root); 4057 KC_TRACE(10, 4058 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4059 return r; 4060 } 4061 #endif 4062 4063 #if KMP_DEBUG 4064 void __kmp_task_info() { 4065 4066 kmp_int32 gtid = __kmp_entry_gtid(); 4067 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4068 kmp_info_t *this_thr = __kmp_threads[gtid]; 4069 kmp_team_t *steam = this_thr->th.th_serial_team; 4070 kmp_team_t *team = this_thr->th.th_team; 4071 4072 __kmp_printf( 4073 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4074 "ptask=%p\n", 4075 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4076 team->t.t_implicit_task_taskdata[tid].td_parent); 4077 } 4078 #endif // KMP_DEBUG 4079 4080 /* TODO optimize with one big memclr, take out what isn't needed, split 4081 responsibility to workers as much as possible, and delay initialization of 4082 features as much as possible */ 4083 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4084 int tid, int gtid) { 4085 /* this_thr->th.th_info.ds.ds_gtid is setup in 4086 kmp_allocate_thread/create_worker. 4087 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4088 KMP_DEBUG_ASSERT(this_thr != NULL); 4089 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4090 KMP_DEBUG_ASSERT(team); 4091 KMP_DEBUG_ASSERT(team->t.t_threads); 4092 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4093 kmp_info_t *master = team->t.t_threads[0]; 4094 KMP_DEBUG_ASSERT(master); 4095 KMP_DEBUG_ASSERT(master->th.th_root); 4096 4097 KMP_MB(); 4098 4099 TCW_SYNC_PTR(this_thr->th.th_team, team); 4100 4101 this_thr->th.th_info.ds.ds_tid = tid; 4102 this_thr->th.th_set_nproc = 0; 4103 if (__kmp_tasking_mode != tskm_immediate_exec) 4104 // When tasking is possible, threads are not safe to reap until they are 4105 // done tasking; this will be set when tasking code is exited in wait 4106 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4107 else // no tasking --> always safe to reap 4108 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4109 this_thr->th.th_set_proc_bind = proc_bind_default; 4110 #if KMP_AFFINITY_SUPPORTED 4111 this_thr->th.th_new_place = this_thr->th.th_current_place; 4112 #endif 4113 this_thr->th.th_root = master->th.th_root; 4114 4115 /* setup the thread's cache of the team structure */ 4116 this_thr->th.th_team_nproc = team->t.t_nproc; 4117 this_thr->th.th_team_master = master; 4118 this_thr->th.th_team_serialized = team->t.t_serialized; 4119 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4120 4121 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4122 4123 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4124 tid, gtid, this_thr, this_thr->th.th_current_task)); 4125 4126 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4127 team, tid, TRUE); 4128 4129 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4130 tid, gtid, this_thr, this_thr->th.th_current_task)); 4131 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4132 // __kmp_initialize_team()? 
4133 4134 /* TODO no worksharing in speculative threads */ 4135 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4136 4137 this_thr->th.th_local.this_construct = 0; 4138 4139 if (!this_thr->th.th_pri_common) { 4140 this_thr->th.th_pri_common = 4141 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4142 if (__kmp_storage_map) { 4143 __kmp_print_storage_map_gtid( 4144 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4145 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4146 } 4147 this_thr->th.th_pri_head = NULL; 4148 } 4149 4150 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4151 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4152 // Make new thread's CG root same as primary thread's 4153 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4154 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4155 if (tmp) { 4156 // worker changes CG, need to check if old CG should be freed 4157 int i = tmp->cg_nthreads--; 4158 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4159 " on node %p of thread %p to %d\n", 4160 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4161 if (i == 1) { 4162 __kmp_free(tmp); // last thread left CG --> free it 4163 } 4164 } 4165 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4166 // Increment new thread's CG root's counter to add the new thread 4167 this_thr->th.th_cg_roots->cg_nthreads++; 4168 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4169 " node %p of thread %p to %d\n", 4170 this_thr, this_thr->th.th_cg_roots, 4171 this_thr->th.th_cg_roots->cg_root, 4172 this_thr->th.th_cg_roots->cg_nthreads)); 4173 this_thr->th.th_current_task->td_icvs.thread_limit = 4174 this_thr->th.th_cg_roots->cg_thread_limit; 4175 } 4176 4177 /* Initialize dynamic dispatch */ 4178 { 4179 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4180 // Use team max_nproc since this will never change for the team. 4181 size_t disp_size = 4182 sizeof(dispatch_private_info_t) * 4183 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4184 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4185 team->t.t_max_nproc)); 4186 KMP_ASSERT(dispatch); 4187 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4188 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4189 4190 dispatch->th_disp_index = 0; 4191 dispatch->th_doacross_buf_idx = 0; 4192 if (!dispatch->th_disp_buffer) { 4193 dispatch->th_disp_buffer = 4194 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4195 4196 if (__kmp_storage_map) { 4197 __kmp_print_storage_map_gtid( 4198 gtid, &dispatch->th_disp_buffer[0], 4199 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4200 ? 
1 4201 : __kmp_dispatch_num_buffers], 4202 disp_size, 4203 "th_%d.th_dispatch.th_disp_buffer " 4204 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4205 gtid, team->t.t_id, gtid); 4206 } 4207 } else { 4208 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4209 } 4210 4211 dispatch->th_dispatch_pr_current = 0; 4212 dispatch->th_dispatch_sh_current = 0; 4213 4214 dispatch->th_deo_fcn = 0; /* ORDERED */ 4215 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4216 } 4217 4218 this_thr->th.th_next_pool = NULL; 4219 4220 if (!this_thr->th.th_task_state_memo_stack) { 4221 size_t i; 4222 this_thr->th.th_task_state_memo_stack = 4223 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4224 this_thr->th.th_task_state_top = 0; 4225 this_thr->th.th_task_state_stack_sz = 4; 4226 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4227 ++i) // zero init the stack 4228 this_thr->th.th_task_state_memo_stack[i] = 0; 4229 } 4230 4231 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4232 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4233 4234 KMP_MB(); 4235 } 4236 4237 /* allocate a new thread for the requesting team. this is only called from 4238 within a forkjoin critical section. we will first try to get an available 4239 thread from the thread pool. if none is available, we will fork a new one 4240 assuming we are able to create a new one. this should be assured, as the 4241 caller should check on this first. */ 4242 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4243 int new_tid) { 4244 kmp_team_t *serial_team; 4245 kmp_info_t *new_thr; 4246 int new_gtid; 4247 4248 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4249 KMP_DEBUG_ASSERT(root && team); 4250 #if !KMP_NESTED_HOT_TEAMS 4251 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4252 #endif 4253 KMP_MB(); 4254 4255 /* first, try to get one from the thread pool */ 4256 if (__kmp_thread_pool) { 4257 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4258 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4259 if (new_thr == __kmp_thread_pool_insert_pt) { 4260 __kmp_thread_pool_insert_pt = NULL; 4261 } 4262 TCW_4(new_thr->th.th_in_pool, FALSE); 4263 __kmp_suspend_initialize_thread(new_thr); 4264 __kmp_lock_suspend_mx(new_thr); 4265 if (new_thr->th.th_active_in_pool == TRUE) { 4266 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4267 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4268 new_thr->th.th_active_in_pool = FALSE; 4269 } 4270 __kmp_unlock_suspend_mx(new_thr); 4271 4272 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4273 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4274 KMP_ASSERT(!new_thr->th.th_team); 4275 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4276 4277 /* setup the thread structure */ 4278 __kmp_initialize_info(new_thr, team, new_tid, 4279 new_thr->th.th_info.ds.ds_gtid); 4280 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4281 4282 TCW_4(__kmp_nth, __kmp_nth + 1); 4283 4284 new_thr->th.th_task_state = 0; 4285 new_thr->th.th_task_state_top = 0; 4286 new_thr->th.th_task_state_stack_sz = 4; 4287 4288 #ifdef KMP_ADJUST_BLOCKTIME 4289 /* Adjust blocktime back to zero if necessary */ 4290 /* Middle initialization might not have occurred yet */ 4291 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4292 if (__kmp_nth > __kmp_avail_proc) { 4293 __kmp_zero_bt = TRUE; 4294 } 4295 } 4296 #endif /* KMP_ADJUST_BLOCKTIME */ 4297 4298 #if KMP_DEBUG 4299 // If thread entered pool via __kmp_free_thread, wait_flag should != 4300 // KMP_BARRIER_PARENT_FLAG. 
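// (Descriptive note: the debug-only loop below walks every barrier type up to
// bs_last_barrier and asserts that none of them still carries
// KMP_BARRIER_PARENT_FLAG, i.e. the pooled thread is no longer parked on a
// parent's barrier flag.)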
int b;
4302 kmp_balign_t *balign = new_thr->th.th_bar;
4303 for (b = 0; b < bs_last_barrier; ++b)
4304 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
4305 #endif
4306
4307 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
4308 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));
4309
4310 KMP_MB();
4311 return new_thr;
4312 }
4313
4314 /* no, we'll fork a new one */
4315 KMP_ASSERT(__kmp_nth == __kmp_all_nth);
4316 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);
4317
4318 #if KMP_USE_MONITOR
4319 // If this is the first worker thread the RTL is creating, then also
4320 // launch the monitor thread. We try to do this as early as possible.
4321 if (!TCR_4(__kmp_init_monitor)) {
4322 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
4323 if (!TCR_4(__kmp_init_monitor)) {
4324 KF_TRACE(10, ("before __kmp_create_monitor\n"));
4325 TCW_4(__kmp_init_monitor, 1);
4326 __kmp_create_monitor(&__kmp_monitor);
4327 KF_TRACE(10, ("after __kmp_create_monitor\n"));
4328 #if KMP_OS_WINDOWS
4329 // AC: wait until the monitor has started. This is a fix for CQ232808.
4330 // The reason is that if the library is loaded/unloaded in a loop with
4331 // small (parallel) work in between, there is a high probability that the
4332 // monitor thread starts only after the library shutdown. At shutdown it
4333 // is too late to cope with the problem, because when the primary thread
4334 // is in DllMain (process detach) the monitor has no chance to start (it
4335 // is blocked), and the primary thread has no means to inform the monitor
4336 // that the library has gone, because all the memory which the monitor can
4337 // access is going to be released/reset.
4338 while (TCR_4(__kmp_init_monitor) < 2) {
4339 KMP_YIELD(TRUE);
4340 }
4341 KF_TRACE(10, ("after monitor thread has started\n"));
4342 #endif
4343 }
4344 __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
4345 }
4346 #endif
4347
4348 KMP_MB();
4349
4350 {
4351 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
4352 ? 1
4353 : __kmp_hidden_helper_threads_num + 1;
4354
4355 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
4356 ++new_gtid) {
4357 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
4358 }
4359
4360 if (TCR_4(__kmp_init_hidden_helper_threads)) {
4361 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
4362 }
4363 }
4364
4365 /* allocate space for it.
*/ 4366 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4367 4368 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4369 4370 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4371 // suppress race conditions detection on synchronization flags in debug mode 4372 // this helps to analyze library internals eliminating false positives 4373 __itt_suppress_mark_range( 4374 __itt_suppress_range, __itt_suppress_threading_errors, 4375 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4376 __itt_suppress_mark_range( 4377 __itt_suppress_range, __itt_suppress_threading_errors, 4378 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4379 #if KMP_OS_WINDOWS 4380 __itt_suppress_mark_range( 4381 __itt_suppress_range, __itt_suppress_threading_errors, 4382 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4383 #else 4384 __itt_suppress_mark_range(__itt_suppress_range, 4385 __itt_suppress_threading_errors, 4386 &new_thr->th.th_suspend_init_count, 4387 sizeof(new_thr->th.th_suspend_init_count)); 4388 #endif 4389 // TODO: check if we need to also suppress b_arrived flags 4390 __itt_suppress_mark_range(__itt_suppress_range, 4391 __itt_suppress_threading_errors, 4392 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4393 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4394 __itt_suppress_mark_range(__itt_suppress_range, 4395 __itt_suppress_threading_errors, 4396 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4397 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4398 __itt_suppress_mark_range(__itt_suppress_range, 4399 __itt_suppress_threading_errors, 4400 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4401 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4402 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4403 if (__kmp_storage_map) { 4404 __kmp_print_thread_storage_map(new_thr, new_gtid); 4405 } 4406 4407 // add the reserve serialized team, initialized from the team's primary thread 4408 { 4409 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4410 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4411 new_thr->th.th_serial_team = serial_team = 4412 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4413 #if OMPT_SUPPORT 4414 ompt_data_none, // root parallel id 4415 #endif 4416 proc_bind_default, &r_icvs, 4417 0 USE_NESTED_HOT_ARG(NULL)); 4418 } 4419 KMP_ASSERT(serial_team); 4420 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4421 // execution (it is unused for now). 
4422 serial_team->t.t_threads[0] = new_thr; 4423 KF_TRACE(10, 4424 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4425 new_thr)); 4426 4427 /* setup the thread structures */ 4428 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4429 4430 #if USE_FAST_MEMORY 4431 __kmp_initialize_fast_memory(new_thr); 4432 #endif /* USE_FAST_MEMORY */ 4433 4434 #if KMP_USE_BGET 4435 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4436 __kmp_initialize_bget(new_thr); 4437 #endif 4438 4439 __kmp_init_random(new_thr); // Initialize random number generator 4440 4441 /* Initialize these only once when thread is grabbed for a team allocation */ 4442 KA_TRACE(20, 4443 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4444 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4445 4446 int b; 4447 kmp_balign_t *balign = new_thr->th.th_bar; 4448 for (b = 0; b < bs_last_barrier; ++b) { 4449 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4450 balign[b].bb.team = NULL; 4451 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4452 balign[b].bb.use_oncore_barrier = 0; 4453 } 4454 4455 new_thr->th.th_spin_here = FALSE; 4456 new_thr->th.th_next_waiting = 0; 4457 #if KMP_OS_UNIX 4458 new_thr->th.th_blocking = false; 4459 #endif 4460 4461 #if KMP_AFFINITY_SUPPORTED 4462 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4463 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4464 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4465 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4466 #endif 4467 new_thr->th.th_def_allocator = __kmp_def_allocator; 4468 new_thr->th.th_prev_level = 0; 4469 new_thr->th.th_prev_num_threads = 1; 4470 4471 TCW_4(new_thr->th.th_in_pool, FALSE); 4472 new_thr->th.th_active_in_pool = FALSE; 4473 TCW_4(new_thr->th.th_active, TRUE); 4474 4475 /* adjust the global counters */ 4476 __kmp_all_nth++; 4477 __kmp_nth++; 4478 4479 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4480 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4481 if (__kmp_adjust_gtid_mode) { 4482 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4483 if (TCR_4(__kmp_gtid_mode) != 2) { 4484 TCW_4(__kmp_gtid_mode, 2); 4485 } 4486 } else { 4487 if (TCR_4(__kmp_gtid_mode) != 1) { 4488 TCW_4(__kmp_gtid_mode, 1); 4489 } 4490 } 4491 } 4492 4493 #ifdef KMP_ADJUST_BLOCKTIME 4494 /* Adjust blocktime back to zero if necessary */ 4495 /* Middle initialization might not have occurred yet */ 4496 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4497 if (__kmp_nth > __kmp_avail_proc) { 4498 __kmp_zero_bt = TRUE; 4499 } 4500 } 4501 #endif /* KMP_ADJUST_BLOCKTIME */ 4502 4503 /* actually fork it and create the new worker thread */ 4504 KF_TRACE( 4505 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4506 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4507 KF_TRACE(10, 4508 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4509 4510 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4511 new_gtid)); 4512 KMP_MB(); 4513 return new_thr; 4514 } 4515 4516 /* Reinitialize team for reuse. 4517 The hot team code calls this case at every fork barrier, so EPCC barrier 4518 test are extremely sensitive to changes in it, esp. writes to the team 4519 struct, which cause a cache invalidation in all threads. 4520 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
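   For that reason the body below goes through KMP_CHECK_UPDATE() for fields
   such as t_ident and t_id, issuing a store only when the value actually
   changes.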
*/ 4521 static void __kmp_reinitialize_team(kmp_team_t *team, 4522 kmp_internal_control_t *new_icvs, 4523 ident_t *loc) { 4524 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4525 team->t.t_threads[0], team)); 4526 KMP_DEBUG_ASSERT(team && new_icvs); 4527 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4528 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4529 4530 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4531 // Copy ICVs to the primary thread's implicit taskdata 4532 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4533 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4534 4535 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4536 team->t.t_threads[0], team)); 4537 } 4538 4539 /* Initialize the team data structure. 4540 This assumes the t_threads and t_max_nproc are already set. 4541 Also, we don't touch the arguments */ 4542 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4543 kmp_internal_control_t *new_icvs, 4544 ident_t *loc) { 4545 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4546 4547 /* verify */ 4548 KMP_DEBUG_ASSERT(team); 4549 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4550 KMP_DEBUG_ASSERT(team->t.t_threads); 4551 KMP_MB(); 4552 4553 team->t.t_master_tid = 0; /* not needed */ 4554 /* team->t.t_master_bar; not needed */ 4555 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4556 team->t.t_nproc = new_nproc; 4557 4558 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4559 team->t.t_next_pool = NULL; 4560 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4561 * up hot team */ 4562 4563 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4564 team->t.t_invoke = NULL; /* not needed */ 4565 4566 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4567 team->t.t_sched.sched = new_icvs->sched.sched; 4568 4569 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4570 team->t.t_fp_control_saved = FALSE; /* not needed */ 4571 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4572 team->t.t_mxcsr = 0; /* not needed */ 4573 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4574 4575 team->t.t_construct = 0; 4576 4577 team->t.t_ordered.dt.t_value = 0; 4578 team->t.t_master_active = FALSE; 4579 4580 #ifdef KMP_DEBUG 4581 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4582 #endif 4583 #if KMP_OS_WINDOWS 4584 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4585 #endif 4586 4587 team->t.t_control_stack_top = NULL; 4588 4589 __kmp_reinitialize_team(team, new_icvs, loc); 4590 4591 KMP_MB(); 4592 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4593 } 4594 4595 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4596 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4597 static void 4598 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4599 if (KMP_AFFINITY_CAPABLE()) { 4600 int status; 4601 if (old_mask != NULL) { 4602 status = __kmp_get_system_affinity(old_mask, TRUE); 4603 int error = errno; 4604 if (status != 0) { 4605 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4606 __kmp_msg_null); 4607 } 4608 } 4609 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4610 } 4611 } 4612 #endif 4613 4614 #if KMP_AFFINITY_SUPPORTED 4615 4616 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4617 // It calculates the worker + primary thread's partition based upon the parent 4618 // thread's partition, and binds each worker to a thread in their partition. 4619 // The primary thread's partition should already include its current binding. 4620 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4621 // Do not partition places for the hidden helper team 4622 if (KMP_HIDDEN_HELPER_TEAM(team)) 4623 return; 4624 // Copy the primary thread's place partition to the team struct 4625 kmp_info_t *master_th = team->t.t_threads[0]; 4626 KMP_DEBUG_ASSERT(master_th != NULL); 4627 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4628 int first_place = master_th->th.th_first_place; 4629 int last_place = master_th->th.th_last_place; 4630 int masters_place = master_th->th.th_current_place; 4631 team->t.t_first_place = first_place; 4632 team->t.t_last_place = last_place; 4633 4634 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4635 "bound to place %d partition = [%d,%d]\n", 4636 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4637 team->t.t_id, masters_place, first_place, last_place)); 4638 4639 switch (proc_bind) { 4640 4641 case proc_bind_default: 4642 // Serial teams might have the proc_bind policy set to proc_bind_default. 4643 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4644 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4645 break; 4646 4647 case proc_bind_primary: { 4648 int f; 4649 int n_th = team->t.t_nproc; 4650 for (f = 1; f < n_th; f++) { 4651 kmp_info_t *th = team->t.t_threads[f]; 4652 KMP_DEBUG_ASSERT(th != NULL); 4653 th->th.th_first_place = first_place; 4654 th->th.th_last_place = last_place; 4655 th->th.th_new_place = masters_place; 4656 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4657 team->t.t_display_affinity != 1) { 4658 team->t.t_display_affinity = 1; 4659 } 4660 4661 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4662 "partition = [%d,%d]\n", 4663 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4664 f, masters_place, first_place, last_place)); 4665 } 4666 } break; 4667 4668 case proc_bind_close: { 4669 int f; 4670 int n_th = team->t.t_nproc; 4671 int n_places; 4672 if (first_place <= last_place) { 4673 n_places = last_place - first_place + 1; 4674 } else { 4675 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4676 } 4677 if (n_th <= n_places) { 4678 int place = masters_place; 4679 for (f = 1; f < n_th; f++) { 4680 kmp_info_t *th = team->t.t_threads[f]; 4681 KMP_DEBUG_ASSERT(th != NULL); 4682 4683 if (place == last_place) { 4684 place = first_place; 4685 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4686 place = 0; 4687 } else { 4688 place++; 4689 } 4690 th->th.th_first_place = first_place; 4691 th->th.th_last_place = last_place; 4692 th->th.th_new_place = place; 4693 if (__kmp_display_affinity && place != th->th.th_current_place && 4694 team->t.t_display_affinity != 1) { 4695 team->t.t_display_affinity = 1; 4696 } 4697 4698 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4699 "partition = [%d,%d]\n", 4700 __kmp_gtid_from_thread(team->t.t_threads[f]), 4701 team->t.t_id, f, place, first_place, last_place)); 4702 } 4703 } else { 4704 int S, rem, gap, s_count; 4705 S = n_th / n_places; 4706 s_count = 0; 4707 rem = n_th - (S * n_places); 4708 gap = rem > 0 ? 
n_places / rem : n_places; 4709 int place = masters_place; 4710 int gap_ct = gap; 4711 for (f = 0; f < n_th; f++) { 4712 kmp_info_t *th = team->t.t_threads[f]; 4713 KMP_DEBUG_ASSERT(th != NULL); 4714 4715 th->th.th_first_place = first_place; 4716 th->th.th_last_place = last_place; 4717 th->th.th_new_place = place; 4718 if (__kmp_display_affinity && place != th->th.th_current_place && 4719 team->t.t_display_affinity != 1) { 4720 team->t.t_display_affinity = 1; 4721 } 4722 s_count++; 4723 4724 if ((s_count == S) && rem && (gap_ct == gap)) { 4725 // do nothing, add an extra thread to place on next iteration 4726 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4727 // we added an extra thread to this place; move to next place 4728 if (place == last_place) { 4729 place = first_place; 4730 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4731 place = 0; 4732 } else { 4733 place++; 4734 } 4735 s_count = 0; 4736 gap_ct = 1; 4737 rem--; 4738 } else if (s_count == S) { // place full; don't add extra 4739 if (place == last_place) { 4740 place = first_place; 4741 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4742 place = 0; 4743 } else { 4744 place++; 4745 } 4746 gap_ct++; 4747 s_count = 0; 4748 } 4749 4750 KA_TRACE(100, 4751 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4752 "partition = [%d,%d]\n", 4753 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4754 th->th.th_new_place, first_place, last_place)); 4755 } 4756 KMP_DEBUG_ASSERT(place == masters_place); 4757 } 4758 } break; 4759 4760 case proc_bind_spread: { 4761 int f; 4762 int n_th = team->t.t_nproc; 4763 int n_places; 4764 int thidx; 4765 if (first_place <= last_place) { 4766 n_places = last_place - first_place + 1; 4767 } else { 4768 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4769 } 4770 if (n_th <= n_places) { 4771 int place = -1; 4772 4773 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4774 int S = n_places / n_th; 4775 int s_count, rem, gap, gap_ct; 4776 4777 place = masters_place; 4778 rem = n_places - n_th * S; 4779 gap = rem ? 
n_th / rem : 1; 4780 gap_ct = gap; 4781 thidx = n_th; 4782 if (update_master_only == 1) 4783 thidx = 1; 4784 for (f = 0; f < thidx; f++) { 4785 kmp_info_t *th = team->t.t_threads[f]; 4786 KMP_DEBUG_ASSERT(th != NULL); 4787 4788 th->th.th_first_place = place; 4789 th->th.th_new_place = place; 4790 if (__kmp_display_affinity && place != th->th.th_current_place && 4791 team->t.t_display_affinity != 1) { 4792 team->t.t_display_affinity = 1; 4793 } 4794 s_count = 1; 4795 while (s_count < S) { 4796 if (place == last_place) { 4797 place = first_place; 4798 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4799 place = 0; 4800 } else { 4801 place++; 4802 } 4803 s_count++; 4804 } 4805 if (rem && (gap_ct == gap)) { 4806 if (place == last_place) { 4807 place = first_place; 4808 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4809 place = 0; 4810 } else { 4811 place++; 4812 } 4813 rem--; 4814 gap_ct = 0; 4815 } 4816 th->th.th_last_place = place; 4817 gap_ct++; 4818 4819 if (place == last_place) { 4820 place = first_place; 4821 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4822 place = 0; 4823 } else { 4824 place++; 4825 } 4826 4827 KA_TRACE(100, 4828 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4829 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4830 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4831 f, th->th.th_new_place, th->th.th_first_place, 4832 th->th.th_last_place, __kmp_affinity_num_masks)); 4833 } 4834 } else { 4835 /* Having uniform space of available computation places I can create 4836 T partitions of round(P/T) size and put threads into the first 4837 place of each partition. */ 4838 double current = static_cast<double>(masters_place); 4839 double spacing = 4840 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4841 int first, last; 4842 kmp_info_t *th; 4843 4844 thidx = n_th + 1; 4845 if (update_master_only == 1) 4846 thidx = 1; 4847 for (f = 0; f < thidx; f++) { 4848 first = static_cast<int>(current); 4849 last = static_cast<int>(current + spacing) - 1; 4850 KMP_DEBUG_ASSERT(last >= first); 4851 if (first >= n_places) { 4852 if (masters_place) { 4853 first -= n_places; 4854 last -= n_places; 4855 if (first == (masters_place + 1)) { 4856 KMP_DEBUG_ASSERT(f == n_th); 4857 first--; 4858 } 4859 if (last == masters_place) { 4860 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4861 last--; 4862 } 4863 } else { 4864 KMP_DEBUG_ASSERT(f == n_th); 4865 first = 0; 4866 last = 0; 4867 } 4868 } 4869 if (last >= n_places) { 4870 last = (n_places - 1); 4871 } 4872 place = first; 4873 current += spacing; 4874 if (f < n_th) { 4875 KMP_DEBUG_ASSERT(0 <= first); 4876 KMP_DEBUG_ASSERT(n_places > first); 4877 KMP_DEBUG_ASSERT(0 <= last); 4878 KMP_DEBUG_ASSERT(n_places > last); 4879 KMP_DEBUG_ASSERT(last_place >= first_place); 4880 th = team->t.t_threads[f]; 4881 KMP_DEBUG_ASSERT(th); 4882 th->th.th_first_place = first; 4883 th->th.th_new_place = place; 4884 th->th.th_last_place = last; 4885 if (__kmp_display_affinity && place != th->th.th_current_place && 4886 team->t.t_display_affinity != 1) { 4887 team->t.t_display_affinity = 1; 4888 } 4889 KA_TRACE(100, 4890 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4891 "partition = [%d,%d], spacing = %.4f\n", 4892 __kmp_gtid_from_thread(team->t.t_threads[f]), 4893 team->t.t_id, f, th->th.th_new_place, 4894 th->th.th_first_place, th->th.th_last_place, spacing)); 4895 } 4896 } 4897 } 4898 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4899 } else { 4900 int S, rem, gap, 
s_count; 4901 S = n_th / n_places; 4902 s_count = 0; 4903 rem = n_th - (S * n_places); 4904 gap = rem > 0 ? n_places / rem : n_places; 4905 int place = masters_place; 4906 int gap_ct = gap; 4907 thidx = n_th; 4908 if (update_master_only == 1) 4909 thidx = 1; 4910 for (f = 0; f < thidx; f++) { 4911 kmp_info_t *th = team->t.t_threads[f]; 4912 KMP_DEBUG_ASSERT(th != NULL); 4913 4914 th->th.th_first_place = place; 4915 th->th.th_last_place = place; 4916 th->th.th_new_place = place; 4917 if (__kmp_display_affinity && place != th->th.th_current_place && 4918 team->t.t_display_affinity != 1) { 4919 team->t.t_display_affinity = 1; 4920 } 4921 s_count++; 4922 4923 if ((s_count == S) && rem && (gap_ct == gap)) { 4924 // do nothing, add an extra thread to place on next iteration 4925 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4926 // we added an extra thread to this place; move on to next place 4927 if (place == last_place) { 4928 place = first_place; 4929 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4930 place = 0; 4931 } else { 4932 place++; 4933 } 4934 s_count = 0; 4935 gap_ct = 1; 4936 rem--; 4937 } else if (s_count == S) { // place is full; don't add extra thread 4938 if (place == last_place) { 4939 place = first_place; 4940 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4941 place = 0; 4942 } else { 4943 place++; 4944 } 4945 gap_ct++; 4946 s_count = 0; 4947 } 4948 4949 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4950 "partition = [%d,%d]\n", 4951 __kmp_gtid_from_thread(team->t.t_threads[f]), 4952 team->t.t_id, f, th->th.th_new_place, 4953 th->th.th_first_place, th->th.th_last_place)); 4954 } 4955 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4956 } 4957 } break; 4958 4959 default: 4960 break; 4961 } 4962 4963 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4964 } 4965 4966 #endif // KMP_AFFINITY_SUPPORTED 4967 4968 /* allocate a new team data structure to use. take one off of the free pool if 4969 available */ 4970 kmp_team_t * 4971 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4972 #if OMPT_SUPPORT 4973 ompt_data_t ompt_parallel_data, 4974 #endif 4975 kmp_proc_bind_t new_proc_bind, 4976 kmp_internal_control_t *new_icvs, 4977 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4978 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4979 int f; 4980 kmp_team_t *team; 4981 int use_hot_team = !root->r.r_active; 4982 int level = 0; 4983 4984 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4985 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4986 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4987 KMP_MB(); 4988 4989 #if KMP_NESTED_HOT_TEAMS 4990 kmp_hot_team_ptr_t *hot_teams; 4991 if (master) { 4992 team = master->th.th_team; 4993 level = team->t.t_active_level; 4994 if (master->th.th_teams_microtask) { // in teams construct? 
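      // Note: the level adjustment below is meant to index hot_teams[]
      // consistently with the bookkeeping later done in __kmp_free_team(),
      // so a fork issued inside a teams construct finds (or creates) the hot
      // team that belongs to its own nesting level.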
4995 if (master->th.th_teams_size.nteams > 1 && 4996 ( // #teams > 1 4997 team->t.t_pkfn == 4998 (microtask_t)__kmp_teams_master || // inner fork of the teams 4999 master->th.th_teams_level < 5000 team->t.t_level)) { // or nested parallel inside the teams 5001 ++level; // not increment if #teams==1, or for outer fork of the teams; 5002 // increment otherwise 5003 } 5004 } 5005 hot_teams = master->th.th_hot_teams; 5006 if (level < __kmp_hot_teams_max_level && hot_teams && 5007 hot_teams[level].hot_team) { 5008 // hot team has already been allocated for given level 5009 use_hot_team = 1; 5010 } else { 5011 use_hot_team = 0; 5012 } 5013 } else { 5014 // check we won't access uninitialized hot_teams, just in case 5015 KMP_DEBUG_ASSERT(new_nproc == 1); 5016 } 5017 #endif 5018 // Optimization to use a "hot" team 5019 if (use_hot_team && new_nproc > 1) { 5020 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5021 #if KMP_NESTED_HOT_TEAMS 5022 team = hot_teams[level].hot_team; 5023 #else 5024 team = root->r.r_hot_team; 5025 #endif 5026 #if KMP_DEBUG 5027 if (__kmp_tasking_mode != tskm_immediate_exec) { 5028 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5029 "task_team[1] = %p before reinit\n", 5030 team->t.t_task_team[0], team->t.t_task_team[1])); 5031 } 5032 #endif 5033 5034 // Has the number of threads changed? 5035 /* Let's assume the most common case is that the number of threads is 5036 unchanged, and put that case first. */ 5037 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5038 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5039 // This case can mean that omp_set_num_threads() was called and the hot 5040 // team size was already reduced, so we check the special flag 5041 if (team->t.t_size_changed == -1) { 5042 team->t.t_size_changed = 1; 5043 } else { 5044 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5045 } 5046 5047 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5048 kmp_r_sched_t new_sched = new_icvs->sched; 5049 // set primary thread's schedule as new run-time schedule 5050 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5051 5052 __kmp_reinitialize_team(team, new_icvs, 5053 root->r.r_uber_thread->th.th_ident); 5054 5055 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5056 team->t.t_threads[0], team)); 5057 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5058 5059 #if KMP_AFFINITY_SUPPORTED 5060 if ((team->t.t_size_changed == 0) && 5061 (team->t.t_proc_bind == new_proc_bind)) { 5062 if (new_proc_bind == proc_bind_spread) { 5063 __kmp_partition_places( 5064 team, 1); // add flag to update only master for spread 5065 } 5066 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5067 "proc_bind = %d, partition = [%d,%d]\n", 5068 team->t.t_id, new_proc_bind, team->t.t_first_place, 5069 team->t.t_last_place)); 5070 } else { 5071 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5072 __kmp_partition_places(team); 5073 } 5074 #else 5075 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5076 #endif /* KMP_AFFINITY_SUPPORTED */ 5077 } else if (team->t.t_nproc > new_nproc) { 5078 KA_TRACE(20, 5079 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5080 new_nproc)); 5081 5082 team->t.t_size_changed = 1; 5083 #if KMP_NESTED_HOT_TEAMS 5084 if (__kmp_hot_teams_mode == 0) { 5085 // AC: saved number of threads should correspond to team's value in this 5086 // mode, can be bigger in mode 1, when hot team has threads in reserve 5087 
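        // In mode 0 the hot team is physically shrunk: the surplus workers
        // are released back to the thread pool right here.  In mode 1 (the
        // else branch below) they stay attached to the team, parked on their
        // own b_go flags, so a later request for the old size can reuse them.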
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5088 hot_teams[level].hot_team_nth = new_nproc; 5089 #endif // KMP_NESTED_HOT_TEAMS 5090 /* release the extra threads we don't need any more */ 5091 for (f = new_nproc; f < team->t.t_nproc; f++) { 5092 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5093 if (__kmp_tasking_mode != tskm_immediate_exec) { 5094 // When decreasing team size, threads no longer in the team should 5095 // unref task team. 5096 team->t.t_threads[f]->th.th_task_team = NULL; 5097 } 5098 __kmp_free_thread(team->t.t_threads[f]); 5099 team->t.t_threads[f] = NULL; 5100 } 5101 #if KMP_NESTED_HOT_TEAMS 5102 } // (__kmp_hot_teams_mode == 0) 5103 else { 5104 // When keeping extra threads in team, switch threads to wait on own 5105 // b_go flag 5106 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5107 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5108 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5109 for (int b = 0; b < bs_last_barrier; ++b) { 5110 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5111 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5112 } 5113 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5114 } 5115 } 5116 } 5117 #endif // KMP_NESTED_HOT_TEAMS 5118 team->t.t_nproc = new_nproc; 5119 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5120 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5121 __kmp_reinitialize_team(team, new_icvs, 5122 root->r.r_uber_thread->th.th_ident); 5123 5124 // Update remaining threads 5125 for (f = 0; f < new_nproc; ++f) { 5126 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5127 } 5128 5129 // restore the current task state of the primary thread: should be the 5130 // implicit task 5131 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5132 team->t.t_threads[0], team)); 5133 5134 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5135 5136 #ifdef KMP_DEBUG 5137 for (f = 0; f < team->t.t_nproc; f++) { 5138 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5139 team->t.t_threads[f]->th.th_team_nproc == 5140 team->t.t_nproc); 5141 } 5142 #endif 5143 5144 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5145 #if KMP_AFFINITY_SUPPORTED 5146 __kmp_partition_places(team); 5147 #endif 5148 } else { // team->t.t_nproc < new_nproc 5149 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5150 kmp_affin_mask_t *old_mask; 5151 if (KMP_AFFINITY_CAPABLE()) { 5152 KMP_CPU_ALLOC(old_mask); 5153 } 5154 #endif 5155 5156 KA_TRACE(20, 5157 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5158 new_nproc)); 5159 5160 team->t.t_size_changed = 1; 5161 5162 #if KMP_NESTED_HOT_TEAMS 5163 int avail_threads = hot_teams[level].hot_team_nth; 5164 if (new_nproc < avail_threads) 5165 avail_threads = new_nproc; 5166 kmp_info_t **other_threads = team->t.t_threads; 5167 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5168 // Adjust barrier data of reserved threads (if any) of the team 5169 // Other data will be set in __kmp_initialize_info() below. 
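        // These workers sat idle in reserve while the active part of the team
        // kept advancing its barrier counters; sync their b_arrived values to
        // the team's so they re-enter the next barrier with consistent state.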
5170 int b; 5171 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5172 for (b = 0; b < bs_last_barrier; ++b) { 5173 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5174 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5175 #if USE_DEBUGGER 5176 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5177 #endif 5178 } 5179 } 5180 if (hot_teams[level].hot_team_nth >= new_nproc) { 5181 // we have all needed threads in reserve, no need to allocate any 5182 // this only possible in mode 1, cannot have reserved threads in mode 0 5183 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5184 team->t.t_nproc = new_nproc; // just get reserved threads involved 5185 } else { 5186 // we may have some threads in reserve, but not enough 5187 team->t.t_nproc = 5188 hot_teams[level] 5189 .hot_team_nth; // get reserved threads involved if any 5190 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5191 #endif // KMP_NESTED_HOT_TEAMS 5192 if (team->t.t_max_nproc < new_nproc) { 5193 /* reallocate larger arrays */ 5194 __kmp_reallocate_team_arrays(team, new_nproc); 5195 __kmp_reinitialize_team(team, new_icvs, NULL); 5196 } 5197 5198 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5199 /* Temporarily set full mask for primary thread before creation of 5200 workers. The reason is that workers inherit the affinity from the 5201 primary thread, so if a lot of workers are created on the single 5202 core quickly, they don't get a chance to set their own affinity for 5203 a long time. */ 5204 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5205 #endif 5206 5207 /* allocate new threads for the hot team */ 5208 for (f = team->t.t_nproc; f < new_nproc; f++) { 5209 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5210 KMP_DEBUG_ASSERT(new_worker); 5211 team->t.t_threads[f] = new_worker; 5212 5213 KA_TRACE(20, 5214 ("__kmp_allocate_team: team %d init T#%d arrived: " 5215 "join=%llu, plain=%llu\n", 5216 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5217 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5218 team->t.t_bar[bs_plain_barrier].b_arrived)); 5219 5220 { // Initialize barrier data for new threads. 5221 int b; 5222 kmp_balign_t *balign = new_worker->th.th_bar; 5223 for (b = 0; b < bs_last_barrier; ++b) { 5224 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5225 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5226 KMP_BARRIER_PARENT_FLAG); 5227 #if USE_DEBUGGER 5228 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5229 #endif 5230 } 5231 } 5232 } 5233 5234 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5235 if (KMP_AFFINITY_CAPABLE()) { 5236 /* Restore initial primary thread's affinity mask */ 5237 __kmp_set_system_affinity(old_mask, TRUE); 5238 KMP_CPU_FREE(old_mask); 5239 } 5240 #endif 5241 #if KMP_NESTED_HOT_TEAMS 5242 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5243 #endif // KMP_NESTED_HOT_TEAMS 5244 /* make sure everyone is syncronized */ 5245 int old_nproc = team->t.t_nproc; // save old value and use to update only 5246 // new threads below 5247 __kmp_initialize_team(team, new_nproc, new_icvs, 5248 root->r.r_uber_thread->th.th_ident); 5249 5250 /* reinitialize the threads */ 5251 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5252 for (f = 0; f < team->t.t_nproc; ++f) 5253 __kmp_initialize_info(team->t.t_threads[f], team, f, 5254 __kmp_gtid_from_tid(f, team)); 5255 5256 if (level) { // set th_task_state for new threads in nested hot team 5257 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5258 // only need to set the th_task_state for the new threads. th_task_state 5259 // for primary thread will not be accurate until after this in 5260 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5261 // get the correct value. 5262 for (f = old_nproc; f < team->t.t_nproc; ++f) 5263 team->t.t_threads[f]->th.th_task_state = 5264 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5265 } else { // set th_task_state for new threads in non-nested hot team 5266 // copy primary thread's state 5267 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5268 for (f = old_nproc; f < team->t.t_nproc; ++f) 5269 team->t.t_threads[f]->th.th_task_state = old_state; 5270 } 5271 5272 #ifdef KMP_DEBUG 5273 for (f = 0; f < team->t.t_nproc; ++f) { 5274 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5275 team->t.t_threads[f]->th.th_team_nproc == 5276 team->t.t_nproc); 5277 } 5278 #endif 5279 5280 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5281 #if KMP_AFFINITY_SUPPORTED 5282 __kmp_partition_places(team); 5283 #endif 5284 } // Check changes in number of threads 5285 5286 kmp_info_t *master = team->t.t_threads[0]; 5287 if (master->th.th_teams_microtask) { 5288 for (f = 1; f < new_nproc; ++f) { 5289 // propagate teams construct specific info to workers 5290 kmp_info_t *thr = team->t.t_threads[f]; 5291 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5292 thr->th.th_teams_level = master->th.th_teams_level; 5293 thr->th.th_teams_size = master->th.th_teams_size; 5294 } 5295 } 5296 #if KMP_NESTED_HOT_TEAMS 5297 if (level) { 5298 // Sync barrier state for nested hot teams, not needed for outermost hot 5299 // team. 5300 for (f = 1; f < new_nproc; ++f) { 5301 kmp_info_t *thr = team->t.t_threads[f]; 5302 int b; 5303 kmp_balign_t *balign = thr->th.th_bar; 5304 for (b = 0; b < bs_last_barrier; ++b) { 5305 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5306 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5307 #if USE_DEBUGGER 5308 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5309 #endif 5310 } 5311 } 5312 } 5313 #endif // KMP_NESTED_HOT_TEAMS 5314 5315 /* reallocate space for arguments if necessary */ 5316 __kmp_alloc_argv_entries(argc, team, TRUE); 5317 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5318 // The hot team re-uses the previous task team, 5319 // if untouched during the previous release->gather phase. 
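    // All three resize paths (unchanged, shrink, grow) converge here with the
    // hot team ready for reuse; only tracing and the OMPT parallel-data
    // assignment remain before it is handed back to the caller.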
5320 5321 KF_TRACE(10, (" hot_team = %p\n", team)); 5322 5323 #if KMP_DEBUG 5324 if (__kmp_tasking_mode != tskm_immediate_exec) { 5325 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5326 "task_team[1] = %p after reinit\n", 5327 team->t.t_task_team[0], team->t.t_task_team[1])); 5328 } 5329 #endif 5330 5331 #if OMPT_SUPPORT 5332 __ompt_team_assign_id(team, ompt_parallel_data); 5333 #endif 5334 5335 KMP_MB(); 5336 5337 return team; 5338 } 5339 5340 /* next, let's try to take one from the team pool */ 5341 KMP_MB(); 5342 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5343 /* TODO: consider resizing undersized teams instead of reaping them, now 5344 that we have a resizing mechanism */ 5345 if (team->t.t_max_nproc >= max_nproc) { 5346 /* take this team from the team pool */ 5347 __kmp_team_pool = team->t.t_next_pool; 5348 5349 /* setup the team for fresh use */ 5350 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5351 5352 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5353 "task_team[1] %p to NULL\n", 5354 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5355 team->t.t_task_team[0] = NULL; 5356 team->t.t_task_team[1] = NULL; 5357 5358 /* reallocate space for arguments if necessary */ 5359 __kmp_alloc_argv_entries(argc, team, TRUE); 5360 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5361 5362 KA_TRACE( 5363 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5364 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5365 { // Initialize barrier data. 5366 int b; 5367 for (b = 0; b < bs_last_barrier; ++b) { 5368 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5369 #if USE_DEBUGGER 5370 team->t.t_bar[b].b_master_arrived = 0; 5371 team->t.t_bar[b].b_team_arrived = 0; 5372 #endif 5373 } 5374 } 5375 5376 team->t.t_proc_bind = new_proc_bind; 5377 5378 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5379 team->t.t_id)); 5380 5381 #if OMPT_SUPPORT 5382 __ompt_team_assign_id(team, ompt_parallel_data); 5383 #endif 5384 5385 KMP_MB(); 5386 5387 return team; 5388 } 5389 5390 /* reap team if it is too small, then loop back and check the next one */ 5391 // not sure if this is wise, but, will be redone during the hot-teams 5392 // rewrite. 5393 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5394 team = __kmp_reap_team(team); 5395 __kmp_team_pool = team; 5396 } 5397 5398 /* nothing available in the pool, no matter, make a new team! 
*/ 5399 KMP_MB(); 5400 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5401 5402 /* and set it up */ 5403 team->t.t_max_nproc = max_nproc; 5404 /* NOTE well, for some reason allocating one big buffer and dividing it up 5405 seems to really hurt performance a lot on the P4, so, let's not use this */ 5406 __kmp_allocate_team_arrays(team, max_nproc); 5407 5408 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5409 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5410 5411 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5412 "%p to NULL\n", 5413 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5414 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5415 // memory, no need to duplicate 5416 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5417 // memory, no need to duplicate 5418 5419 if (__kmp_storage_map) { 5420 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5421 } 5422 5423 /* allocate space for arguments */ 5424 __kmp_alloc_argv_entries(argc, team, FALSE); 5425 team->t.t_argc = argc; 5426 5427 KA_TRACE(20, 5428 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5429 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5430 { // Initialize barrier data. 5431 int b; 5432 for (b = 0; b < bs_last_barrier; ++b) { 5433 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5434 #if USE_DEBUGGER 5435 team->t.t_bar[b].b_master_arrived = 0; 5436 team->t.t_bar[b].b_team_arrived = 0; 5437 #endif 5438 } 5439 } 5440 5441 team->t.t_proc_bind = new_proc_bind; 5442 5443 #if OMPT_SUPPORT 5444 __ompt_team_assign_id(team, ompt_parallel_data); 5445 team->t.ompt_serialized_team_info = NULL; 5446 #endif 5447 5448 KMP_MB(); 5449 5450 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5451 team->t.t_id)); 5452 5453 return team; 5454 } 5455 5456 /* TODO implement hot-teams at all levels */ 5457 /* TODO implement lazy thread release on demand (disband request) */ 5458 5459 /* free the team. return it to the team pool. release all the threads 5460 * associated with it */ 5461 void __kmp_free_team(kmp_root_t *root, 5462 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5463 int f; 5464 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5465 team->t.t_id)); 5466 5467 /* verify state */ 5468 KMP_DEBUG_ASSERT(root); 5469 KMP_DEBUG_ASSERT(team); 5470 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5471 KMP_DEBUG_ASSERT(team->t.t_threads); 5472 5473 int use_hot_team = team == root->r.r_hot_team; 5474 #if KMP_NESTED_HOT_TEAMS 5475 int level; 5476 kmp_hot_team_ptr_t *hot_teams; 5477 if (master) { 5478 level = team->t.t_active_level - 1; 5479 if (master->th.th_teams_microtask) { // in teams construct? 
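      // Recover the nesting level this team was taken from, mirroring the
      // level computation in __kmp_allocate_team(), so that the hot_teams[]
      // lookup below matches the slot the team was allocated into.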
5480 if (master->th.th_teams_size.nteams > 1) { 5481 ++level; // level was not increased in teams construct for 5482 // team_of_masters 5483 } 5484 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5485 master->th.th_teams_level == team->t.t_level) { 5486 ++level; // level was not increased in teams construct for 5487 // team_of_workers before the parallel 5488 } // team->t.t_level will be increased inside parallel 5489 } 5490 hot_teams = master->th.th_hot_teams; 5491 if (level < __kmp_hot_teams_max_level) { 5492 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5493 use_hot_team = 1; 5494 } 5495 } 5496 #endif // KMP_NESTED_HOT_TEAMS 5497 5498 /* team is done working */ 5499 TCW_SYNC_PTR(team->t.t_pkfn, 5500 NULL); // Important for Debugging Support Library. 5501 #if KMP_OS_WINDOWS 5502 team->t.t_copyin_counter = 0; // init counter for possible reuse 5503 #endif 5504 // Do not reset pointer to parent team to NULL for hot teams. 5505 5506 /* if we are non-hot team, release our threads */ 5507 if (!use_hot_team) { 5508 if (__kmp_tasking_mode != tskm_immediate_exec) { 5509 // Wait for threads to reach reapable state 5510 for (f = 1; f < team->t.t_nproc; ++f) { 5511 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5512 kmp_info_t *th = team->t.t_threads[f]; 5513 volatile kmp_uint32 *state = &th->th.th_reap_state; 5514 while (*state != KMP_SAFE_TO_REAP) { 5515 #if KMP_OS_WINDOWS 5516 // On Windows a thread can be killed at any time, check this 5517 DWORD ecode; 5518 if (!__kmp_is_thread_alive(th, &ecode)) { 5519 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5520 break; 5521 } 5522 #endif 5523 // first check if thread is sleeping 5524 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5525 if (fl.is_sleeping()) 5526 fl.resume(__kmp_gtid_from_thread(th)); 5527 KMP_CPU_PAUSE(); 5528 } 5529 } 5530 5531 // Delete task teams 5532 int tt_idx; 5533 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5534 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5535 if (task_team != NULL) { 5536 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5537 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5538 team->t.t_threads[f]->th.th_task_team = NULL; 5539 } 5540 KA_TRACE( 5541 20, 5542 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5543 __kmp_get_gtid(), task_team, team->t.t_id)); 5544 #if KMP_NESTED_HOT_TEAMS 5545 __kmp_free_task_team(master, task_team); 5546 #endif 5547 team->t.t_task_team[tt_idx] = NULL; 5548 } 5549 } 5550 } 5551 5552 // Reset pointer to parent team only for non-hot teams. 
5553 team->t.t_parent = NULL; 5554 team->t.t_level = 0; 5555 team->t.t_active_level = 0; 5556 5557 /* free the worker threads */ 5558 for (f = 1; f < team->t.t_nproc; ++f) { 5559 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5560 __kmp_free_thread(team->t.t_threads[f]); 5561 team->t.t_threads[f] = NULL; 5562 } 5563 5564 /* put the team back in the team pool */ 5565 /* TODO limit size of team pool, call reap_team if pool too large */ 5566 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5567 __kmp_team_pool = (volatile kmp_team_t *)team; 5568 } else { // Check if team was created for primary threads in teams construct 5569 // See if first worker is a CG root 5570 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5571 team->t.t_threads[1]->th.th_cg_roots); 5572 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5573 // Clean up the CG root nodes on workers so that this team can be re-used 5574 for (f = 1; f < team->t.t_nproc; ++f) { 5575 kmp_info_t *thr = team->t.t_threads[f]; 5576 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5577 thr->th.th_cg_roots->cg_root == thr); 5578 // Pop current CG root off list 5579 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5580 thr->th.th_cg_roots = tmp->up; 5581 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5582 " up to node %p. cg_nthreads was %d\n", 5583 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5584 int i = tmp->cg_nthreads--; 5585 if (i == 1) { 5586 __kmp_free(tmp); // free CG if we are the last thread in it 5587 } 5588 // Restore current task's thread_limit from CG root 5589 if (thr->th.th_cg_roots) 5590 thr->th.th_current_task->td_icvs.thread_limit = 5591 thr->th.th_cg_roots->cg_thread_limit; 5592 } 5593 } 5594 } 5595 5596 KMP_MB(); 5597 } 5598 5599 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5600 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5601 kmp_team_t *next_pool = team->t.t_next_pool; 5602 5603 KMP_DEBUG_ASSERT(team); 5604 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5605 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5606 KMP_DEBUG_ASSERT(team->t.t_threads); 5607 KMP_DEBUG_ASSERT(team->t.t_argv); 5608 5609 /* TODO clean the threads that are a part of this? */ 5610 5611 /* free stuff */ 5612 __kmp_free_team_arrays(team); 5613 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5614 __kmp_free((void *)team->t.t_argv); 5615 __kmp_free(team); 5616 5617 KMP_MB(); 5618 return next_pool; 5619 } 5620 5621 // Free the thread. Don't reap it, just place it on the pool of available 5622 // threads. 5623 // 5624 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5625 // binding for the affinity mechanism to be useful. 5626 // 5627 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5628 // However, we want to avoid a potential performance problem by always 5629 // scanning through the list to find the correct point at which to insert 5630 // the thread (potential N**2 behavior). To do this we keep track of the 5631 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5632 // With single-level parallelism, threads will always be added to the tail 5633 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5634 // parallelism, all bets are off and we may need to scan through the entire 5635 // free list. 5636 // 5637 // This change also has a potentially large performance benefit, for some 5638 // applications. 
Previously, as threads were freed from the hot team, they 5639 // would be placed back on the free list in inverse order. If the hot team 5640 // grew back to it's original size, then the freed thread would be placed 5641 // back on the hot team in reverse order. This could cause bad cache 5642 // locality problems on programs where the size of the hot team regularly 5643 // grew and shrunk. 5644 // 5645 // Now, for single-level parallelism, the OMP tid is always == gtid. 5646 void __kmp_free_thread(kmp_info_t *this_th) { 5647 int gtid; 5648 kmp_info_t **scan; 5649 5650 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5651 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5652 5653 KMP_DEBUG_ASSERT(this_th); 5654 5655 // When moving thread to pool, switch thread to wait on own b_go flag, and 5656 // uninitialized (NULL team). 5657 int b; 5658 kmp_balign_t *balign = this_th->th.th_bar; 5659 for (b = 0; b < bs_last_barrier; ++b) { 5660 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5661 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5662 balign[b].bb.team = NULL; 5663 balign[b].bb.leaf_kids = 0; 5664 } 5665 this_th->th.th_task_state = 0; 5666 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5667 5668 /* put thread back on the free pool */ 5669 TCW_PTR(this_th->th.th_team, NULL); 5670 TCW_PTR(this_th->th.th_root, NULL); 5671 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5672 5673 while (this_th->th.th_cg_roots) { 5674 this_th->th.th_cg_roots->cg_nthreads--; 5675 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5676 " %p of thread %p to %d\n", 5677 this_th, this_th->th.th_cg_roots, 5678 this_th->th.th_cg_roots->cg_root, 5679 this_th->th.th_cg_roots->cg_nthreads)); 5680 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5681 if (tmp->cg_root == this_th) { // Thread is a cg_root 5682 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5683 KA_TRACE( 5684 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5685 this_th->th.th_cg_roots = tmp->up; 5686 __kmp_free(tmp); 5687 } else { // Worker thread 5688 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5689 __kmp_free(tmp); 5690 } 5691 this_th->th.th_cg_roots = NULL; 5692 break; 5693 } 5694 } 5695 5696 /* If the implicit task assigned to this thread can be used by other threads 5697 * -> multiple threads can share the data and try to free the task at 5698 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5699 * with higher probability when hot team is disabled but can occurs even when 5700 * the hot team is enabled */ 5701 __kmp_free_implicit_task(this_th); 5702 this_th->th.th_current_task = NULL; 5703 5704 // If the __kmp_thread_pool_insert_pt is already past the new insert 5705 // point, then we need to re-scan the entire list. 5706 gtid = this_th->th.th_info.ds.ds_gtid; 5707 if (__kmp_thread_pool_insert_pt != NULL) { 5708 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5709 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5710 __kmp_thread_pool_insert_pt = NULL; 5711 } 5712 } 5713 5714 // Scan down the list to find the place to insert the thread. 5715 // scan is the address of a link in the list, possibly the address of 5716 // __kmp_thread_pool itself. 5717 // 5718 // In the absence of nested parallelism, the for loop will have 0 iterations. 
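  // Illustrative walk-through (gtids are made up, not from any real run):
  // with pool 3 -> 5 -> 9 and __kmp_thread_pool_insert_pt at 5, freeing gtid 7
  // starts the scan at 5's successor (9), stops immediately (9 >= 7), and
  // links 7 in to give 3 -> 5 -> 7 -> 9 with the insert point moved to 7.
  // Freeing gtid 4 afterwards resets the insert point above (7 > 4) and the
  // scan restarts from the head of __kmp_thread_pool.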
5719 if (__kmp_thread_pool_insert_pt != NULL) { 5720 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5721 } else { 5722 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5723 } 5724 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5725 scan = &((*scan)->th.th_next_pool)) 5726 ; 5727 5728 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5729 // to its address. 5730 TCW_PTR(this_th->th.th_next_pool, *scan); 5731 __kmp_thread_pool_insert_pt = *scan = this_th; 5732 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5733 (this_th->th.th_info.ds.ds_gtid < 5734 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5735 TCW_4(this_th->th.th_in_pool, TRUE); 5736 __kmp_suspend_initialize_thread(this_th); 5737 __kmp_lock_suspend_mx(this_th); 5738 if (this_th->th.th_active == TRUE) { 5739 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5740 this_th->th.th_active_in_pool = TRUE; 5741 } 5742 #if KMP_DEBUG 5743 else { 5744 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5745 } 5746 #endif 5747 __kmp_unlock_suspend_mx(this_th); 5748 5749 TCW_4(__kmp_nth, __kmp_nth - 1); 5750 5751 #ifdef KMP_ADJUST_BLOCKTIME 5752 /* Adjust blocktime back to user setting or default if necessary */ 5753 /* Middle initialization might never have occurred */ 5754 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5755 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5756 if (__kmp_nth <= __kmp_avail_proc) { 5757 __kmp_zero_bt = FALSE; 5758 } 5759 } 5760 #endif /* KMP_ADJUST_BLOCKTIME */ 5761 5762 KMP_MB(); 5763 } 5764 5765 /* ------------------------------------------------------------------------ */ 5766 5767 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5768 #if OMP_PROFILING_SUPPORT 5769 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5770 // TODO: add a configuration option for time granularity 5771 if (ProfileTraceFile) 5772 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5773 #endif 5774 5775 int gtid = this_thr->th.th_info.ds.ds_gtid; 5776 /* void *stack_data;*/ 5777 kmp_team_t **volatile pteam; 5778 5779 KMP_MB(); 5780 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5781 5782 if (__kmp_env_consistency_check) { 5783 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
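    // (The stack allocated here is released in __kmp_reap_thread(), which
    // calls __kmp_free_cons_stack() when the thread is finally reaped.)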
5784 } 5785 5786 #if OMPD_SUPPORT 5787 if (ompd_state & OMPD_ENABLE_BP) 5788 ompd_bp_thread_begin(); 5789 #endif 5790 5791 #if OMPT_SUPPORT 5792 ompt_data_t *thread_data = nullptr; 5793 if (ompt_enabled.enabled) { 5794 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5795 *thread_data = ompt_data_none; 5796 5797 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5798 this_thr->th.ompt_thread_info.wait_id = 0; 5799 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5800 this_thr->th.ompt_thread_info.parallel_flags = 0; 5801 if (ompt_enabled.ompt_callback_thread_begin) { 5802 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5803 ompt_thread_worker, thread_data); 5804 } 5805 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5806 } 5807 #endif 5808 5809 /* This is the place where threads wait for work */ 5810 while (!TCR_4(__kmp_global.g.g_done)) { 5811 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5812 KMP_MB(); 5813 5814 /* wait for work to do */ 5815 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5816 5817 /* No tid yet since not part of a team */ 5818 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5819 5820 #if OMPT_SUPPORT 5821 if (ompt_enabled.enabled) { 5822 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5823 } 5824 #endif 5825 5826 pteam = &this_thr->th.th_team; 5827 5828 /* have we been allocated? */ 5829 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5830 /* we were just woken up, so run our new task */ 5831 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5832 int rc; 5833 KA_TRACE(20, 5834 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5835 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5836 (*pteam)->t.t_pkfn)); 5837 5838 updateHWFPControl(*pteam); 5839 5840 #if OMPT_SUPPORT 5841 if (ompt_enabled.enabled) { 5842 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5843 } 5844 #endif 5845 5846 rc = (*pteam)->t.t_invoke(gtid); 5847 KMP_ASSERT(rc); 5848 5849 KMP_MB(); 5850 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5851 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5852 (*pteam)->t.t_pkfn)); 5853 } 5854 #if OMPT_SUPPORT 5855 if (ompt_enabled.enabled) { 5856 /* no frame set while outside task */ 5857 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5858 5859 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5860 } 5861 #endif 5862 /* join barrier after parallel region */ 5863 __kmp_join_barrier(gtid); 5864 } 5865 } 5866 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5867 5868 #if OMPD_SUPPORT 5869 if (ompd_state & OMPD_ENABLE_BP) 5870 ompd_bp_thread_end(); 5871 #endif 5872 5873 #if OMPT_SUPPORT 5874 if (ompt_enabled.ompt_callback_thread_end) { 5875 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5876 } 5877 #endif 5878 5879 this_thr->th.th_task_team = NULL; 5880 /* run the destructors for the threadprivate data for this thread */ 5881 __kmp_common_destroy_gtid(gtid); 5882 5883 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5884 KMP_MB(); 5885 5886 #if OMP_PROFILING_SUPPORT 5887 llvm::timeTraceProfilerFinishThread(); 5888 #endif 5889 return this_thr; 5890 } 5891 5892 /* ------------------------------------------------------------------------ */ 5893 5894 void __kmp_internal_end_dest(void *specific_gtid) { 5895 // Make sure no significant bits are lost 5896 int gtid; 5897 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5898 5899 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 5900 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5901 * this is because 0 is reserved for the nothing-stored case */ 5902 5903 __kmp_internal_end_thread(gtid); 5904 } 5905 5906 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5907 5908 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5909 __kmp_internal_end_atexit(); 5910 } 5911 5912 #endif 5913 5914 /* [Windows] josh: when the atexit handler is called, there may still be more 5915 than one thread alive */ 5916 void __kmp_internal_end_atexit(void) { 5917 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5918 /* [Windows] 5919 josh: ideally, we want to completely shutdown the library in this atexit 5920 handler, but stat code that depends on thread specific data for gtid fails 5921 because that data becomes unavailable at some point during the shutdown, so 5922 we call __kmp_internal_end_thread instead. We should eventually remove the 5923 dependency on __kmp_get_specific_gtid in the stat code and use 5924 __kmp_internal_end_library to cleanly shutdown the library. 5925 5926 // TODO: Can some of this comment about GVS be removed? 5927 I suspect that the offending stat code is executed when the calling thread 5928 tries to clean up a dead root thread's data structures, resulting in GVS 5929 code trying to close the GVS structures for that thread, but since the stat 5930 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5931 the calling thread is cleaning up itself instead of another thread, it get 5932 confused. This happens because allowing a thread to unregister and cleanup 5933 another thread is a recent modification for addressing an issue. 5934 Based on the current design (20050722), a thread may end up 5935 trying to unregister another thread only if thread death does not trigger 5936 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5937 thread specific data destructor function to detect thread death. For 5938 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5939 is nothing. Thus, the workaround is applicable only for Windows static 5940 stat library. */ 5941 __kmp_internal_end_library(-1); 5942 #if KMP_OS_WINDOWS 5943 __kmp_close_console(); 5944 #endif 5945 } 5946 5947 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5948 // It is assumed __kmp_forkjoin_lock is acquired. 5949 5950 int gtid; 5951 5952 KMP_DEBUG_ASSERT(thread != NULL); 5953 5954 gtid = thread->th.th_info.ds.ds_gtid; 5955 5956 if (!is_root) { 5957 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5958 /* Assume the threads are at the fork barrier here */ 5959 KA_TRACE( 5960 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5961 gtid)); 5962 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5963 * (GEH) */ 5964 ANNOTATE_HAPPENS_BEFORE(thread); 5965 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5966 thread); 5967 __kmp_release_64(&flag); 5968 } 5969 5970 // Terminate OS thread. 5971 __kmp_reap_worker(thread); 5972 5973 // The thread was killed asynchronously. If it was actively 5974 // spinning in the thread pool, decrement the global count. 5975 // 5976 // There is a small timing hole here - if the worker thread was just waking 5977 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5978 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5979 // the global counter might not get updated. 
5980 // 5981 // Currently, this can only happen as the library is unloaded, 5982 // so there are no harmful side effects. 5983 if (thread->th.th_active_in_pool) { 5984 thread->th.th_active_in_pool = FALSE; 5985 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5986 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5987 } 5988 } 5989 5990 __kmp_free_implicit_task(thread); 5991 5992 // Free the fast memory for tasking 5993 #if USE_FAST_MEMORY 5994 __kmp_free_fast_memory(thread); 5995 #endif /* USE_FAST_MEMORY */ 5996 5997 __kmp_suspend_uninitialize_thread(thread); 5998 5999 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6000 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6001 6002 --__kmp_all_nth; 6003 // __kmp_nth was decremented when thread is added to the pool. 6004 6005 #ifdef KMP_ADJUST_BLOCKTIME 6006 /* Adjust blocktime back to user setting or default if necessary */ 6007 /* Middle initialization might never have occurred */ 6008 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6009 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6010 if (__kmp_nth <= __kmp_avail_proc) { 6011 __kmp_zero_bt = FALSE; 6012 } 6013 } 6014 #endif /* KMP_ADJUST_BLOCKTIME */ 6015 6016 /* free the memory being used */ 6017 if (__kmp_env_consistency_check) { 6018 if (thread->th.th_cons) { 6019 __kmp_free_cons_stack(thread->th.th_cons); 6020 thread->th.th_cons = NULL; 6021 } 6022 } 6023 6024 if (thread->th.th_pri_common != NULL) { 6025 __kmp_free(thread->th.th_pri_common); 6026 thread->th.th_pri_common = NULL; 6027 } 6028 6029 if (thread->th.th_task_state_memo_stack != NULL) { 6030 __kmp_free(thread->th.th_task_state_memo_stack); 6031 thread->th.th_task_state_memo_stack = NULL; 6032 } 6033 6034 #if KMP_USE_BGET 6035 if (thread->th.th_local.bget_data != NULL) { 6036 __kmp_finalize_bget(thread); 6037 } 6038 #endif 6039 6040 #if KMP_AFFINITY_SUPPORTED 6041 if (thread->th.th_affin_mask != NULL) { 6042 KMP_CPU_FREE(thread->th.th_affin_mask); 6043 thread->th.th_affin_mask = NULL; 6044 } 6045 #endif /* KMP_AFFINITY_SUPPORTED */ 6046 6047 #if KMP_USE_HIER_SCHED 6048 if (thread->th.th_hier_bar_data != NULL) { 6049 __kmp_free(thread->th.th_hier_bar_data); 6050 thread->th.th_hier_bar_data = NULL; 6051 } 6052 #endif 6053 6054 __kmp_reap_team(thread->th.th_serial_team); 6055 thread->th.th_serial_team = NULL; 6056 __kmp_free(thread); 6057 6058 KMP_MB(); 6059 6060 } // __kmp_reap_thread 6061 6062 static void __kmp_internal_end(void) { 6063 int i; 6064 6065 /* First, unregister the library */ 6066 __kmp_unregister_library(); 6067 6068 #if KMP_OS_WINDOWS 6069 /* In Win static library, we can't tell when a root actually dies, so we 6070 reclaim the data structures for any root threads that have died but not 6071 unregistered themselves, in order to shut down cleanly. 6072 In Win dynamic library we also can't tell when a thread dies. */ 6073 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6074 // dead roots 6075 #endif 6076 6077 for (i = 0; i < __kmp_threads_capacity; i++) 6078 if (__kmp_root[i]) 6079 if (__kmp_root[i]->r.r_active) 6080 break; 6081 KMP_MB(); /* Flush all pending memory write invalidates. */ 6082 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6083 6084 if (i < __kmp_threads_capacity) { 6085 #if KMP_USE_MONITOR 6086 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6087 KMP_MB(); /* Flush all pending memory write invalidates. */ 6088 6089 // Need to check that monitor was initialized before reaping it. 
If we are 6090 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6091 // __kmp_monitor will appear to contain valid data, but it is only valid in 6092 // the parent process, not the child. 6093 // New behavior (201008): instead of keying off of the flag 6094 // __kmp_init_parallel, the monitor thread creation is keyed off 6095 // of the new flag __kmp_init_monitor. 6096 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6097 if (TCR_4(__kmp_init_monitor)) { 6098 __kmp_reap_monitor(&__kmp_monitor); 6099 TCW_4(__kmp_init_monitor, 0); 6100 } 6101 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6102 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6103 #endif // KMP_USE_MONITOR 6104 } else { 6105 /* TODO move this to cleanup code */ 6106 #ifdef KMP_DEBUG 6107 /* make sure that everything has properly ended */ 6108 for (i = 0; i < __kmp_threads_capacity; i++) { 6109 if (__kmp_root[i]) { 6110 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6111 // there can be uber threads alive here 6112 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6113 } 6114 } 6115 #endif 6116 6117 KMP_MB(); 6118 6119 // Reap the worker threads. 6120 // This is valid for now, but be careful if threads are reaped sooner. 6121 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6122 // Get the next thread from the pool. 6123 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6124 __kmp_thread_pool = thread->th.th_next_pool; 6125 // Reap it. 6126 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6127 thread->th.th_next_pool = NULL; 6128 thread->th.th_in_pool = FALSE; 6129 __kmp_reap_thread(thread, 0); 6130 } 6131 __kmp_thread_pool_insert_pt = NULL; 6132 6133 // Reap teams. 6134 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6135 // Get the next team from the pool. 6136 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6137 __kmp_team_pool = team->t.t_next_pool; 6138 // Reap it. 6139 team->t.t_next_pool = NULL; 6140 __kmp_reap_team(team); 6141 } 6142 6143 __kmp_reap_task_teams(); 6144 6145 #if KMP_OS_UNIX 6146 // Threads that are not reaped should not access any resources since they 6147 // are going to be deallocated soon, so the shutdown sequence should wait 6148 // until all threads either exit the final spin-waiting loop or begin 6149 // sleeping after the given blocktime. 6150 for (i = 0; i < __kmp_threads_capacity; i++) { 6151 kmp_info_t *thr = __kmp_threads[i]; 6152 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6153 KMP_CPU_PAUSE(); 6154 } 6155 #endif 6156 6157 for (i = 0; i < __kmp_threads_capacity; ++i) { 6158 // TBD: Add some checking... 6159 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6160 } 6161 6162 /* Make sure all threadprivate destructors get run by joining with all 6163 worker threads before resetting this flag */ 6164 TCW_SYNC_4(__kmp_init_common, FALSE); 6165 6166 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6167 KMP_MB(); 6168 6169 #if KMP_USE_MONITOR 6170 // See note above: One of the possible fixes for CQ138434 / CQ140126 6171 // 6172 // FIXME: push both code fragments down and CSE them? 6173 // push them into __kmp_cleanup() ? 
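    // Same shutdown sequence as the monitor teardown in the branch above:
    // take __kmp_monitor_lock, reap only if __kmp_init_monitor is still set,
    // then clear the flag.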
6174 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6175 if (TCR_4(__kmp_init_monitor)) { 6176 __kmp_reap_monitor(&__kmp_monitor); 6177 TCW_4(__kmp_init_monitor, 0); 6178 } 6179 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6180 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6181 #endif 6182 } /* else !__kmp_global.t_active */ 6183 TCW_4(__kmp_init_gtid, FALSE); 6184 KMP_MB(); /* Flush all pending memory write invalidates. */ 6185 6186 __kmp_cleanup(); 6187 #if OMPT_SUPPORT 6188 ompt_fini(); 6189 #endif 6190 } 6191 6192 void __kmp_internal_end_library(int gtid_req) { 6193 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6194 /* this shouldn't be a race condition because __kmp_internal_end() is the 6195 only place to clear __kmp_serial_init */ 6196 /* we'll check this later too, after we get the lock */ 6197 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6198 // redundant, because the next check will work in any case. 6199 if (__kmp_global.g.g_abort) { 6200 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6201 /* TODO abort? */ 6202 return; 6203 } 6204 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6205 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6206 return; 6207 } 6208 6209 KMP_MB(); /* Flush all pending memory write invalidates. */ 6210 /* find out who we are and what we should do */ 6211 { 6212 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6213 KA_TRACE( 6214 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6215 if (gtid == KMP_GTID_SHUTDOWN) { 6216 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6217 "already shutdown\n")); 6218 return; 6219 } else if (gtid == KMP_GTID_MONITOR) { 6220 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6221 "registered, or system shutdown\n")); 6222 return; 6223 } else if (gtid == KMP_GTID_DNE) { 6224 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6225 "shutdown\n")); 6226 /* we don't know who we are, but we may still shutdown the library */ 6227 } else if (KMP_UBER_GTID(gtid)) { 6228 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6229 if (__kmp_root[gtid]->r.r_active) { 6230 __kmp_global.g.g_abort = -1; 6231 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6232 __kmp_unregister_library(); 6233 KA_TRACE(10, 6234 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6235 gtid)); 6236 return; 6237 } else { 6238 KA_TRACE( 6239 10, 6240 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6241 __kmp_unregister_root_current_thread(gtid); 6242 } 6243 } else { 6244 /* worker threads may call this function through the atexit handler, if they 6245 * call exit() */ 6246 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6247 TODO: do a thorough shutdown instead */ 6248 #ifdef DUMP_DEBUG_ON_EXIT 6249 if (__kmp_debug_buf) 6250 __kmp_dump_debug_buffer(); 6251 #endif 6252 // added unregister library call here when we switch to shm linux 6253 // if we don't, it will leave lots of files in /dev/shm 6254 // cleanup shared memory file before exiting. 
6255 __kmp_unregister_library(); 6256 return; 6257 } 6258 } 6259 /* synchronize the termination process */ 6260 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6261 6262 /* have we already finished */ 6263 if (__kmp_global.g.g_abort) { 6264 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6265 /* TODO abort? */ 6266 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6267 return; 6268 } 6269 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6270 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6271 return; 6272 } 6273 6274 /* We need this lock to enforce mutex between this reading of 6275 __kmp_threads_capacity and the writing by __kmp_register_root. 6276 Alternatively, we can use a counter of roots that is atomically updated by 6277 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6278 __kmp_internal_end_*. */ 6279 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6280 6281 /* now we can safely conduct the actual termination */ 6282 __kmp_internal_end(); 6283 6284 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6285 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6286 6287 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6288 6289 #ifdef DUMP_DEBUG_ON_EXIT 6290 if (__kmp_debug_buf) 6291 __kmp_dump_debug_buffer(); 6292 #endif 6293 6294 #if KMP_OS_WINDOWS 6295 __kmp_close_console(); 6296 #endif 6297 6298 __kmp_fini_allocator(); 6299 6300 } // __kmp_internal_end_library 6301 6302 void __kmp_internal_end_thread(int gtid_req) { 6303 int i; 6304 6305 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6306 /* this shouldn't be a race condition because __kmp_internal_end() is the 6307 * only place to clear __kmp_serial_init */ 6308 /* we'll check this later too, after we get the lock */ 6309 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6310 // redundant, because the next check will work in any case. 6311 if (__kmp_global.g.g_abort) { 6312 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6313 /* TODO abort? */ 6314 return; 6315 } 6316 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6317 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6318 return; 6319 } 6320 6321 // If hidden helper team has been initialized, we need to deinit it 6322 if (TCR_4(__kmp_init_hidden_helper)) { 6323 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6324 // First release the main thread to let it continue its work 6325 __kmp_hidden_helper_main_thread_release(); 6326 // Wait until the hidden helper team has been destroyed 6327 __kmp_hidden_helper_threads_deinitz_wait(); 6328 } 6329 6330 KMP_MB(); /* Flush all pending memory write invalidates. */ 6331 6332 /* find out who we are and what we should do */ 6333 { 6334 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6335 KA_TRACE(10, 6336 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6337 if (gtid == KMP_GTID_SHUTDOWN) { 6338 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6339 "already shutdown\n")); 6340 return; 6341 } else if (gtid == KMP_GTID_MONITOR) { 6342 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6343 "registered, or system shutdown\n")); 6344 return; 6345 } else if (gtid == KMP_GTID_DNE) { 6346 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6347 "shutdown\n")); 6348 return; 6349 /* we don't know who we are */ 6350 } else if (KMP_UBER_GTID(gtid)) { 6351 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6352 if (__kmp_root[gtid]->r.r_active) { 6353 __kmp_global.g.g_abort = -1; 6354 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6355 KA_TRACE(10, 6356 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6357 gtid)); 6358 return; 6359 } else { 6360 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6361 gtid)); 6362 __kmp_unregister_root_current_thread(gtid); 6363 } 6364 } else { 6365 /* just a worker thread, let's leave */ 6366 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6367 6368 if (gtid >= 0) { 6369 __kmp_threads[gtid]->th.th_task_team = NULL; 6370 } 6371 6372 KA_TRACE(10, 6373 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6374 gtid)); 6375 return; 6376 } 6377 } 6378 #if KMP_DYNAMIC_LIB 6379 if (__kmp_pause_status != kmp_hard_paused) 6380 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6381 // because we will better shutdown later in the library destructor. 6382 { 6383 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6384 return; 6385 } 6386 #endif 6387 /* synchronize the termination process */ 6388 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6389 6390 /* have we already finished */ 6391 if (__kmp_global.g.g_abort) { 6392 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6393 /* TODO abort? */ 6394 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6395 return; 6396 } 6397 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6398 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6399 return; 6400 } 6401 6402 /* We need this lock to enforce mutex between this reading of 6403 __kmp_threads_capacity and the writing by __kmp_register_root. 6404 Alternatively, we can use a counter of roots that is atomically updated by 6405 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6406 __kmp_internal_end_*. */ 6407 6408 /* should we finish the run-time? are all siblings done? */ 6409 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6410 6411 for (i = 0; i < __kmp_threads_capacity; ++i) { 6412 if (KMP_UBER_GTID(i)) { 6413 KA_TRACE( 6414 10, 6415 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6416 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6417 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6418 return; 6419 } 6420 } 6421 6422 /* now we can safely conduct the actual termination */ 6423 6424 __kmp_internal_end(); 6425 6426 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6427 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6428 6429 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6430 6431 #ifdef DUMP_DEBUG_ON_EXIT 6432 if (__kmp_debug_buf) 6433 __kmp_dump_debug_buffer(); 6434 #endif 6435 } // __kmp_internal_end_thread 6436 6437 // ----------------------------------------------------------------------------- 6438 // Library registration stuff. 6439 6440 static long __kmp_registration_flag = 0; 6441 // Random value used to indicate library initialization. 6442 static char *__kmp_registration_str = NULL; 6443 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6444 6445 static inline char *__kmp_reg_status_name() { 6446 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6447 each thread. If registration and unregistration go in different threads 6448 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6449 env var can not be found, because the name will contain different pid. 
*/
6450 // macOS* complains about the name being too long with the additional getuid()
6451 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB
6452 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(),
6453 (int)getuid());
6454 #else
6455 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
6456 #endif
6457 } // __kmp_reg_status_name
6458
6459 void __kmp_register_library_startup(void) {
6460
6461 char *name = __kmp_reg_status_name(); // Name of the environment variable.
6462 int done = 0;
6463 union {
6464 double dtime;
6465 long ltime;
6466 } time;
6467 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6468 __kmp_initialize_system_tick();
6469 #endif
6470 __kmp_read_system_time(&time.dtime);
6471 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6472 __kmp_registration_str =
6473 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6474 __kmp_registration_flag, KMP_LIBRARY_FILE);
6475
6476 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6477 __kmp_registration_str));
6478
6479 while (!done) {
6480
6481 char *value = NULL; // Actual value of the environment variable.
6482
6483 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library
6484 char *shm_name = __kmp_str_format("/%s", name);
6485 int shm_preexist = 0;
6486 char *data1;
6487 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666);
6488 if ((fd1 == -1) && (errno == EEXIST)) {
6489 // file didn't open because it already exists;
6490 // try opening the existing file
6491 fd1 = shm_open(shm_name, O_RDWR, 0666);
6492 if (fd1 == -1) { // file didn't open
6493 // error out here
6494 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0),
6495 __kmp_msg_null);
6496 } else {
6497 // able to open existing file
6498 shm_preexist = 1;
6499 }
6500 } else if (fd1 == -1) { // SHM didn't open due to an error other than
6501 // "already exists".
6502 // error out here.
6503 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
6504 __kmp_msg_null);
6505 }
6506 if (shm_preexist == 0) {
6507 // we created the SHM; now set its size
6508 if (ftruncate(fd1, SHM_SIZE) == -1) {
6509 // error occurred setting size;
6510 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
6511 KMP_ERR(errno), __kmp_msg_null);
6512 }
6513 }
6514 data1 =
6515 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
6516 if (data1 == MAP_FAILED) {
6517 // failed to map shared memory
6518 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
6519 __kmp_msg_null);
6520 }
6521 if (shm_preexist == 0) { // set data to SHM, set value
6522 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
6523 }
6524 // Read value from either what we just wrote or the existing file.
6525 value = __kmp_str_format("%s", data1); // read value from SHM
6526 munmap(data1, SHM_SIZE);
6527 close(fd1);
6528 #else // Windows and unix with static library
6529 // Set the environment variable, but do not overwrite it if it already exists.
6530 __kmp_env_set(name, __kmp_registration_str, 0);
6531 // read value to see if it got set
6532 value = __kmp_env_get(name);
6533 #endif
6534
6535 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6536 done = 1; // Ok, environment variable set successfully, exit the loop.
6537 } else {
6538 // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6539 // Check whether it is alive or dead.
6540 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
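// Illustrative note (not part of the runtime logic): given the "%p-%lx-%s"
// format used above, a stored registration value might look like
//   0x7f31c02a1d40-cafe5b7d-libomp.so
// (the address, flag value and file name here are made-up examples). The code
// below splits such a value on '-' to recover the flag address, the flag
// value and the library file name, and then decides whether the library that
// wrote it is still alive.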
6541 char *tail = value; 6542 char *flag_addr_str = NULL; 6543 char *flag_val_str = NULL; 6544 char const *file_name = NULL; 6545 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6546 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6547 file_name = tail; 6548 if (tail != NULL) { 6549 long *flag_addr = 0; 6550 unsigned long flag_val = 0; 6551 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6552 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6553 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6554 // First, check whether environment-encoded address is mapped into 6555 // addr space. 6556 // If so, dereference it to see if it still has the right value. 6557 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6558 neighbor = 1; 6559 } else { 6560 // If not, then we know the other copy of the library is no longer 6561 // running. 6562 neighbor = 2; 6563 } 6564 } 6565 } 6566 switch (neighbor) { 6567 case 0: // Cannot parse environment variable -- neighbor status unknown. 6568 // Assume it is the incompatible format of future version of the 6569 // library. Assume the other library is alive. 6570 // WARN( ... ); // TODO: Issue a warning. 6571 file_name = "unknown library"; 6572 KMP_FALLTHROUGH(); 6573 // Attention! Falling to the next case. That's intentional. 6574 case 1: { // Neighbor is alive. 6575 // Check it is allowed. 6576 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6577 if (!__kmp_str_match_true(duplicate_ok)) { 6578 // That's not allowed. Issue fatal error. 6579 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6580 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6581 } 6582 KMP_INTERNAL_FREE(duplicate_ok); 6583 __kmp_duplicate_library_ok = 1; 6584 done = 1; // Exit the loop. 6585 } break; 6586 case 2: { // Neighbor is dead. 6587 6588 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6589 // close shared memory. 6590 shm_unlink(shm_name); // this removes file in /dev/shm 6591 #else 6592 // Clear the variable and try to register library again. 6593 __kmp_env_unset(name); 6594 #endif 6595 } break; 6596 default: { 6597 KMP_DEBUG_ASSERT(0); 6598 } break; 6599 } 6600 } 6601 KMP_INTERNAL_FREE((void *)value); 6602 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6603 KMP_INTERNAL_FREE((void *)shm_name); 6604 #endif 6605 } // while 6606 KMP_INTERNAL_FREE((void *)name); 6607 6608 } // func __kmp_register_library_startup 6609 6610 void __kmp_unregister_library(void) { 6611 6612 char *name = __kmp_reg_status_name(); 6613 char *value = NULL; 6614 6615 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6616 char *shm_name = __kmp_str_format("/%s", name); 6617 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6618 if (fd1 == -1) { 6619 // file did not open. return. 6620 return; 6621 } 6622 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6623 if (data1 != MAP_FAILED) { 6624 value = __kmp_str_format("%s", data1); // read value from SHM 6625 munmap(data1, SHM_SIZE); 6626 } 6627 close(fd1); 6628 #else 6629 value = __kmp_env_get(name); 6630 #endif 6631 6632 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6633 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6634 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6635 // Ok, this is our variable. Delete it. 
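// (A match is only possible for the copy of the library that wrote the
// value, since the string embeds the address and value of this image's own
// __kmp_registration_flag.)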
6636 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6637 shm_unlink(shm_name); // this removes file in /dev/shm 6638 #else 6639 __kmp_env_unset(name); 6640 #endif 6641 } 6642 6643 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6644 KMP_INTERNAL_FREE(shm_name); 6645 #endif 6646 6647 KMP_INTERNAL_FREE(__kmp_registration_str); 6648 KMP_INTERNAL_FREE(value); 6649 KMP_INTERNAL_FREE(name); 6650 6651 __kmp_registration_flag = 0; 6652 __kmp_registration_str = NULL; 6653 6654 } // __kmp_unregister_library 6655 6656 // End of Library registration stuff. 6657 // ----------------------------------------------------------------------------- 6658 6659 #if KMP_MIC_SUPPORTED 6660 6661 static void __kmp_check_mic_type() { 6662 kmp_cpuid_t cpuid_state = {0}; 6663 kmp_cpuid_t *cs_p = &cpuid_state; 6664 __kmp_x86_cpuid(1, 0, cs_p); 6665 // We don't support mic1 at the moment 6666 if ((cs_p->eax & 0xff0) == 0xB10) { 6667 __kmp_mic_type = mic2; 6668 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6669 __kmp_mic_type = mic3; 6670 } else { 6671 __kmp_mic_type = non_mic; 6672 } 6673 } 6674 6675 #endif /* KMP_MIC_SUPPORTED */ 6676 6677 #if KMP_HAVE_UMWAIT 6678 static void __kmp_user_level_mwait_init() { 6679 struct kmp_cpuid buf; 6680 __kmp_x86_cpuid(7, 0, &buf); 6681 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6682 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6683 __kmp_umwait_enabled)); 6684 } 6685 #elif KMP_HAVE_MWAIT 6686 #ifndef AT_INTELPHIUSERMWAIT 6687 // Spurious, non-existent value that should always fail to return anything. 6688 // Will be replaced with the correct value when we know that. 6689 #define AT_INTELPHIUSERMWAIT 10000 6690 #endif 6691 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6692 // earlier OS is used to build the RTL, we'll use the following internal 6693 // function when the entry is not found. 6694 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6695 unsigned long getauxval(unsigned long) { return 0; } 6696 6697 static void __kmp_user_level_mwait_init() { 6698 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6699 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6700 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6701 // KMP_USER_LEVEL_MWAIT was set to TRUE. 
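// For example, if the environment provides no real getauxval() entry, the
// weak definition above is used and always returns 0, so on mic3 the only way
// to enable mwait here is an explicit KMP_USER_LEVEL_MWAIT=TRUE setting.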
6702 if (__kmp_mic_type == mic3) { 6703 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6704 if ((res & 0x1) || __kmp_user_level_mwait) { 6705 __kmp_mwait_enabled = TRUE; 6706 if (__kmp_user_level_mwait) { 6707 KMP_INFORM(EnvMwaitWarn); 6708 } 6709 } else { 6710 __kmp_mwait_enabled = FALSE; 6711 } 6712 } 6713 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6714 "__kmp_mwait_enabled = %d\n", 6715 __kmp_mic_type, __kmp_mwait_enabled)); 6716 } 6717 #endif /* KMP_HAVE_UMWAIT */ 6718 6719 static void __kmp_do_serial_initialize(void) { 6720 int i, gtid; 6721 size_t size; 6722 6723 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6724 6725 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6726 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6727 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6728 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6729 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6730 6731 #if OMPT_SUPPORT 6732 ompt_pre_init(); 6733 #endif 6734 #if OMPD_SUPPORT 6735 __kmp_env_dump(); 6736 ompd_init(); 6737 #endif 6738 6739 __kmp_validate_locks(); 6740 6741 /* Initialize internal memory allocator */ 6742 __kmp_init_allocator(); 6743 6744 /* Register the library startup via an environment variable and check to see 6745 whether another copy of the library is already registered. */ 6746 6747 __kmp_register_library_startup(); 6748 6749 /* TODO reinitialization of library */ 6750 if (TCR_4(__kmp_global.g.g_done)) { 6751 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6752 } 6753 6754 __kmp_global.g.g_abort = 0; 6755 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6756 6757 /* initialize the locks */ 6758 #if KMP_USE_ADAPTIVE_LOCKS 6759 #if KMP_DEBUG_ADAPTIVE_LOCKS 6760 __kmp_init_speculative_stats(); 6761 #endif 6762 #endif 6763 #if KMP_STATS_ENABLED 6764 __kmp_stats_init(); 6765 #endif 6766 __kmp_init_lock(&__kmp_global_lock); 6767 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6768 __kmp_init_lock(&__kmp_debug_lock); 6769 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6770 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6771 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6772 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6773 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6774 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6775 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6776 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6777 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6778 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6779 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6780 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6781 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6782 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6783 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6784 #if KMP_USE_MONITOR 6785 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6786 #endif 6787 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6788 6789 /* conduct initialization and initial setup of configuration */ 6790 6791 __kmp_runtime_initialize(); 6792 6793 #if KMP_MIC_SUPPORTED 6794 __kmp_check_mic_type(); 6795 #endif 6796 6797 // Some global variable initialization moved here from kmp_env_initialize() 6798 #ifdef KMP_DEBUG 6799 kmp_diag = 0; 6800 #endif 6801 __kmp_abort_delay = 0; 6802 6803 // From __kmp_init_dflt_team_nth() 6804 /* assume the entire machine will be used */ 6805 __kmp_dflt_team_nth_ub = __kmp_xproc; 6806 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6807 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6808 } 6809 if (__kmp_dflt_team_nth_ub > 
__kmp_sys_max_nth) { 6810 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6811 } 6812 __kmp_max_nth = __kmp_sys_max_nth; 6813 __kmp_cg_max_nth = __kmp_sys_max_nth; 6814 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6815 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6816 __kmp_teams_max_nth = __kmp_sys_max_nth; 6817 } 6818 6819 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6820 // part 6821 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6822 #if KMP_USE_MONITOR 6823 __kmp_monitor_wakeups = 6824 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6825 __kmp_bt_intervals = 6826 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6827 #endif 6828 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6829 __kmp_library = library_throughput; 6830 // From KMP_SCHEDULE initialization 6831 __kmp_static = kmp_sch_static_balanced; 6832 // AC: do not use analytical here, because it is non-monotonous 6833 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6834 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6835 // need to repeat assignment 6836 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6837 // bit control and barrier method control parts 6838 #if KMP_FAST_REDUCTION_BARRIER 6839 #define kmp_reduction_barrier_gather_bb ((int)1) 6840 #define kmp_reduction_barrier_release_bb ((int)1) 6841 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6842 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6843 #endif // KMP_FAST_REDUCTION_BARRIER 6844 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6845 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6846 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6847 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6848 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6849 #if KMP_FAST_REDUCTION_BARRIER 6850 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6851 // lin_64 ): hyper,1 6852 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6853 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6854 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6855 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6856 } 6857 #endif // KMP_FAST_REDUCTION_BARRIER 6858 } 6859 #if KMP_FAST_REDUCTION_BARRIER 6860 #undef kmp_reduction_barrier_release_pat 6861 #undef kmp_reduction_barrier_gather_pat 6862 #undef kmp_reduction_barrier_release_bb 6863 #undef kmp_reduction_barrier_gather_bb 6864 #endif // KMP_FAST_REDUCTION_BARRIER 6865 #if KMP_MIC_SUPPORTED 6866 if (__kmp_mic_type == mic2) { // KNC 6867 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6868 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6869 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6870 1; // forkjoin release 6871 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6872 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6873 } 6874 #if KMP_FAST_REDUCTION_BARRIER 6875 if (__kmp_mic_type == mic2) { // KNC 6876 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6877 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6878 } 6879 #endif // KMP_FAST_REDUCTION_BARRIER 6880 #endif // KMP_MIC_SUPPORTED 6881 6882 // From KMP_CHECKS initialization 6883 
#ifdef KMP_DEBUG 6884 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6885 #else 6886 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6887 #endif 6888 6889 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6890 __kmp_foreign_tp = TRUE; 6891 6892 __kmp_global.g.g_dynamic = FALSE; 6893 __kmp_global.g.g_dynamic_mode = dynamic_default; 6894 6895 __kmp_init_nesting_mode(); 6896 6897 __kmp_env_initialize(NULL); 6898 6899 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6900 __kmp_user_level_mwait_init(); 6901 #endif 6902 // Print all messages in message catalog for testing purposes. 6903 #ifdef KMP_DEBUG 6904 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6905 if (__kmp_str_match_true(val)) { 6906 kmp_str_buf_t buffer; 6907 __kmp_str_buf_init(&buffer); 6908 __kmp_i18n_dump_catalog(&buffer); 6909 __kmp_printf("%s", buffer.str); 6910 __kmp_str_buf_free(&buffer); 6911 } 6912 __kmp_env_free(&val); 6913 #endif 6914 6915 __kmp_threads_capacity = 6916 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6917 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6918 __kmp_tp_capacity = __kmp_default_tp_capacity( 6919 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6920 6921 // If the library is shut down properly, both pools must be NULL. Just in 6922 // case, set them to NULL -- some memory may leak, but subsequent code will 6923 // work even if pools are not freed. 6924 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6925 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6926 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6927 __kmp_thread_pool = NULL; 6928 __kmp_thread_pool_insert_pt = NULL; 6929 __kmp_team_pool = NULL; 6930 6931 /* Allocate all of the variable sized records */ 6932 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6933 * expandable */ 6934 /* Since allocation is cache-aligned, just add extra padding at the end */ 6935 size = 6936 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6937 CACHE_LINE; 6938 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6939 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6940 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6941 6942 /* init thread counts */ 6943 KMP_DEBUG_ASSERT(__kmp_all_nth == 6944 0); // Asserts fail if the library is reinitializing and 6945 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6946 __kmp_all_nth = 0; 6947 __kmp_nth = 0; 6948 6949 /* setup the uber master thread and hierarchy */ 6950 gtid = __kmp_register_root(TRUE); 6951 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6952 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6953 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6954 6955 KMP_MB(); /* Flush all pending memory write invalidates. */ 6956 6957 __kmp_common_initialize(); 6958 6959 #if KMP_OS_UNIX 6960 /* invoke the child fork handler */ 6961 __kmp_register_atfork(); 6962 #endif 6963 6964 #if !KMP_DYNAMIC_LIB 6965 { 6966 /* Invoke the exit handler when the program finishes, only for static 6967 library. For dynamic library, we already have _fini and DllMain. */ 6968 int rc = atexit(__kmp_internal_end_atexit); 6969 if (rc != 0) { 6970 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6971 __kmp_msg_null); 6972 } 6973 } 6974 #endif 6975 6976 #if KMP_HANDLE_SIGNALS 6977 #if KMP_OS_UNIX 6978 /* NOTE: make sure that this is called before the user installs their own 6979 signal handlers so that the user handlers are called first. 
this way they 6980 can return false, not call our handler, avoid terminating the library, and 6981 continue execution where they left off. */ 6982 __kmp_install_signals(FALSE); 6983 #endif /* KMP_OS_UNIX */ 6984 #if KMP_OS_WINDOWS 6985 __kmp_install_signals(TRUE); 6986 #endif /* KMP_OS_WINDOWS */ 6987 #endif 6988 6989 /* we have finished the serial initialization */ 6990 __kmp_init_counter++; 6991 6992 __kmp_init_serial = TRUE; 6993 6994 if (__kmp_settings) { 6995 __kmp_env_print(); 6996 } 6997 6998 if (__kmp_display_env || __kmp_display_env_verbose) { 6999 __kmp_env_print_2(); 7000 } 7001 7002 #if OMPT_SUPPORT 7003 ompt_post_init(); 7004 #endif 7005 7006 KMP_MB(); 7007 7008 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7009 } 7010 7011 void __kmp_serial_initialize(void) { 7012 if (__kmp_init_serial) { 7013 return; 7014 } 7015 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7016 if (__kmp_init_serial) { 7017 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7018 return; 7019 } 7020 __kmp_do_serial_initialize(); 7021 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7022 } 7023 7024 static void __kmp_do_middle_initialize(void) { 7025 int i, j; 7026 int prev_dflt_team_nth; 7027 7028 if (!__kmp_init_serial) { 7029 __kmp_do_serial_initialize(); 7030 } 7031 7032 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7033 7034 // Save the previous value for the __kmp_dflt_team_nth so that 7035 // we can avoid some reinitialization if it hasn't changed. 7036 prev_dflt_team_nth = __kmp_dflt_team_nth; 7037 7038 #if KMP_AFFINITY_SUPPORTED 7039 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7040 // number of cores on the machine. 7041 __kmp_affinity_initialize(); 7042 7043 #endif /* KMP_AFFINITY_SUPPORTED */ 7044 7045 KMP_ASSERT(__kmp_xproc > 0); 7046 if (__kmp_avail_proc == 0) { 7047 __kmp_avail_proc = __kmp_xproc; 7048 } 7049 7050 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7051 // correct them now 7052 j = 0; 7053 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7054 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7055 __kmp_avail_proc; 7056 j++; 7057 } 7058 7059 if (__kmp_dflt_team_nth == 0) { 7060 #ifdef KMP_DFLT_NTH_CORES 7061 // Default #threads = #cores 7062 __kmp_dflt_team_nth = __kmp_ncores; 7063 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7064 "__kmp_ncores (%d)\n", 7065 __kmp_dflt_team_nth)); 7066 #else 7067 // Default #threads = #available OS procs 7068 __kmp_dflt_team_nth = __kmp_avail_proc; 7069 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7070 "__kmp_avail_proc(%d)\n", 7071 __kmp_dflt_team_nth)); 7072 #endif /* KMP_DFLT_NTH_CORES */ 7073 } 7074 7075 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7076 __kmp_dflt_team_nth = KMP_MIN_NTH; 7077 } 7078 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7079 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7080 } 7081 7082 if (__kmp_nesting_mode > 0) 7083 __kmp_set_nesting_mode_threads(); 7084 7085 // There's no harm in continuing if the following check fails, 7086 // but it indicates an error in the previous logic. 7087 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7088 7089 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7090 // Run through the __kmp_threads array and set the num threads icv for each 7091 // root thread that is currently registered with the RTL (which has not 7092 // already explicitly set its nthreads-var with a call to 7093 // omp_set_num_threads()). 
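// (The loop below skips NULL thread slots and any root whose nproc ICV is
// already nonzero, so explicit omp_set_num_threads() requests are preserved.)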
7094 for (i = 0; i < __kmp_threads_capacity; i++) { 7095 kmp_info_t *thread = __kmp_threads[i]; 7096 if (thread == NULL) 7097 continue; 7098 if (thread->th.th_current_task->td_icvs.nproc != 0) 7099 continue; 7100 7101 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7102 } 7103 } 7104 KA_TRACE( 7105 20, 7106 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7107 __kmp_dflt_team_nth)); 7108 7109 #ifdef KMP_ADJUST_BLOCKTIME 7110 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7111 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7112 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7113 if (__kmp_nth > __kmp_avail_proc) { 7114 __kmp_zero_bt = TRUE; 7115 } 7116 } 7117 #endif /* KMP_ADJUST_BLOCKTIME */ 7118 7119 /* we have finished middle initialization */ 7120 TCW_SYNC_4(__kmp_init_middle, TRUE); 7121 7122 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7123 } 7124 7125 void __kmp_middle_initialize(void) { 7126 if (__kmp_init_middle) { 7127 return; 7128 } 7129 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7130 if (__kmp_init_middle) { 7131 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7132 return; 7133 } 7134 __kmp_do_middle_initialize(); 7135 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7136 } 7137 7138 void __kmp_parallel_initialize(void) { 7139 int gtid = __kmp_entry_gtid(); // this might be a new root 7140 7141 /* synchronize parallel initialization (for sibling) */ 7142 if (TCR_4(__kmp_init_parallel)) 7143 return; 7144 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7145 if (TCR_4(__kmp_init_parallel)) { 7146 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7147 return; 7148 } 7149 7150 /* TODO reinitialization after we have already shut down */ 7151 if (TCR_4(__kmp_global.g.g_done)) { 7152 KA_TRACE( 7153 10, 7154 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7155 __kmp_infinite_loop(); 7156 } 7157 7158 /* jc: The lock __kmp_initz_lock is already held, so calling 7159 __kmp_serial_initialize would cause a deadlock. So we call 7160 __kmp_do_serial_initialize directly. */ 7161 if (!__kmp_init_middle) { 7162 __kmp_do_middle_initialize(); 7163 } 7164 __kmp_assign_root_init_mask(); 7165 __kmp_resume_if_hard_paused(); 7166 7167 /* begin initialization */ 7168 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7169 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7170 7171 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7172 // Save the FP control regs. 7173 // Worker threads will set theirs to these values at thread startup. 
7174 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7175 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7176 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7177 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7178 7179 #if KMP_OS_UNIX 7180 #if KMP_HANDLE_SIGNALS 7181 /* must be after __kmp_serial_initialize */ 7182 __kmp_install_signals(TRUE); 7183 #endif 7184 #endif 7185 7186 __kmp_suspend_initialize(); 7187 7188 #if defined(USE_LOAD_BALANCE) 7189 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7190 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7191 } 7192 #else 7193 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7194 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7195 } 7196 #endif 7197 7198 if (__kmp_version) { 7199 __kmp_print_version_2(); 7200 } 7201 7202 /* we have finished parallel initialization */ 7203 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7204 7205 KMP_MB(); 7206 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7207 7208 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7209 } 7210 7211 void __kmp_hidden_helper_initialize() { 7212 if (TCR_4(__kmp_init_hidden_helper)) 7213 return; 7214 7215 // __kmp_parallel_initialize is required before we initialize hidden helper 7216 if (!TCR_4(__kmp_init_parallel)) 7217 __kmp_parallel_initialize(); 7218 7219 // Double check. Note that this double check should not be placed before 7220 // __kmp_parallel_initialize as it will cause dead lock. 7221 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7222 if (TCR_4(__kmp_init_hidden_helper)) { 7223 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7224 return; 7225 } 7226 7227 // Set the count of hidden helper tasks to be executed to zero 7228 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7229 7230 // Set the global variable indicating that we're initializing hidden helper 7231 // team/threads 7232 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7233 7234 // Platform independent initialization 7235 __kmp_do_initialize_hidden_helper_threads(); 7236 7237 // Wait here for the finish of initialization of hidden helper teams 7238 __kmp_hidden_helper_threads_initz_wait(); 7239 7240 // We have finished hidden helper initialization 7241 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7242 7243 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7244 } 7245 7246 /* ------------------------------------------------------------------------ */ 7247 7248 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7249 kmp_team_t *team) { 7250 kmp_disp_t *dispatch; 7251 7252 KMP_MB(); 7253 7254 /* none of the threads have encountered any constructs, yet. */ 7255 this_thr->th.th_local.this_construct = 0; 7256 #if KMP_CACHE_MANAGE 7257 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7258 #endif /* KMP_CACHE_MANAGE */ 7259 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7260 KMP_DEBUG_ASSERT(dispatch); 7261 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7262 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7263 // this_thr->th.th_info.ds.ds_tid ] ); 7264 7265 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7266 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7267 if (__kmp_env_consistency_check) 7268 __kmp_push_parallel(gtid, team->t.t_ident); 7269 7270 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7271 } 7272 7273 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7274 kmp_team_t *team) { 7275 if (__kmp_env_consistency_check) 7276 __kmp_pop_parallel(gtid, team->t.t_ident); 7277 7278 __kmp_finish_implicit_task(this_thr); 7279 } 7280 7281 int __kmp_invoke_task_func(int gtid) { 7282 int rc; 7283 int tid = __kmp_tid_from_gtid(gtid); 7284 kmp_info_t *this_thr = __kmp_threads[gtid]; 7285 kmp_team_t *team = this_thr->th.th_team; 7286 7287 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7288 #if USE_ITT_BUILD 7289 if (__itt_stack_caller_create_ptr) { 7290 // inform ittnotify about entering user's code 7291 if (team->t.t_stack_id != NULL) { 7292 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7293 } else { 7294 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7295 __kmp_itt_stack_callee_enter( 7296 (__itt_caller)team->t.t_parent->t.t_stack_id); 7297 } 7298 } 7299 #endif /* USE_ITT_BUILD */ 7300 #if INCLUDE_SSC_MARKS 7301 SSC_MARK_INVOKING(); 7302 #endif 7303 7304 #if OMPT_SUPPORT 7305 void *dummy; 7306 void **exit_frame_p; 7307 ompt_data_t *my_task_data; 7308 ompt_data_t *my_parallel_data; 7309 int ompt_team_size; 7310 7311 if (ompt_enabled.enabled) { 7312 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7313 .ompt_task_info.frame.exit_frame.ptr); 7314 } else { 7315 exit_frame_p = &dummy; 7316 } 7317 7318 my_task_data = 7319 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7320 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7321 if (ompt_enabled.ompt_callback_implicit_task) { 7322 ompt_team_size = team->t.t_nproc; 7323 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7324 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7325 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7326 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7327 } 7328 #endif 7329 7330 #if KMP_STATS_ENABLED 7331 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7332 if (previous_state == stats_state_e::TEAMS_REGION) { 7333 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7334 } else { 7335 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7336 } 7337 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7338 #endif 7339 7340 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7341 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7342 #if OMPT_SUPPORT 7343 , 7344 exit_frame_p 7345 #endif 7346 ); 7347 #if OMPT_SUPPORT 7348 *exit_frame_p = NULL; 7349 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7350 #endif 7351 7352 #if KMP_STATS_ENABLED 7353 if (previous_state == stats_state_e::TEAMS_REGION) { 7354 KMP_SET_THREAD_STATE(previous_state); 7355 } 7356 KMP_POP_PARTITIONED_TIMER(); 7357 #endif 7358 7359 #if USE_ITT_BUILD 7360 if (__itt_stack_caller_create_ptr) { 7361 // inform ittnotify about leaving user's code 7362 if (team->t.t_stack_id != NULL) { 7363 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7364 } else { 7365 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7366 __kmp_itt_stack_callee_leave( 7367 (__itt_caller)team->t.t_parent->t.t_stack_id); 7368 } 7369 } 7370 #endif /* USE_ITT_BUILD */ 7371 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7372 7373 return rc; 7374 } 7375 7376 void __kmp_teams_master(int gtid) { 7377 // This routine is called by all primary threads in teams construct 7378 kmp_info_t *thr = __kmp_threads[gtid]; 7379 kmp_team_t *team = thr->th.th_team; 7380 ident_t *loc = team->t.t_ident; 7381 
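// The thread count stored in th_teams_size.nth by __kmp_push_thread_limit()
// becomes the requested size of the team forked below.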
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7382 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7383 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7384 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7385 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7386 7387 // This thread is a new CG root. Set up the proper variables. 7388 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7389 tmp->cg_root = thr; // Make thr the CG root 7390 // Init to thread limit stored when league primary threads were forked 7391 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7392 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7393 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7394 " cg_nthreads to 1\n", 7395 thr, tmp)); 7396 tmp->up = thr->th.th_cg_roots; 7397 thr->th.th_cg_roots = tmp; 7398 7399 // Launch league of teams now, but not let workers execute 7400 // (they hang on fork barrier until next parallel) 7401 #if INCLUDE_SSC_MARKS 7402 SSC_MARK_FORKING(); 7403 #endif 7404 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7405 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7406 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7407 #if INCLUDE_SSC_MARKS 7408 SSC_MARK_JOINING(); 7409 #endif 7410 // If the team size was reduced from the limit, set it to the new size 7411 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7412 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7413 // AC: last parameter "1" eliminates join barrier which won't work because 7414 // worker threads are in a fork barrier waiting for more parallel regions 7415 __kmp_join_call(loc, gtid 7416 #if OMPT_SUPPORT 7417 , 7418 fork_context_intel 7419 #endif 7420 , 7421 1); 7422 } 7423 7424 int __kmp_invoke_teams_master(int gtid) { 7425 kmp_info_t *this_thr = __kmp_threads[gtid]; 7426 kmp_team_t *team = this_thr->th.th_team; 7427 #if KMP_DEBUG 7428 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7429 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7430 (void *)__kmp_teams_master); 7431 #endif 7432 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7433 #if OMPT_SUPPORT 7434 int tid = __kmp_tid_from_gtid(gtid); 7435 ompt_data_t *task_data = 7436 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7437 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7438 if (ompt_enabled.ompt_callback_implicit_task) { 7439 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7440 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7441 ompt_task_initial); 7442 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7443 } 7444 #endif 7445 __kmp_teams_master(gtid); 7446 #if OMPT_SUPPORT 7447 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7448 #endif 7449 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7450 return 1; 7451 } 7452 7453 /* this sets the requested number of threads for the next parallel region 7454 encountered by this team. 
since this should be enclosed in the forkjoin 7455 critical section it should avoid race conditions with asymmetrical nested 7456 parallelism */ 7457 7458 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7459 kmp_info_t *thr = __kmp_threads[gtid]; 7460 7461 if (num_threads > 0) 7462 thr->th.th_set_nproc = num_threads; 7463 } 7464 7465 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7466 int num_threads) { 7467 KMP_DEBUG_ASSERT(thr); 7468 // Remember the number of threads for inner parallel regions 7469 if (!TCR_4(__kmp_init_middle)) 7470 __kmp_middle_initialize(); // get internal globals calculated 7471 __kmp_assign_root_init_mask(); 7472 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7473 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7474 7475 if (num_threads == 0) { 7476 if (__kmp_teams_thread_limit > 0) { 7477 num_threads = __kmp_teams_thread_limit; 7478 } else { 7479 num_threads = __kmp_avail_proc / num_teams; 7480 } 7481 // adjust num_threads w/o warning as it is not user setting 7482 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7483 // no thread_limit clause specified - do not change thread-limit-var ICV 7484 if (num_threads > __kmp_dflt_team_nth) { 7485 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7486 } 7487 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7488 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7489 } // prevent team size to exceed thread-limit-var 7490 if (num_teams * num_threads > __kmp_teams_max_nth) { 7491 num_threads = __kmp_teams_max_nth / num_teams; 7492 } 7493 if (num_threads == 0) { 7494 num_threads = 1; 7495 } 7496 } else { 7497 // This thread will be the primary thread of the league primary threads 7498 // Store new thread limit; old limit is saved in th_cg_roots list 7499 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7500 // num_threads = min(num_threads, nthreads-var) 7501 if (num_threads > __kmp_dflt_team_nth) { 7502 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7503 } 7504 if (num_teams * num_threads > __kmp_teams_max_nth) { 7505 int new_threads = __kmp_teams_max_nth / num_teams; 7506 if (new_threads == 0) { 7507 new_threads = 1; 7508 } 7509 if (new_threads != num_threads) { 7510 if (!__kmp_reserve_warn) { // user asked for too many threads 7511 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7512 __kmp_msg(kmp_ms_warning, 7513 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7514 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7515 } 7516 } 7517 num_threads = new_threads; 7518 } 7519 } 7520 thr->th.th_teams_size.nth = num_threads; 7521 } 7522 7523 /* this sets the requested number of teams for the teams region and/or 7524 the number of threads for the next parallel region encountered */ 7525 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7526 int num_threads) { 7527 kmp_info_t *thr = __kmp_threads[gtid]; 7528 KMP_DEBUG_ASSERT(num_teams >= 0); 7529 KMP_DEBUG_ASSERT(num_threads >= 0); 7530 7531 if (num_teams == 0) { 7532 if (__kmp_nteams > 0) { 7533 num_teams = __kmp_nteams; 7534 } else { 7535 num_teams = 1; // default number of teams is 1. 7536 } 7537 } 7538 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
7539 if (!__kmp_reserve_warn) {
7540 __kmp_reserve_warn = 1;
7541 __kmp_msg(kmp_ms_warning,
7542 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7543 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7544 }
7545 num_teams = __kmp_teams_max_nth;
7546 }
7547 // Set number of teams (number of threads in the outer "parallel" of the
7548 // teams)
7549 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7550
7551 __kmp_push_thread_limit(thr, num_teams, num_threads);
7552 }
7553
7554 /* This sets the requested number of teams for the teams region and/or
7555 the number of threads for the next parallel region encountered */
7556 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
7557 int num_teams_ub, int num_threads) {
7558 kmp_info_t *thr = __kmp_threads[gtid];
7559 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
7560 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
7561 KMP_DEBUG_ASSERT(num_threads >= 0);
7562
7563 if (num_teams_lb > num_teams_ub) {
7564 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
7565 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
7566 }
7567
7568 int num_teams = 1; // default number of teams is 1.
7569
7570 if (num_teams_lb == 0 && num_teams_ub > 0)
7571 num_teams_lb = num_teams_ub;
7572
7573 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
7574 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
7575 if (num_teams > __kmp_teams_max_nth) {
7576 if (!__kmp_reserve_warn) {
7577 __kmp_reserve_warn = 1;
7578 __kmp_msg(kmp_ms_warning,
7579 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
7580 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
7581 }
7582 num_teams = __kmp_teams_max_nth;
7583 }
7584 } else if (num_teams_lb == num_teams_ub) { // an exact number of teams was requested
7585 num_teams = num_teams_ub;
7586 } else { // num_teams_lb <= num_teams <= num_teams_ub
7587 if (num_threads == 0) {
7588 if (num_teams_ub > __kmp_teams_max_nth) {
7589 num_teams = num_teams_lb;
7590 } else {
7591 num_teams = num_teams_ub;
7592 }
7593 } else {
7594 num_teams = (num_threads > __kmp_teams_max_nth)
7595 ? num_teams
7596 : __kmp_teams_max_nth / num_threads;
7597 if (num_teams < num_teams_lb) {
7598 num_teams = num_teams_lb;
7599 } else if (num_teams > num_teams_ub) {
7600 num_teams = num_teams_ub;
7601 }
7602 }
7603 }
7604 // Set number of teams (number of threads in the outer "parallel" of the
7605 // teams)
7606 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;
7607
7608 __kmp_push_thread_limit(thr, num_teams, num_threads);
7609 }
7610
7611 // Set the proc_bind var to use in the following parallel region.
7612 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
7613 kmp_info_t *thr = __kmp_threads[gtid];
7614 thr->th.th_set_proc_bind = proc_bind;
7615 }
7616
7617 /* Launch the worker threads into the microtask. */
7618
7619 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
7620 kmp_info_t *this_thr = __kmp_threads[gtid];
7621
7622 #ifdef KMP_DEBUG
7623 int f;
7624 #endif /* KMP_DEBUG */
7625
7626 KMP_DEBUG_ASSERT(team);
7627 KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
7628 KMP_ASSERT(KMP_MASTER_GTID(gtid));
7629 KMP_MB(); /* Flush all pending memory write invalidates.
*/ 7630 7631 team->t.t_construct = 0; /* no single directives seen yet */ 7632 team->t.t_ordered.dt.t_value = 7633 0; /* thread 0 enters the ordered section first */ 7634 7635 /* Reset the identifiers on the dispatch buffer */ 7636 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7637 if (team->t.t_max_nproc > 1) { 7638 int i; 7639 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7640 team->t.t_disp_buffer[i].buffer_index = i; 7641 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7642 } 7643 } else { 7644 team->t.t_disp_buffer[0].buffer_index = 0; 7645 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7646 } 7647 7648 KMP_MB(); /* Flush all pending memory write invalidates. */ 7649 KMP_ASSERT(this_thr->th.th_team == team); 7650 7651 #ifdef KMP_DEBUG 7652 for (f = 0; f < team->t.t_nproc; f++) { 7653 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7654 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7655 } 7656 #endif /* KMP_DEBUG */ 7657 7658 /* release the worker threads so they may begin working */ 7659 __kmp_fork_barrier(gtid, 0); 7660 } 7661 7662 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7663 kmp_info_t *this_thr = __kmp_threads[gtid]; 7664 7665 KMP_DEBUG_ASSERT(team); 7666 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7667 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7668 KMP_MB(); /* Flush all pending memory write invalidates. */ 7669 7670 /* Join barrier after fork */ 7671 7672 #ifdef KMP_DEBUG 7673 if (__kmp_threads[gtid] && 7674 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7675 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7676 __kmp_threads[gtid]); 7677 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7678 "team->t.t_nproc=%d\n", 7679 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7680 team->t.t_nproc); 7681 __kmp_print_structure(); 7682 } 7683 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7684 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7685 #endif /* KMP_DEBUG */ 7686 7687 __kmp_join_barrier(gtid); /* wait for everyone */ 7688 #if OMPT_SUPPORT 7689 if (ompt_enabled.enabled && 7690 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7691 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7692 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7693 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7694 #if OMPT_OPTIONAL 7695 void *codeptr = NULL; 7696 if (KMP_MASTER_TID(ds_tid) && 7697 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7698 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7699 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7700 7701 if (ompt_enabled.ompt_callback_sync_region_wait) { 7702 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7703 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7704 codeptr); 7705 } 7706 if (ompt_enabled.ompt_callback_sync_region) { 7707 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7708 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7709 codeptr); 7710 } 7711 #endif 7712 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7713 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7714 ompt_scope_end, NULL, task_data, 0, ds_tid, 7715 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7716 } 7717 } 7718 #endif 7719 7720 KMP_MB(); /* Flush all pending memory write invalidates. 
*/
7721 KMP_ASSERT(this_thr->th.th_team == team);
7722 }
7723
7724 /* ------------------------------------------------------------------------ */
7725
7726 #ifdef USE_LOAD_BALANCE
7727
7728 // Return the number of worker threads actively spinning in the hot team, if
7729 // we are at the outermost level of parallelism. Otherwise, return 0.
7730 static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
7731 int i;
7732 int retval;
7733 kmp_team_t *hot_team;
7734
7735 if (root->r.r_active) {
7736 return 0;
7737 }
7738 hot_team = root->r.r_hot_team;
7739 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
7740 return hot_team->t.t_nproc - 1; // Don't count primary thread
7741 }
7742
7743 // Skip the primary thread - it is accounted for elsewhere.
7744 retval = 0;
7745 for (i = 1; i < hot_team->t.t_nproc; i++) {
7746 if (hot_team->t.t_threads[i]->th.th_active) {
7747 retval++;
7748 }
7749 }
7750 return retval;
7751 }
7752
7753 // Perform an automatic adjustment to the number of
7754 // threads used by the next parallel region.
7755 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
7756 int retval;
7757 int pool_active;
7758 int hot_team_active;
7759 int team_curr_active;
7760 int system_active;
7761
7762 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
7763 set_nproc));
7764 KMP_DEBUG_ASSERT(root);
7765 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
7766 ->th.th_current_task->td_icvs.dynamic == TRUE);
7767 KMP_DEBUG_ASSERT(set_nproc > 1);
7768
7769 if (set_nproc == 1) {
7770 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
7771 return 1;
7772 }
7773
7774 // Threads that are active in the thread pool, active in the hot team for this
7775 // particular root (if we are at the outer par level), and the currently
7776 // executing thread (to become the primary thread) are available to add to the
7777 // new team, but are currently contributing to the system load, and must be
7778 // accounted for.
7779 pool_active = __kmp_thread_pool_active_nth;
7780 hot_team_active = __kmp_active_hot_team_nproc(root);
7781 team_curr_active = pool_active + hot_team_active + 1;
7782
7783 // Check the system load.
7784 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
7785 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
7786 "hot team active = %d\n",
7787 system_active, pool_active, hot_team_active));
7788
7789 if (system_active < 0) {
7790 // There was an error reading the necessary info from /proc, so use the
7791 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
7792 // = dynamic_thread_limit, we shouldn't wind up getting back here.
7793 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
7794 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");
7795
7796 // Make this call behave like the thread limit algorithm.
7797 retval = __kmp_avail_proc - __kmp_nth +
7798 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
7799 if (retval > set_nproc) {
7800 retval = set_nproc;
7801 }
7802 if (retval < KMP_MIN_NTH) {
7803 retval = KMP_MIN_NTH;
7804 }
7805
7806 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
7807 retval));
7808 return retval;
7809 }
7810
7811 // There is a slight delay in the load balance algorithm in detecting new
7812 // running procs. The real system load at this instant should be at least as
7813 // large as the number of active OMP threads that are available to add to the team.
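// Worked example (made-up numbers): with __kmp_avail_proc == 8,
// system_active == 5 and team_curr_active == 3, the code below computes
// retval = 8 - 5 + 3 = 6, i.e. the machine's idle capacity plus the threads
// this root already accounts for, before clamping the result to the range
// [KMP_MIN_NTH, set_nproc].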
7814 if (system_active < team_curr_active) { 7815 system_active = team_curr_active; 7816 } 7817 retval = __kmp_avail_proc - system_active + team_curr_active; 7818 if (retval > set_nproc) { 7819 retval = set_nproc; 7820 } 7821 if (retval < KMP_MIN_NTH) { 7822 retval = KMP_MIN_NTH; 7823 } 7824 7825 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7826 return retval; 7827 } // __kmp_load_balance_nproc() 7828 7829 #endif /* USE_LOAD_BALANCE */ 7830 7831 /* ------------------------------------------------------------------------ */ 7832 7833 /* NOTE: this is called with the __kmp_init_lock held */ 7834 void __kmp_cleanup(void) { 7835 int f; 7836 7837 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7838 7839 if (TCR_4(__kmp_init_parallel)) { 7840 #if KMP_HANDLE_SIGNALS 7841 __kmp_remove_signals(); 7842 #endif 7843 TCW_4(__kmp_init_parallel, FALSE); 7844 } 7845 7846 if (TCR_4(__kmp_init_middle)) { 7847 #if KMP_AFFINITY_SUPPORTED 7848 __kmp_affinity_uninitialize(); 7849 #endif /* KMP_AFFINITY_SUPPORTED */ 7850 __kmp_cleanup_hierarchy(); 7851 TCW_4(__kmp_init_middle, FALSE); 7852 } 7853 7854 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7855 7856 if (__kmp_init_serial) { 7857 __kmp_runtime_destroy(); 7858 __kmp_init_serial = FALSE; 7859 } 7860 7861 __kmp_cleanup_threadprivate_caches(); 7862 7863 for (f = 0; f < __kmp_threads_capacity; f++) { 7864 if (__kmp_root[f] != NULL) { 7865 __kmp_free(__kmp_root[f]); 7866 __kmp_root[f] = NULL; 7867 } 7868 } 7869 __kmp_free(__kmp_threads); 7870 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7871 // there is no need in freeing __kmp_root. 7872 __kmp_threads = NULL; 7873 __kmp_root = NULL; 7874 __kmp_threads_capacity = 0; 7875 7876 #if KMP_USE_DYNAMIC_LOCK 7877 __kmp_cleanup_indirect_user_locks(); 7878 #else 7879 __kmp_cleanup_user_locks(); 7880 #endif 7881 #if OMPD_SUPPORT 7882 if (ompd_state) { 7883 __kmp_free(ompd_env_block); 7884 ompd_env_block = NULL; 7885 ompd_env_block_size = 0; 7886 } 7887 #endif 7888 7889 #if KMP_AFFINITY_SUPPORTED 7890 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7891 __kmp_cpuinfo_file = NULL; 7892 #endif /* KMP_AFFINITY_SUPPORTED */ 7893 7894 #if KMP_USE_ADAPTIVE_LOCKS 7895 #if KMP_DEBUG_ADAPTIVE_LOCKS 7896 __kmp_print_speculative_stats(); 7897 #endif 7898 #endif 7899 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7900 __kmp_nested_nth.nth = NULL; 7901 __kmp_nested_nth.size = 0; 7902 __kmp_nested_nth.used = 0; 7903 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7904 __kmp_nested_proc_bind.bind_types = NULL; 7905 __kmp_nested_proc_bind.size = 0; 7906 __kmp_nested_proc_bind.used = 0; 7907 if (__kmp_affinity_format) { 7908 KMP_INTERNAL_FREE(__kmp_affinity_format); 7909 __kmp_affinity_format = NULL; 7910 } 7911 7912 __kmp_i18n_catclose(); 7913 7914 #if KMP_USE_HIER_SCHED 7915 __kmp_hier_scheds.deallocate(); 7916 #endif 7917 7918 #if KMP_STATS_ENABLED 7919 __kmp_stats_fini(); 7920 #endif 7921 7922 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7923 } 7924 7925 /* ------------------------------------------------------------------------ */ 7926 7927 int __kmp_ignore_mppbeg(void) { 7928 char *env; 7929 7930 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7931 if (__kmp_str_match_false(env)) 7932 return FALSE; 7933 } 7934 // By default __kmpc_begin() is no-op. 
7935 return TRUE; 7936 } 7937 7938 int __kmp_ignore_mppend(void) { 7939 char *env; 7940 7941 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7942 if (__kmp_str_match_false(env)) 7943 return FALSE; 7944 } 7945 // By default __kmpc_end() is no-op. 7946 return TRUE; 7947 } 7948 7949 void __kmp_internal_begin(void) { 7950 int gtid; 7951 kmp_root_t *root; 7952 7953 /* this is a very important step as it will register new sibling threads 7954 and assign these new uber threads a new gtid */ 7955 gtid = __kmp_entry_gtid(); 7956 root = __kmp_threads[gtid]->th.th_root; 7957 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7958 7959 if (root->r.r_begin) 7960 return; 7961 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7962 if (root->r.r_begin) { 7963 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7964 return; 7965 } 7966 7967 root->r.r_begin = TRUE; 7968 7969 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7970 } 7971 7972 /* ------------------------------------------------------------------------ */ 7973 7974 void __kmp_user_set_library(enum library_type arg) { 7975 int gtid; 7976 kmp_root_t *root; 7977 kmp_info_t *thread; 7978 7979 /* first, make sure we are initialized so we can get our gtid */ 7980 7981 gtid = __kmp_entry_gtid(); 7982 thread = __kmp_threads[gtid]; 7983 7984 root = thread->th.th_root; 7985 7986 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7987 library_serial)); 7988 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7989 thread */ 7990 KMP_WARNING(SetLibraryIncorrectCall); 7991 return; 7992 } 7993 7994 switch (arg) { 7995 case library_serial: 7996 thread->th.th_set_nproc = 0; 7997 set__nproc(thread, 1); 7998 break; 7999 case library_turnaround: 8000 thread->th.th_set_nproc = 0; 8001 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8002 : __kmp_dflt_team_nth_ub); 8003 break; 8004 case library_throughput: 8005 thread->th.th_set_nproc = 0; 8006 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8007 : __kmp_dflt_team_nth_ub); 8008 break; 8009 default: 8010 KMP_FATAL(UnknownLibraryType, arg); 8011 } 8012 8013 __kmp_aux_set_library(arg); 8014 } 8015 8016 void __kmp_aux_set_stacksize(size_t arg) { 8017 if (!__kmp_init_serial) 8018 __kmp_serial_initialize(); 8019 8020 #if KMP_OS_DARWIN 8021 if (arg & (0x1000 - 1)) { 8022 arg &= ~(0x1000 - 1); 8023 if (arg + 0x1000) /* check for overflow if we round up */ 8024 arg += 0x1000; 8025 } 8026 #endif 8027 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8028 8029 /* only change the default stacksize before the first parallel region */ 8030 if (!TCR_4(__kmp_init_parallel)) { 8031 size_t value = arg; /* argument is in bytes */ 8032 8033 if (value < __kmp_sys_min_stksize) 8034 value = __kmp_sys_min_stksize; 8035 else if (value > KMP_MAX_STKSIZE) 8036 value = KMP_MAX_STKSIZE; 8037 8038 __kmp_stksize = value; 8039 8040 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8041 } 8042 8043 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8044 } 8045 8046 /* set the behaviour of the runtime library */ 8047 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8048 void __kmp_aux_set_library(enum library_type arg) { 8049 __kmp_library = arg; 8050 8051 switch (__kmp_library) { 8052 case library_serial: { 8053 KMP_INFORM(LibraryIsSerial); 8054 } break; 8055 case library_turnaround: 8056 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8057 __kmp_use_yield = 2; // only yield when oversubscribed 8058 break; 8059 case library_throughput: 8060 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8061 __kmp_dflt_blocktime = 200; 8062 break; 8063 default: 8064 KMP_FATAL(UnknownLibraryType, arg); 8065 } 8066 } 8067 8068 /* Getting team information common for all team API */ 8069 // Returns NULL if not in teams construct 8070 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8071 kmp_info_t *thr = __kmp_entry_thread(); 8072 teams_serialized = 0; 8073 if (thr->th.th_teams_microtask) { 8074 kmp_team_t *team = thr->th.th_team; 8075 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8076 int ii = team->t.t_level; 8077 teams_serialized = team->t.t_serialized; 8078 int level = tlevel + 1; 8079 KMP_DEBUG_ASSERT(ii >= tlevel); 8080 while (ii > level) { 8081 for (teams_serialized = team->t.t_serialized; 8082 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8083 } 8084 if (team->t.t_serialized && (!teams_serialized)) { 8085 team = team->t.t_parent; 8086 continue; 8087 } 8088 if (ii > level) { 8089 team = team->t.t_parent; 8090 ii--; 8091 } 8092 } 8093 return team; 8094 } 8095 return NULL; 8096 } 8097 8098 int __kmp_aux_get_team_num() { 8099 int serialized; 8100 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8101 if (team) { 8102 if (serialized > 1) { 8103 return 0; // teams region is serialized ( 1 team of 1 thread ). 8104 } else { 8105 return team->t.t_master_tid; 8106 } 8107 } 8108 return 0; 8109 } 8110 8111 int __kmp_aux_get_num_teams() { 8112 int serialized; 8113 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8114 if (team) { 8115 if (serialized > 1) { 8116 return 1; 8117 } else { 8118 return team->t.t_parent->t.t_nproc; 8119 } 8120 } 8121 return 1; 8122 } 8123 8124 /* ------------------------------------------------------------------------ */ 8125 8126 /* 8127 * Affinity Format Parser 8128 * 8129 * Field is in form of: %[[[0].]size]type 8130 * % and type are required (%% means print a literal '%') 8131 * type is either single char or long name surrounded by {}, 8132 * e.g., N or {num_threads} 8133 * 0 => leading zeros 8134 * . => right justified when size is specified 8135 * by default output is left justified 8136 * size is the *minimum* field length 8137 * All other characters are printed as is 8138 * 8139 * Available field types: 8140 * L {thread_level} - omp_get_level() 8141 * n {thread_num} - omp_get_thread_num() 8142 * h {host} - name of host machine 8143 * P {process_id} - process id (integer) 8144 * T {thread_identifier} - native thread identifier (integer) 8145 * N {num_threads} - omp_get_num_threads() 8146 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8147 * a {thread_affinity} - comma separated list of integers or integer ranges 8148 * (values of affinity mask) 8149 * 8150 * Implementation-specific field types can be added 8151 * If a type is unknown, print "undefined" 8152 */ 8153 8154 // Structure holding the short name, long name, and corresponding data type 8155 // for snprintf. A table of these will represent the entire valid keyword 8156 // field types. 
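// Example (all values below are made up for illustration): with the rules
// described above, an affinity format string such as
//   "pid %P on %{host}: thread %0.3n of %N"
// could expand for one thread to something like
//   "pid 24719 on node017: thread 002 of 8"
// where %0.3n prints the thread number right-justified and zero-padded to a
// minimum width of 3.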
8157 typedef struct kmp_affinity_format_field_t { 8158 char short_name; // from spec e.g., L -> thread level 8159 const char *long_name; // from spec thread_level -> thread level 8160 char field_format; // data type for snprintf (typically 'd' or 's' 8161 // for integer or string) 8162 } kmp_affinity_format_field_t; 8163 8164 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8165 #if KMP_AFFINITY_SUPPORTED 8166 {'A', "thread_affinity", 's'}, 8167 #endif 8168 {'t', "team_num", 'd'}, 8169 {'T', "num_teams", 'd'}, 8170 {'L', "nesting_level", 'd'}, 8171 {'n', "thread_num", 'd'}, 8172 {'N', "num_threads", 'd'}, 8173 {'a', "ancestor_tnum", 'd'}, 8174 {'H', "host", 's'}, 8175 {'P', "process_id", 'd'}, 8176 {'i', "native_thread_id", 'd'}}; 8177 8178 // Return the number of characters it takes to hold field 8179 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8180 const char **ptr, 8181 kmp_str_buf_t *field_buffer) { 8182 int rc, format_index, field_value; 8183 const char *width_left, *width_right; 8184 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8185 static const int FORMAT_SIZE = 20; 8186 char format[FORMAT_SIZE] = {0}; 8187 char absolute_short_name = 0; 8188 8189 KMP_DEBUG_ASSERT(gtid >= 0); 8190 KMP_DEBUG_ASSERT(th); 8191 KMP_DEBUG_ASSERT(**ptr == '%'); 8192 KMP_DEBUG_ASSERT(field_buffer); 8193 8194 __kmp_str_buf_clear(field_buffer); 8195 8196 // Skip the initial % 8197 (*ptr)++; 8198 8199 // Check for %% first 8200 if (**ptr == '%') { 8201 __kmp_str_buf_cat(field_buffer, "%", 1); 8202 (*ptr)++; // skip over the second % 8203 return 1; 8204 } 8205 8206 // Parse field modifiers if they are present 8207 pad_zeros = false; 8208 if (**ptr == '0') { 8209 pad_zeros = true; 8210 (*ptr)++; // skip over 0 8211 } 8212 right_justify = false; 8213 if (**ptr == '.') { 8214 right_justify = true; 8215 (*ptr)++; // skip over . 8216 } 8217 // Parse width of field: [width_left, width_right) 8218 width_left = width_right = NULL; 8219 if (**ptr >= '0' && **ptr <= '9') { 8220 width_left = *ptr; 8221 SKIP_DIGITS(*ptr); 8222 width_right = *ptr; 8223 } 8224 8225 // Create the format for KMP_SNPRINTF based on flags parsed above 8226 format_index = 0; 8227 format[format_index++] = '%'; 8228 if (!right_justify) 8229 format[format_index++] = '-'; 8230 if (pad_zeros) 8231 format[format_index++] = '0'; 8232 if (width_left && width_right) { 8233 int i = 0; 8234 // Only allow 8 digit number widths. 
8235 // This also prevents overflowing format variable 8236 while (i < 8 && width_left < width_right) { 8237 format[format_index++] = *width_left; 8238 width_left++; 8239 i++; 8240 } 8241 } 8242 8243 // Parse a name (long or short) 8244 // Canonicalize the name into absolute_short_name 8245 found_valid_name = false; 8246 parse_long_name = (**ptr == '{'); 8247 if (parse_long_name) 8248 (*ptr)++; // skip initial left brace 8249 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8250 sizeof(__kmp_affinity_format_table[0]); 8251 ++i) { 8252 char short_name = __kmp_affinity_format_table[i].short_name; 8253 const char *long_name = __kmp_affinity_format_table[i].long_name; 8254 char field_format = __kmp_affinity_format_table[i].field_format; 8255 if (parse_long_name) { 8256 size_t length = KMP_STRLEN(long_name); 8257 if (strncmp(*ptr, long_name, length) == 0) { 8258 found_valid_name = true; 8259 (*ptr) += length; // skip the long name 8260 } 8261 } else if (**ptr == short_name) { 8262 found_valid_name = true; 8263 (*ptr)++; // skip the short name 8264 } 8265 if (found_valid_name) { 8266 format[format_index++] = field_format; 8267 format[format_index++] = '\0'; 8268 absolute_short_name = short_name; 8269 break; 8270 } 8271 } 8272 if (parse_long_name) { 8273 if (**ptr != '}') { 8274 absolute_short_name = 0; 8275 } else { 8276 (*ptr)++; // skip over the right brace 8277 } 8278 } 8279 8280 // Attempt to fill the buffer with the requested 8281 // value using snprintf within __kmp_str_buf_print() 8282 switch (absolute_short_name) { 8283 case 't': 8284 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8285 break; 8286 case 'T': 8287 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8288 break; 8289 case 'L': 8290 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8291 break; 8292 case 'n': 8293 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8294 break; 8295 case 'H': { 8296 static const int BUFFER_SIZE = 256; 8297 char buf[BUFFER_SIZE]; 8298 __kmp_expand_host_name(buf, BUFFER_SIZE); 8299 rc = __kmp_str_buf_print(field_buffer, format, buf); 8300 } break; 8301 case 'P': 8302 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8303 break; 8304 case 'i': 8305 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8306 break; 8307 case 'N': 8308 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8309 break; 8310 case 'a': 8311 field_value = 8312 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8313 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8314 break; 8315 #if KMP_AFFINITY_SUPPORTED 8316 case 'A': { 8317 kmp_str_buf_t buf; 8318 __kmp_str_buf_init(&buf); 8319 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8320 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8321 __kmp_str_buf_free(&buf); 8322 } break; 8323 #endif 8324 default: 8325 // According to spec, If an implementation does not have info for field 8326 // type, then "undefined" is printed 8327 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8328 // Skip the field 8329 if (parse_long_name) { 8330 SKIP_TOKEN(*ptr); 8331 if (**ptr == '}') 8332 (*ptr)++; 8333 } else { 8334 (*ptr)++; 8335 } 8336 } 8337 8338 KMP_ASSERT(format_index <= FORMAT_SIZE); 8339 return rc; 8340 } 8341 8342 /* 8343 * Return number of characters needed to hold the affinity string 8344 * (not including null byte character) 8345 * The resultant string is printed to buffer, 
which the caller can then 8346 * handle afterwards 8347 */ 8348 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8349 kmp_str_buf_t *buffer) { 8350 const char *parse_ptr; 8351 size_t retval; 8352 const kmp_info_t *th; 8353 kmp_str_buf_t field; 8354 8355 KMP_DEBUG_ASSERT(buffer); 8356 KMP_DEBUG_ASSERT(gtid >= 0); 8357 8358 __kmp_str_buf_init(&field); 8359 __kmp_str_buf_clear(buffer); 8360 8361 th = __kmp_threads[gtid]; 8362 retval = 0; 8363 8364 // If format is NULL or zero-length string, then we use 8365 // affinity-format-var ICV 8366 parse_ptr = format; 8367 if (parse_ptr == NULL || *parse_ptr == '\0') { 8368 parse_ptr = __kmp_affinity_format; 8369 } 8370 KMP_DEBUG_ASSERT(parse_ptr); 8371 8372 while (*parse_ptr != '\0') { 8373 // Parse a field 8374 if (*parse_ptr == '%') { 8375 // Put field in the buffer 8376 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8377 __kmp_str_buf_catbuf(buffer, &field); 8378 retval += rc; 8379 } else { 8380 // Put literal character in buffer 8381 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8382 retval++; 8383 parse_ptr++; 8384 } 8385 } 8386 __kmp_str_buf_free(&field); 8387 return retval; 8388 } 8389 8390 // Displays the affinity string to stdout 8391 void __kmp_aux_display_affinity(int gtid, const char *format) { 8392 kmp_str_buf_t buf; 8393 __kmp_str_buf_init(&buf); 8394 __kmp_aux_capture_affinity(gtid, format, &buf); 8395 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8396 __kmp_str_buf_free(&buf); 8397 } 8398 8399 /* ------------------------------------------------------------------------ */ 8400 8401 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8402 int blocktime = arg; /* argument is in milliseconds */ 8403 #if KMP_USE_MONITOR 8404 int bt_intervals; 8405 #endif 8406 kmp_int8 bt_set; 8407 8408 __kmp_save_internal_controls(thread); 8409 8410 /* Normalize and set blocktime for the teams */ 8411 if (blocktime < KMP_MIN_BLOCKTIME) 8412 blocktime = KMP_MIN_BLOCKTIME; 8413 else if (blocktime > KMP_MAX_BLOCKTIME) 8414 blocktime = KMP_MAX_BLOCKTIME; 8415 8416 set__blocktime_team(thread->th.th_team, tid, blocktime); 8417 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8418 8419 #if KMP_USE_MONITOR 8420 /* Calculate and set blocktime intervals for the teams */ 8421 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8422 8423 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8424 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8425 #endif 8426 8427 /* Set whether blocktime has been set to "TRUE" */ 8428 bt_set = TRUE; 8429 8430 set__bt_set_team(thread->th.th_team, tid, bt_set); 8431 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8432 #if KMP_USE_MONITOR 8433 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8434 "bt_intervals=%d, monitor_updates=%d\n", 8435 __kmp_gtid_from_tid(tid, thread->th.th_team), 8436 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8437 __kmp_monitor_wakeups)); 8438 #else 8439 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8440 __kmp_gtid_from_tid(tid, thread->th.th_team), 8441 thread->th.th_team->t.t_id, tid, blocktime)); 8442 #endif 8443 } 8444 8445 void __kmp_aux_set_defaults(char const *str, size_t len) { 8446 if (!__kmp_init_serial) { 8447 __kmp_serial_initialize(); 8448 } 8449 __kmp_env_initialize(str); 8450 8451 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8452 __kmp_env_print(); 8453 } 8454 } // __kmp_aux_set_defaults 
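/* Illustrative sketch of the affinity-format machinery above (the format
   string, host name, pid and mask below are invented; only the field expansion
   follows the table and parser defined earlier, and %A needs affinity support):

     format   "host %H pid %P tid %n/%N lvl %L aff %A"
     might expand, for thread 0 of a 2-thread team, to
              "host node0 pid 4242 tid 0/2 lvl 1 aff 0-3"

   A caller that already knows its gtid could do the equivalent of
   __kmp_aux_display_affinity() by hand: */
#if 0 // sketch only, never compiled
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, "host %H pid %P tid %n/%N lvl %L aff %A",
                             &buf);
  __kmp_fprintf(kmp_out, "%s\n", buf.str);
  __kmp_str_buf_free(&buf);
#endif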
8455
8456 /* ------------------------------------------------------------------------ */
8457 /* internal fast reduction routines */
8458
8459 PACKED_REDUCTION_METHOD_T
8460 __kmp_determine_reduction_method(
8461     ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
8462     void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
8463     kmp_critical_name *lck) {
8464
8465   // Default reduction method: critical construct ( lck != NULL, like in current
8466   // PAROPT )
8467   // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
8468   // can be selected by RTL
8469   // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
8470   // can be selected by RTL
8471   // Finally, it's up to OpenMP RTL to make a decision on which method to select
8472   // among those generated by PAROPT.
8473
8474   PACKED_REDUCTION_METHOD_T retval;
8475
8476   int team_size;
8477
8478   KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
8479   KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )
8480
8481 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \
8482   ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
8483 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))
8484
8485   retval = critical_reduce_block;
8486
8487   // another choice of getting a team size (with one dynamic dereference) is slower
8488   team_size = __kmp_get_team_num_threads(global_tid);
8489   if (team_size == 1) {
8490
8491     retval = empty_reduce_block;
8492
8493   } else {
8494
8495     int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
8496
8497 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \
8498     KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64
8499
8500 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \
8501     KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8502
8503     int teamsize_cutoff = 4;
8504
8505 #if KMP_MIC_SUPPORTED
8506     if (__kmp_mic_type != non_mic) {
8507       teamsize_cutoff = 8;
8508     }
8509 #endif
8510     int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
8511     if (tree_available) {
8512       if (team_size <= teamsize_cutoff) {
8513         if (atomic_available) {
8514           retval = atomic_reduce_block;
8515         }
8516       } else {
8517         retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
8518       }
8519     } else if (atomic_available) {
8520       retval = atomic_reduce_block;
8521     }
8522 #else
8523 #error "Unknown or unsupported OS"
8524 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
8525        // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD
8526
8527 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS
8528
8529 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD
8530
8531     // basic tuning
8532
8533     if (atomic_available) {
8534       if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
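        // (heuristic: with only one or two reduction variables, a single
        // atomic update per variable is assumed to be cheaper than taking the
        // critical section; see the "&& ( team_size <= 8 )" note on the if
        // above)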
8535 retval = atomic_reduce_block; 8536 } 8537 } // otherwise: use critical section 8538 8539 #elif KMP_OS_DARWIN 8540 8541 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8542 if (atomic_available && (num_vars <= 3)) { 8543 retval = atomic_reduce_block; 8544 } else if (tree_available) { 8545 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8546 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8547 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8548 } 8549 } // otherwise: use critical section 8550 8551 #else 8552 #error "Unknown or unsupported OS" 8553 #endif 8554 8555 #else 8556 #error "Unknown or unsupported architecture" 8557 #endif 8558 } 8559 8560 // KMP_FORCE_REDUCTION 8561 8562 // If the team is serialized (team_size == 1), ignore the forced reduction 8563 // method and stay with the unsynchronized method (empty_reduce_block) 8564 if (__kmp_force_reduction_method != reduction_method_not_defined && 8565 team_size != 1) { 8566 8567 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8568 8569 int atomic_available, tree_available; 8570 8571 switch ((forced_retval = __kmp_force_reduction_method)) { 8572 case critical_reduce_block: 8573 KMP_ASSERT(lck); // lck should be != 0 8574 break; 8575 8576 case atomic_reduce_block: 8577 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8578 if (!atomic_available) { 8579 KMP_WARNING(RedMethodNotSupported, "atomic"); 8580 forced_retval = critical_reduce_block; 8581 } 8582 break; 8583 8584 case tree_reduce_block: 8585 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8586 if (!tree_available) { 8587 KMP_WARNING(RedMethodNotSupported, "tree"); 8588 forced_retval = critical_reduce_block; 8589 } else { 8590 #if KMP_FAST_REDUCTION_BARRIER 8591 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8592 #endif 8593 } 8594 break; 8595 8596 default: 8597 KMP_ASSERT(0); // "unsupported method specified" 8598 } 8599 8600 retval = forced_retval; 8601 } 8602 8603 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8604 8605 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8606 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8607 8608 return (retval); 8609 } 8610 // this function is for testing set/get/determine reduce method 8611 kmp_int32 __kmp_get_reduce_method(void) { 8612 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8613 } 8614 8615 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8616 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8617 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8618 8619 // Hard pause shuts down the runtime completely. Resume happens naturally when 8620 // OpenMP is used subsequently. 8621 void __kmp_hard_pause() { 8622 __kmp_pause_status = kmp_hard_paused; 8623 __kmp_internal_end_thread(-1); 8624 } 8625 8626 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
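// Note on the loop below: a worker may be holding its suspend mutex while it
// decides whether to go to sleep, so we alternate between resuming a thread
// that is already sleeping and acquiring the mutex (which shows the thread is
// not in the middle of going to sleep) until one of the two succeeds.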
8627 void __kmp_resume_if_soft_paused() { 8628 if (__kmp_pause_status == kmp_soft_paused) { 8629 __kmp_pause_status = kmp_not_paused; 8630 8631 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8632 kmp_info_t *thread = __kmp_threads[gtid]; 8633 if (thread) { // Wake it if sleeping 8634 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8635 thread); 8636 if (fl.is_sleeping()) 8637 fl.resume(gtid); 8638 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8639 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8640 } else { // thread holds the lock and may sleep soon 8641 do { // until either the thread sleeps, or we can get the lock 8642 if (fl.is_sleeping()) { 8643 fl.resume(gtid); 8644 break; 8645 } else if (__kmp_try_suspend_mx(thread)) { 8646 __kmp_unlock_suspend_mx(thread); 8647 break; 8648 } 8649 } while (1); 8650 } 8651 } 8652 } 8653 } 8654 } 8655 8656 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8657 // TODO: add warning messages 8658 int __kmp_pause_resource(kmp_pause_status_t level) { 8659 if (level == kmp_not_paused) { // requesting resume 8660 if (__kmp_pause_status == kmp_not_paused) { 8661 // error message about runtime not being paused, so can't resume 8662 return 1; 8663 } else { 8664 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8665 __kmp_pause_status == kmp_hard_paused); 8666 __kmp_pause_status = kmp_not_paused; 8667 return 0; 8668 } 8669 } else if (level == kmp_soft_paused) { // requesting soft pause 8670 if (__kmp_pause_status != kmp_not_paused) { 8671 // error message about already being paused 8672 return 1; 8673 } else { 8674 __kmp_soft_pause(); 8675 return 0; 8676 } 8677 } else if (level == kmp_hard_paused) { // requesting hard pause 8678 if (__kmp_pause_status != kmp_not_paused) { 8679 // error message about already being paused 8680 return 1; 8681 } else { 8682 __kmp_hard_pause(); 8683 return 0; 8684 } 8685 } else { 8686 // error message about invalid level 8687 return 1; 8688 } 8689 } 8690 8691 void __kmp_omp_display_env(int verbose) { 8692 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8693 if (__kmp_init_serial == 0) 8694 __kmp_do_serial_initialize(); 8695 __kmp_display_env_impl(!verbose, verbose); 8696 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8697 } 8698 8699 // Globals and functions for hidden helper task 8700 kmp_info_t **__kmp_hidden_helper_threads; 8701 kmp_info_t *__kmp_hidden_helper_main_thread; 8702 kmp_int32 __kmp_hidden_helper_threads_num = 8; 8703 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 8704 #if KMP_OS_LINUX 8705 kmp_int32 __kmp_enable_hidden_helper = TRUE; 8706 #else 8707 kmp_int32 __kmp_enable_hidden_helper = FALSE; 8708 #endif 8709 8710 namespace { 8711 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 8712 8713 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 8714 // This is an explicit synchronization on all hidden helper threads in case 8715 // that when a regular thread pushes a hidden helper task to one hidden 8716 // helper thread, the thread has not been awaken once since they're released 8717 // by the main thread after creating the team. 
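  // Each hidden helper thread checks in by incrementing the counter, then
  // spins until all __kmp_hidden_helper_threads_num helpers have arrived, so
  // every helper is guaranteed to have run at least once before any of them
  // proceeds.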
8718 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 8719 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 8720 __kmp_hidden_helper_threads_num) 8721 ; 8722 8723 // If main thread, then wait for signal 8724 if (__kmpc_master(nullptr, *gtid)) { 8725 // First, unset the initial state and release the initial thread 8726 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 8727 __kmp_hidden_helper_initz_release(); 8728 __kmp_hidden_helper_main_thread_wait(); 8729 // Now wake up all worker threads 8730 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 8731 __kmp_hidden_helper_worker_thread_signal(); 8732 } 8733 } 8734 } 8735 } // namespace 8736 8737 void __kmp_hidden_helper_threads_initz_routine() { 8738 // Create a new root for hidden helper team/threads 8739 const int gtid = __kmp_register_root(TRUE); 8740 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 8741 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 8742 __kmp_hidden_helper_main_thread->th.th_set_nproc = 8743 __kmp_hidden_helper_threads_num; 8744 8745 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 8746 8747 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 8748 8749 // Set the initialization flag to FALSE 8750 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 8751 8752 __kmp_hidden_helper_threads_deinitz_release(); 8753 } 8754 8755 /* Nesting Mode: 8756 Set via KMP_NESTING_MODE, which takes an integer. 8757 Note: we skip duplicate topology levels, and skip levels with only 8758 one entity. 8759 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 8760 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 8761 in the topology, and initializes the number of threads at each of those 8762 levels to the number of entities at each level, respectively, below the 8763 entity at the parent level. 8764 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 8765 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 8766 the user to turn nesting on explicitly. This is an even more experimental 8767 option to this experimental feature, and may change or go away in the 8768 future. 
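   Illustration (hypothetical machine, for the description above): on a
   2-socket system with 8 cores per socket and 2 hw threads per core,
   KMP_NESTING_MODE=1 would create three nesting levels with nthreads-var set
   to 2 (sockets), 8 (cores per socket) and 2 (hw threads per core),
   respectively.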
8769 */ 8770 8771 // Allocate space to store nesting levels 8772 void __kmp_init_nesting_mode() { 8773 int levels = KMP_HW_LAST; 8774 __kmp_nesting_mode_nlevels = levels; 8775 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 8776 for (int i = 0; i < levels; ++i) 8777 __kmp_nesting_nth_level[i] = 0; 8778 if (__kmp_nested_nth.size < levels) { 8779 __kmp_nested_nth.nth = 8780 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 8781 __kmp_nested_nth.size = levels; 8782 } 8783 } 8784 8785 // Set # threads for top levels of nesting; must be called after topology set 8786 void __kmp_set_nesting_mode_threads() { 8787 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 8788 8789 if (__kmp_nesting_mode == 1) 8790 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 8791 else if (__kmp_nesting_mode > 1) 8792 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 8793 8794 if (__kmp_topology) { // use topology info 8795 int loc, hw_level; 8796 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 8797 loc < __kmp_nesting_mode_nlevels; 8798 loc++, hw_level++) { 8799 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 8800 if (__kmp_nesting_nth_level[loc] == 1) 8801 loc--; 8802 } 8803 // Make sure all cores are used 8804 if (__kmp_nesting_mode > 1 && loc > 1) { 8805 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 8806 int num_cores = __kmp_topology->get_count(core_level); 8807 int upper_levels = 1; 8808 for (int level = 0; level < loc - 1; ++level) 8809 upper_levels *= __kmp_nesting_nth_level[level]; 8810 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 8811 __kmp_nesting_nth_level[loc - 1] = 8812 num_cores / __kmp_nesting_nth_level[loc - 2]; 8813 } 8814 __kmp_nesting_mode_nlevels = loc; 8815 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 8816 } else { // no topology info available; provide a reasonable guesstimation 8817 if (__kmp_avail_proc >= 4) { 8818 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 8819 __kmp_nesting_nth_level[1] = 2; 8820 __kmp_nesting_mode_nlevels = 2; 8821 } else { 8822 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 8823 __kmp_nesting_mode_nlevels = 1; 8824 } 8825 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 8826 } 8827 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 8828 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 8829 } 8830 set__nproc(thread, __kmp_nesting_nth_level[0]); 8831 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 8832 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 8833 if (get__max_active_levels(thread) > 1) { 8834 // if max levels was set, set nesting mode levels to same 8835 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 8836 } 8837 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 8838 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 8839 } 8840
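// Worked example of the fallback path in __kmp_set_nesting_mode_threads()
// (no topology information available): with __kmp_avail_proc == 12 the code
// above picks two levels, __kmp_nesting_nth_level = {6, 2}; with
// __kmp_avail_proc == 2 it picks a single level of 2 threads.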