1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 #if OMPD_SUPPORT 35 #include "ompd-specific.h" 36 #endif 37 38 #if OMP_PROFILING_SUPPORT 39 #include "llvm/Support/TimeProfiler.h" 40 static char *ProfileTraceFile = nullptr; 41 #endif 42 43 /* these are temporary issues to be dealt with */ 44 #define KMP_USE_PRCTL 0 45 46 #if KMP_OS_WINDOWS 47 #include <process.h> 48 #endif 49 50 #if KMP_OS_WINDOWS 51 // windows does not need include files as it doesn't use shared memory 52 #else 53 #include <sys/mman.h> 54 #include <sys/stat.h> 55 #include <fcntl.h> 56 #define SHM_SIZE 1024 57 #endif 58 59 #if defined(KMP_GOMP_COMPAT) 60 char const __kmp_version_alt_comp[] = 61 KMP_VERSION_PREFIX "alternative compiler support: yes"; 62 #endif /* defined(KMP_GOMP_COMPAT) */ 63 64 char const __kmp_version_omp_api[] = 65 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 66 67 #ifdef KMP_DEBUG 68 char const __kmp_version_lock[] = 69 KMP_VERSION_PREFIX "lock type: run time selectable"; 70 #endif /* KMP_DEBUG */ 71 72 #define KMP_MIN(x, y) ((x) < (y) ? (x) : (y)) 73 74 /* ------------------------------------------------------------------------ */ 75 76 #if KMP_USE_MONITOR 77 kmp_info_t __kmp_monitor; 78 #endif 79 80 /* Forward declarations */ 81 82 void __kmp_cleanup(void); 83 84 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 85 int gtid); 86 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 87 kmp_internal_control_t *new_icvs, 88 ident_t *loc); 89 #if KMP_AFFINITY_SUPPORTED 90 static void __kmp_partition_places(kmp_team_t *team, 91 int update_master_only = 0); 92 #endif 93 static void __kmp_do_serial_initialize(void); 94 void __kmp_fork_barrier(int gtid, int tid); 95 void __kmp_join_barrier(int gtid); 96 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 97 kmp_internal_control_t *new_icvs, ident_t *loc); 98 99 #ifdef USE_LOAD_BALANCE 100 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 101 #endif 102 103 static int __kmp_expand_threads(int nNeed); 104 #if KMP_OS_WINDOWS 105 static int __kmp_unregister_root_other_thread(int gtid); 106 #endif 107 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 108 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 109 110 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 111 int new_nthreads); 112 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads); 113 114 /* Calculate the identifier of the current thread */ 115 /* fast (and somewhat portable) way to get unique identifier of executing 116 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. 
*/ 117 int __kmp_get_global_thread_id() { 118 int i; 119 kmp_info_t **other_threads; 120 size_t stack_data; 121 char *stack_addr; 122 size_t stack_size; 123 char *stack_base; 124 125 KA_TRACE( 126 1000, 127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 128 __kmp_nth, __kmp_all_nth)); 129 130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 133 __kmp_init_gtid for this to work. */ 134 135 if (!TCR_4(__kmp_init_gtid)) 136 return KMP_GTID_DNE; 137 138 #ifdef KMP_TDATA_GTID 139 if (TCR_4(__kmp_gtid_mode) >= 3) { 140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 141 return __kmp_gtid; 142 } 143 #endif 144 if (TCR_4(__kmp_gtid_mode) >= 2) { 145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 146 return __kmp_gtid_get_specific(); 147 } 148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 149 150 stack_addr = (char *)&stack_data; 151 other_threads = __kmp_threads; 152 153 /* ATT: The code below is a source of potential bugs due to unsynchronized 154 access to __kmp_threads array. For example: 155 1. Current thread loads other_threads[i] to thr and checks it, it is 156 non-NULL. 157 2. Current thread is suspended by OS. 158 3. Another thread unregisters and finishes (debug versions of free() 159 may fill memory with something like 0xEF). 160 4. Current thread is resumed. 161 5. Current thread reads junk from *thr. 162 TODO: Fix it. --ln */ 163 164 for (i = 0; i < __kmp_threads_capacity; i++) { 165 166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 167 if (!thr) 168 continue; 169 170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 172 173 /* stack grows down -- search through all of the active threads */ 174 175 if (stack_addr <= stack_base) { 176 size_t stack_diff = stack_base - stack_addr; 177 178 if (stack_diff <= stack_size) { 179 /* The only way we can be closer than the allocated */ 180 /* stack size is if we are running on this thread. */ 181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 182 return i; 183 } 184 } 185 } 186 187 /* get specific to try and determine our gtid */ 188 KA_TRACE(1000, 189 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 190 "thread, using TLS\n")); 191 i = __kmp_gtid_get_specific(); 192 193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 194 195 /* if we havn't been assigned a gtid, then return code */ 196 if (i < 0) 197 return i; 198 199 /* dynamically updated stack window for uber threads to avoid get_specific 200 call */ 201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 202 KMP_FATAL(StackOverflow, i); 203 } 204 205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 206 if (stack_addr > stack_base) { 207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 210 stack_base); 211 } else { 212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 213 stack_base - stack_addr); 214 } 215 216 /* Reprint stack bounds for ubermaster since they have been refined */ 217 if (__kmp_storage_map) { 218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 221 other_threads[i]->th.th_info.ds.ds_stacksize, 222 "th_%d stack (refinement)", i); 223 } 224 return i; 225 } 226 227 int __kmp_get_global_thread_id_reg() { 228 int gtid; 229 230 if (!__kmp_init_serial) { 231 gtid = KMP_GTID_DNE; 232 } else 233 #ifdef KMP_TDATA_GTID 234 if (TCR_4(__kmp_gtid_mode) >= 3) { 235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 236 gtid = __kmp_gtid; 237 } else 238 #endif 239 if (TCR_4(__kmp_gtid_mode) >= 2) { 240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 241 gtid = __kmp_gtid_get_specific(); 242 } else { 243 KA_TRACE(1000, 244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 245 gtid = __kmp_get_global_thread_id(); 246 } 247 248 /* we must be a new uber master sibling thread */ 249 if (gtid == KMP_GTID_DNE) { 250 KA_TRACE(10, 251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 252 "Registering a new gtid.\n")); 253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 254 if (!__kmp_init_serial) { 255 __kmp_do_serial_initialize(); 256 gtid = __kmp_gtid_get_specific(); 257 } else { 258 gtid = __kmp_register_root(FALSE); 259 } 260 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 262 } 263 264 KMP_DEBUG_ASSERT(gtid >= 0); 265 266 return gtid; 267 } 268 269 /* caller must hold forkjoin_lock */ 270 void __kmp_check_stack_overlap(kmp_info_t *th) { 271 int f; 272 char *stack_beg = NULL; 273 char *stack_end = NULL; 274 int gtid; 275 276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 277 if (__kmp_storage_map) { 278 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 280 281 gtid = __kmp_gtid_from_thread(th); 282 283 if (gtid == KMP_GTID_MONITOR) { 284 __kmp_print_storage_map_gtid( 285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 286 "th_%s stack (%s)", "mon", 287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 288 } else { 289 __kmp_print_storage_map_gtid( 290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 291 "th_%d stack (%s)", gtid, 292 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 293 } 294 } 295 296 /* No point in checking ubermaster threads since they use refinement and 297 * cannot overlap */ 298 gtid = __kmp_gtid_from_thread(th); 299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 300 KA_TRACE(10, 301 ("__kmp_check_stack_overlap: performing extensive checking\n")); 302 if (stack_beg == NULL) { 303 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 305 } 306 307 for (f = 0; f < __kmp_threads_capacity; f++) { 308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 309 310 if (f_th && f_th != th) { 311 char *other_stack_end = 312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 313 char *other_stack_beg = 314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 316 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 317 318 /* Print the other stack values before the abort */ 319 if (__kmp_storage_map) 320 __kmp_print_storage_map_gtid( 321 -1, other_stack_beg, other_stack_end, 322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 324 325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 326 __kmp_msg_null); 327 } 328 } 329 } 330 } 331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 332 } 333 334 /* ------------------------------------------------------------------------ */ 335 336 void __kmp_infinite_loop(void) { 337 static int done = FALSE; 338 339 while (!done) { 340 KMP_YIELD(TRUE); 341 } 342 } 343 344 #define MAX_MESSAGE 512 345 346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 347 char const *format, ...) { 348 char buffer[MAX_MESSAGE]; 349 va_list ap; 350 351 va_start(ap, format); 352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 353 p2, (unsigned long)size, format); 354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 355 __kmp_vprintf(kmp_err, buffer, ap); 356 #if KMP_PRINT_DATA_PLACEMENT 357 int node; 358 if (gtid >= 0) { 359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 360 if (__kmp_storage_map_verbose) { 361 node = __kmp_get_host_node(p1); 362 if (node < 0) /* doesn't work, so don't try this next time */ 363 __kmp_storage_map_verbose = FALSE; 364 else { 365 char *last; 366 int lastNode; 367 int localProc = __kmp_get_cpu_from_gtid(gtid); 368 369 const int page_size = KMP_GET_PAGE_SIZE(); 370 371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 373 if (localProc >= 0) 374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 375 localProc >> 1); 376 else 377 __kmp_printf_no_lock(" GTID %d\n", gtid); 378 #if KMP_USE_PRCTL 379 /* The more elaborate format is disabled for now because of the prctl 380 * hanging bug. */ 381 do { 382 last = p1; 383 lastNode = node; 384 /* This loop collates adjacent pages with the same host node. 
*/ 385 do { 386 (char *)p1 += page_size; 387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 389 lastNode); 390 } while (p1 <= p2); 391 #else 392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 393 (char *)p1 + (page_size - 1), 394 __kmp_get_host_node(p1)); 395 if (p1 < p2) { 396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 397 (char *)p2 + (page_size - 1), 398 __kmp_get_host_node(p2)); 399 } 400 #endif 401 } 402 } 403 } else 404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 405 } 406 #endif /* KMP_PRINT_DATA_PLACEMENT */ 407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 408 } 409 410 void __kmp_warn(char const *format, ...) { 411 char buffer[MAX_MESSAGE]; 412 va_list ap; 413 414 if (__kmp_generate_warnings == kmp_warnings_off) { 415 return; 416 } 417 418 va_start(ap, format); 419 420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 422 __kmp_vprintf(kmp_err, buffer, ap); 423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 424 425 va_end(ap); 426 } 427 428 void __kmp_abort_process() { 429 // Later threads may stall here, but that's ok because abort() will kill them. 430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 431 432 if (__kmp_debug_buf) { 433 __kmp_dump_debug_buffer(); 434 } 435 436 if (KMP_OS_WINDOWS) { 437 // Let other threads know of abnormal termination and prevent deadlock 438 // if abort happened during library initialization or shutdown 439 __kmp_global.g.g_abort = SIGABRT; 440 441 /* On Windows* OS by default abort() causes pop-up error box, which stalls 442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 443 boxes. _set_abort_behavior() works well, but this function is not 444 available in VS7 (this is not problem for DLL, but it is a problem for 445 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 446 help, at least in some versions of MS C RTL. 447 448 It seems following sequence is the only way to simulate abort() and 449 avoid pop-up error box. */ 450 raise(SIGABRT); 451 _exit(3); // Just in case, if signal ignored, exit anyway. 452 } else { 453 __kmp_unregister_library(); 454 abort(); 455 } 456 457 __kmp_infinite_loop(); 458 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 459 460 } // __kmp_abort_process 461 462 void __kmp_abort_thread(void) { 463 // TODO: Eliminate g_abort global variable and this function. 464 // In case of abort just call abort(), it will kill all the threads. 465 __kmp_infinite_loop(); 466 } // __kmp_abort_thread 467 468 /* Print out the storage map for the major kmp_info_t thread data structures 469 that are allocated together. 
*/ 470 471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 473 gtid); 474 475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 476 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 477 478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 479 sizeof(kmp_local_t), "th_%d.th_local", gtid); 480 481 __kmp_print_storage_map_gtid( 482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 484 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 486 &thr->th.th_bar[bs_plain_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 488 gtid); 489 490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 491 &thr->th.th_bar[bs_forkjoin_barrier + 1], 492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 493 gtid); 494 495 #if KMP_FAST_REDUCTION_BARRIER 496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 497 &thr->th.th_bar[bs_reduction_barrier + 1], 498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 499 gtid); 500 #endif // KMP_FAST_REDUCTION_BARRIER 501 } 502 503 /* Print out the storage map for the major kmp_team_t team data structures 504 that are allocated together. */ 505 506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 507 int team_id, int num_thr) { 508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 510 header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 513 &team->t.t_bar[bs_last_barrier], 514 sizeof(kmp_balign_team_t) * bs_last_barrier, 515 "%s_%d.t_bar", header, team_id); 516 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 518 &team->t.t_bar[bs_plain_barrier + 1], 519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 520 header, team_id); 521 522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 523 &team->t.t_bar[bs_forkjoin_barrier + 1], 524 sizeof(kmp_balign_team_t), 525 "%s_%d.t_bar[forkjoin]", header, team_id); 526 527 #if KMP_FAST_REDUCTION_BARRIER 528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 529 &team->t.t_bar[bs_reduction_barrier + 1], 530 sizeof(kmp_balign_team_t), 531 "%s_%d.t_bar[reduction]", header, team_id); 532 #endif // KMP_FAST_REDUCTION_BARRIER 533 534 __kmp_print_storage_map_gtid( 535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 537 538 __kmp_print_storage_map_gtid( 539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 541 542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 543 &team->t.t_disp_buffer[num_disp_buff], 544 sizeof(dispatch_shared_info_t) * num_disp_buff, 545 "%s_%d.t_disp_buffer", header, team_id); 546 } 547 548 static void __kmp_init_allocator() { 549 __kmp_init_memkind(); 550 __kmp_init_target_mem(); 551 } 552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 553 554 /* ------------------------------------------------------------------------ */ 555 556 #if KMP_DYNAMIC_LIB 557 #if KMP_OS_WINDOWS 558 559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 561 562 
switch (fdwReason) { 563 564 case DLL_PROCESS_ATTACH: 565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 566 567 return TRUE; 568 569 case DLL_PROCESS_DETACH: 570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 571 572 // According to Windows* documentation for DllMain entry point: 573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 574 // lpReserved == NULL when FreeLibrary() is called, 575 // lpReserved != NULL when the process is terminated. 576 // When FreeLibrary() is called, worker threads remain alive. So the 577 // runtime's state is consistent and executing proper shutdown is OK. 578 // When the process is terminated, worker threads have exited or been 579 // forcefully terminated by the OS and only the shutdown thread remains. 580 // This can leave the runtime in an inconsistent state. 581 // Hence, only attempt proper cleanup when FreeLibrary() is called. 582 // Otherwise, rely on OS to reclaim resources. 583 if (lpReserved == NULL) 584 __kmp_internal_end_library(__kmp_gtid_get_specific()); 585 586 return TRUE; 587 588 case DLL_THREAD_ATTACH: 589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 590 591 /* if we want to register new siblings all the time here call 592 * __kmp_get_gtid(); */ 593 return TRUE; 594 595 case DLL_THREAD_DETACH: 596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 597 598 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 599 return TRUE; 600 } 601 602 return TRUE; 603 } 604 605 #endif /* KMP_OS_WINDOWS */ 606 #endif /* KMP_DYNAMIC_LIB */ 607 608 /* __kmp_parallel_deo -- Wait until it's our turn. */ 609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 610 int gtid = *gtid_ref; 611 #ifdef BUILD_PARALLEL_ORDERED 612 kmp_team_t *team = __kmp_team_from_gtid(gtid); 613 #endif /* BUILD_PARALLEL_ORDERED */ 614 615 if (__kmp_env_consistency_check) { 616 if (__kmp_threads[gtid]->th.th_root->r.r_active) 617 #if KMP_USE_DYNAMIC_LOCK 618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 619 #else 620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 621 #endif 622 } 623 #ifdef BUILD_PARALLEL_ORDERED 624 if (!team->t.t_serialized) { 625 KMP_MB(); 626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 627 NULL); 628 KMP_MB(); 629 } 630 #endif /* BUILD_PARALLEL_ORDERED */ 631 } 632 633 /* __kmp_parallel_dxo -- Signal the next task. */ 634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 635 int gtid = *gtid_ref; 636 #ifdef BUILD_PARALLEL_ORDERED 637 int tid = __kmp_tid_from_gtid(gtid); 638 kmp_team_t *team = __kmp_team_from_gtid(gtid); 639 #endif /* BUILD_PARALLEL_ORDERED */ 640 641 if (__kmp_env_consistency_check) { 642 if (__kmp_threads[gtid]->th.th_root->r.r_active) 643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 644 } 645 #ifdef BUILD_PARALLEL_ORDERED 646 if (!team->t.t_serialized) { 647 KMP_MB(); /* Flush all pending memory write invalidates. */ 648 649 /* use the tid of the next thread in this team */ 650 /* TODO replace with general release procedure */ 651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 652 653 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 654 } 655 #endif /* BUILD_PARALLEL_ORDERED */ 656 } 657 658 /* ------------------------------------------------------------------------ */ 659 /* The BARRIER for a SINGLE process section is always explicit */ 660 661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 662 int status; 663 kmp_info_t *th; 664 kmp_team_t *team; 665 666 if (!TCR_4(__kmp_init_parallel)) 667 __kmp_parallel_initialize(); 668 __kmp_resume_if_soft_paused(); 669 670 th = __kmp_threads[gtid]; 671 team = th->th.th_team; 672 status = 0; 673 674 th->th.th_ident = id_ref; 675 676 if (team->t.t_serialized) { 677 status = 1; 678 } else { 679 kmp_int32 old_this = th->th.th_local.this_construct; 680 681 ++th->th.th_local.this_construct; 682 /* try to set team count to thread count--success means thread got the 683 single block */ 684 /* TODO: Should this be acquire or release? */ 685 if (team->t.t_construct == old_this) { 686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 687 th->th.th_local.this_construct); 688 } 689 #if USE_ITT_BUILD 690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 692 team->t.t_active_level == 1) { 693 // Only report metadata by primary thread of active team at level 1 694 __kmp_itt_metadata_single(id_ref); 695 } 696 #endif /* USE_ITT_BUILD */ 697 } 698 699 if (__kmp_env_consistency_check) { 700 if (status && push_ws) { 701 __kmp_push_workshare(gtid, ct_psingle, id_ref); 702 } else { 703 __kmp_check_workshare(gtid, ct_psingle, id_ref); 704 } 705 } 706 #if USE_ITT_BUILD 707 if (status) { 708 __kmp_itt_single_start(gtid); 709 } 710 #endif /* USE_ITT_BUILD */ 711 return status; 712 } 713 714 void __kmp_exit_single(int gtid) { 715 #if USE_ITT_BUILD 716 __kmp_itt_single_end(gtid); 717 #endif /* USE_ITT_BUILD */ 718 if (__kmp_env_consistency_check) 719 __kmp_pop_workshare(gtid, ct_psingle, NULL); 720 } 721 722 /* determine if we can go parallel or must use a serialized parallel region and 723 * how many threads we can use 724 * set_nproc is the number of threads requested for the team 725 * returns 0 if we should serialize or only use one thread, 726 * otherwise the number of threads to use 727 * The forkjoin lock is held by the caller. */ 728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 729 int master_tid, int set_nthreads, 730 int enter_teams) { 731 int capacity; 732 int new_nthreads; 733 KMP_DEBUG_ASSERT(__kmp_init_serial); 734 KMP_DEBUG_ASSERT(root && parent_team); 735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 736 737 // If dyn-var is set, dynamically adjust the number of desired threads, 738 // according to the method specified by dynamic_mode. 739 new_nthreads = set_nthreads; 740 if (!get__dynamic_2(parent_team, master_tid)) { 741 ; 742 } 743 #ifdef USE_LOAD_BALANCE 744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 746 if (new_nthreads == 1) { 747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 748 "reservation to 1 thread\n", 749 master_tid)); 750 return 1; 751 } 752 if (new_nthreads < set_nthreads) { 753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 754 "reservation to %d threads\n", 755 master_tid, new_nthreads)); 756 } 757 } 758 #endif /* USE_LOAD_BALANCE */ 759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 760 new_nthreads = __kmp_avail_proc - __kmp_nth + 761 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 762 if (new_nthreads <= 1) { 763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 764 "reservation to 1 thread\n", 765 master_tid)); 766 return 1; 767 } 768 if (new_nthreads < set_nthreads) { 769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 770 "reservation to %d threads\n", 771 master_tid, new_nthreads)); 772 } else { 773 new_nthreads = set_nthreads; 774 } 775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 776 if (set_nthreads > 2) { 777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 778 new_nthreads = (new_nthreads % set_nthreads) + 1; 779 if (new_nthreads == 1) { 780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 781 "reservation to 1 thread\n", 782 master_tid)); 783 return 1; 784 } 785 if (new_nthreads < set_nthreads) { 786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 787 "reservation to %d threads\n", 788 master_tid, new_nthreads)); 789 } 790 } 791 } else { 792 KMP_ASSERT(0); 793 } 794 795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 796 if (__kmp_nth + new_nthreads - 797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 798 __kmp_max_nth) { 799 int tl_nthreads = __kmp_max_nth - __kmp_nth + 800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 801 if (tl_nthreads <= 0) { 802 tl_nthreads = 1; 803 } 804 805 // If dyn-var is false, emit a 1-time warning. 806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 807 __kmp_reserve_warn = 1; 808 __kmp_msg(kmp_ms_warning, 809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 811 } 812 if (tl_nthreads == 1) { 813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 814 "reduced reservation to 1 thread\n", 815 master_tid)); 816 return 1; 817 } 818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 819 "reservation to %d threads\n", 820 master_tid, tl_nthreads)); 821 new_nthreads = tl_nthreads; 822 } 823 824 // Respect OMP_THREAD_LIMIT 825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 827 if (cg_nthreads + new_nthreads - 828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 829 max_cg_threads) { 830 int tl_nthreads = max_cg_threads - cg_nthreads + 831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 832 if (tl_nthreads <= 0) { 833 tl_nthreads = 1; 834 } 835 836 // If dyn-var is false, emit a 1-time warning. 837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 838 __kmp_reserve_warn = 1; 839 __kmp_msg(kmp_ms_warning, 840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 842 } 843 if (tl_nthreads == 1) { 844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 845 "reduced reservation to 1 thread\n", 846 master_tid)); 847 return 1; 848 } 849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 850 "reservation to %d threads\n", 851 master_tid, tl_nthreads)); 852 new_nthreads = tl_nthreads; 853 } 854 855 // Check if the threads array is large enough, or needs expanding. 856 // See comment in __kmp_register_root() about the adjustment if 857 // __kmp_threads[0] == NULL. 
858 capacity = __kmp_threads_capacity; 859 if (TCR_PTR(__kmp_threads[0]) == NULL) { 860 --capacity; 861 } 862 // If it is not for initializing the hidden helper team, we need to take 863 // __kmp_hidden_helper_threads_num out of the capacity because it is included 864 // in __kmp_threads_capacity. 865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 866 capacity -= __kmp_hidden_helper_threads_num; 867 } 868 if (__kmp_nth + new_nthreads - 869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 870 capacity) { 871 // Expand the threads array. 872 int slotsRequired = __kmp_nth + new_nthreads - 873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 874 capacity; 875 int slotsAdded = __kmp_expand_threads(slotsRequired); 876 if (slotsAdded < slotsRequired) { 877 // The threads array was not expanded enough. 878 new_nthreads -= (slotsRequired - slotsAdded); 879 KMP_ASSERT(new_nthreads >= 1); 880 881 // If dyn-var is false, emit a 1-time warning. 882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 883 __kmp_reserve_warn = 1; 884 if (__kmp_tp_cached) { 885 __kmp_msg(kmp_ms_warning, 886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 889 } else { 890 __kmp_msg(kmp_ms_warning, 891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 893 } 894 } 895 } 896 } 897 898 #ifdef KMP_DEBUG 899 if (new_nthreads == 1) { 900 KC_TRACE(10, 901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 902 "dead roots and rechecking; requested %d threads\n", 903 __kmp_get_gtid(), set_nthreads)); 904 } else { 905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 906 " %d threads\n", 907 __kmp_get_gtid(), new_nthreads, set_nthreads)); 908 } 909 #endif // KMP_DEBUG 910 return new_nthreads; 911 } 912 913 /* Allocate threads from the thread pool and assign them to the new team. We are 914 assured that there are enough threads available, because we checked on that 915 earlier within critical section forkjoin */ 916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 917 kmp_info_t *master_th, int master_gtid) { 918 int i; 919 int use_hot_team; 920 921 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 922 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 923 KMP_MB(); 924 925 /* first, let's setup the primary thread */ 926 master_th->th.th_info.ds.ds_tid = 0; 927 master_th->th.th_team = team; 928 master_th->th.th_team_nproc = team->t.t_nproc; 929 master_th->th.th_team_master = master_th; 930 master_th->th.th_team_serialized = FALSE; 931 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 932 933 /* make sure we are not the optimized hot team */ 934 #if KMP_NESTED_HOT_TEAMS 935 use_hot_team = 0; 936 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 937 if (hot_teams) { // hot teams array is not allocated if 938 // KMP_HOT_TEAMS_MAX_LEVEL=0 939 int level = team->t.t_active_level - 1; // index in array of hot teams 940 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
941 if (master_th->th.th_teams_size.nteams > 1) { 942 ++level; // level was not increased in teams construct for 943 // team_of_masters 944 } 945 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 946 master_th->th.th_teams_level == team->t.t_level) { 947 ++level; // level was not increased in teams construct for 948 // team_of_workers before the parallel 949 } // team->t.t_level will be increased inside parallel 950 } 951 if (level < __kmp_hot_teams_max_level) { 952 if (hot_teams[level].hot_team) { 953 // hot team has already been allocated for given level 954 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 955 use_hot_team = 1; // the team is ready to use 956 } else { 957 use_hot_team = 0; // AC: threads are not allocated yet 958 hot_teams[level].hot_team = team; // remember new hot team 959 hot_teams[level].hot_team_nth = team->t.t_nproc; 960 } 961 } else { 962 use_hot_team = 0; 963 } 964 } 965 #else 966 use_hot_team = team == root->r.r_hot_team; 967 #endif 968 if (!use_hot_team) { 969 970 /* install the primary thread */ 971 team->t.t_threads[0] = master_th; 972 __kmp_initialize_info(master_th, team, 0, master_gtid); 973 974 /* now, install the worker threads */ 975 for (i = 1; i < team->t.t_nproc; i++) { 976 977 /* fork or reallocate a new thread and install it in team */ 978 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 979 team->t.t_threads[i] = thr; 980 KMP_DEBUG_ASSERT(thr); 981 KMP_DEBUG_ASSERT(thr->th.th_team == team); 982 /* align team and thread arrived states */ 983 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 984 "T#%d(%d:%d) join =%llu, plain=%llu\n", 985 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 986 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 987 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 988 team->t.t_bar[bs_plain_barrier].b_arrived)); 989 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 990 thr->th.th_teams_level = master_th->th.th_teams_level; 991 thr->th.th_teams_size = master_th->th.th_teams_size; 992 { // Initialize threads' barrier data. 993 int b; 994 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 995 for (b = 0; b < bs_last_barrier; ++b) { 996 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 997 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 998 #if USE_DEBUGGER 999 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1000 #endif 1001 } 1002 } 1003 } 1004 1005 #if KMP_AFFINITY_SUPPORTED 1006 __kmp_partition_places(team); 1007 #endif 1008 } 1009 1010 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1011 for (i = 0; i < team->t.t_nproc; i++) { 1012 kmp_info_t *thr = team->t.t_threads[i]; 1013 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1014 thr->th.th_prev_level != team->t.t_level) { 1015 team->t.t_display_affinity = 1; 1016 break; 1017 } 1018 } 1019 } 1020 1021 KMP_MB(); 1022 } 1023 1024 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1025 // Propagate any changes to the floating point control registers out to the team 1026 // We try to avoid unnecessary writes to the relevant cache line in the team 1027 // structure, so we don't make changes unless they are needed. 
1028 inline static void propagateFPControl(kmp_team_t *team) { 1029 if (__kmp_inherit_fp_control) { 1030 kmp_int16 x87_fpu_control_word; 1031 kmp_uint32 mxcsr; 1032 1033 // Get primary thread's values of FPU control flags (both X87 and vector) 1034 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1035 __kmp_store_mxcsr(&mxcsr); 1036 mxcsr &= KMP_X86_MXCSR_MASK; 1037 1038 // There is no point looking at t_fp_control_saved here. 1039 // If it is TRUE, we still have to update the values if they are different 1040 // from those we now have. If it is FALSE we didn't save anything yet, but 1041 // our objective is the same. We have to ensure that the values in the team 1042 // are the same as those we have. 1043 // So, this code achieves what we need whether or not t_fp_control_saved is 1044 // true. By checking whether the value needs updating we avoid unnecessary 1045 // writes that would put the cache-line into a written state, causing all 1046 // threads in the team to have to read it again. 1047 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1048 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1049 // Although we don't use this value, other code in the runtime wants to know 1050 // whether it should restore them. So we must ensure it is correct. 1051 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1052 } else { 1053 // Similarly here. Don't write to this cache-line in the team structure 1054 // unless we have to. 1055 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1056 } 1057 } 1058 1059 // Do the opposite, setting the hardware registers to the updated values from 1060 // the team. 1061 inline static void updateHWFPControl(kmp_team_t *team) { 1062 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1063 // Only reset the fp control regs if they have been changed in the team. 1064 // the parallel region that we are exiting. 1065 kmp_int16 x87_fpu_control_word; 1066 kmp_uint32 mxcsr; 1067 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1068 __kmp_store_mxcsr(&mxcsr); 1069 mxcsr &= KMP_X86_MXCSR_MASK; 1070 1071 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1072 __kmp_clear_x87_fpu_status_word(); 1073 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1074 } 1075 1076 if (team->t.t_mxcsr != mxcsr) { 1077 __kmp_load_mxcsr(&team->t.t_mxcsr); 1078 } 1079 } 1080 } 1081 #else 1082 #define propagateFPControl(x) ((void)0) 1083 #define updateHWFPControl(x) ((void)0) 1084 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1085 1086 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1087 int realloc); // forward declaration 1088 1089 /* Run a parallel region that has been serialized, so runs only in a team of the 1090 single primary thread. 
*/ 1091 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1092 kmp_info_t *this_thr; 1093 kmp_team_t *serial_team; 1094 1095 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1096 1097 /* Skip all this code for autopar serialized loops since it results in 1098 unacceptable overhead */ 1099 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1100 return; 1101 1102 if (!TCR_4(__kmp_init_parallel)) 1103 __kmp_parallel_initialize(); 1104 __kmp_resume_if_soft_paused(); 1105 1106 this_thr = __kmp_threads[global_tid]; 1107 serial_team = this_thr->th.th_serial_team; 1108 1109 /* utilize the serialized team held by this thread */ 1110 KMP_DEBUG_ASSERT(serial_team); 1111 KMP_MB(); 1112 1113 if (__kmp_tasking_mode != tskm_immediate_exec) { 1114 KMP_DEBUG_ASSERT( 1115 this_thr->th.th_task_team == 1116 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1117 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1118 NULL); 1119 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1120 "team %p, new task_team = NULL\n", 1121 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1122 this_thr->th.th_task_team = NULL; 1123 } 1124 1125 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1126 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1127 proc_bind = proc_bind_false; 1128 } else if (proc_bind == proc_bind_default) { 1129 // No proc_bind clause was specified, so use the current value 1130 // of proc-bind-var for this parallel region. 1131 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1132 } 1133 // Reset for next parallel region 1134 this_thr->th.th_set_proc_bind = proc_bind_default; 1135 1136 #if OMPT_SUPPORT 1137 ompt_data_t ompt_parallel_data = ompt_data_none; 1138 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1139 if (ompt_enabled.enabled && 1140 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1141 1142 ompt_task_info_t *parent_task_info; 1143 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1144 1145 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1146 if (ompt_enabled.ompt_callback_parallel_begin) { 1147 int team_size = 1; 1148 1149 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1150 &(parent_task_info->task_data), &(parent_task_info->frame), 1151 &ompt_parallel_data, team_size, 1152 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1153 } 1154 } 1155 #endif // OMPT_SUPPORT 1156 1157 if (this_thr->th.th_team != serial_team) { 1158 // Nested level will be an index in the nested nthreads array 1159 int level = this_thr->th.th_team->t.t_level; 1160 1161 if (serial_team->t.t_serialized) { 1162 /* this serial team was already used 1163 TODO increase performance by making this locks more specific */ 1164 kmp_team_t *new_team; 1165 1166 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1167 1168 new_team = 1169 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1170 #if OMPT_SUPPORT 1171 ompt_parallel_data, 1172 #endif 1173 proc_bind, &this_thr->th.th_current_task->td_icvs, 1174 0 USE_NESTED_HOT_ARG(NULL)); 1175 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1176 KMP_ASSERT(new_team); 1177 1178 /* setup new serialized team and install it */ 1179 new_team->t.t_threads[0] = this_thr; 1180 new_team->t.t_parent = this_thr->th.th_team; 1181 serial_team = new_team; 1182 this_thr->th.th_serial_team = serial_team; 1183 1184 KF_TRACE( 1185 10, 1186 ("__kmpc_serialized_parallel: T#%d 
allocated new serial team %p\n", 1187 global_tid, serial_team)); 1188 1189 /* TODO the above breaks the requirement that if we run out of resources, 1190 then we can still guarantee that serialized teams are ok, since we may 1191 need to allocate a new one */ 1192 } else { 1193 KF_TRACE( 1194 10, 1195 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1196 global_tid, serial_team)); 1197 } 1198 1199 /* we have to initialize this serial team */ 1200 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1201 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1202 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1203 serial_team->t.t_ident = loc; 1204 serial_team->t.t_serialized = 1; 1205 serial_team->t.t_nproc = 1; 1206 serial_team->t.t_parent = this_thr->th.th_team; 1207 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1208 this_thr->th.th_team = serial_team; 1209 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1210 1211 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, 1212 this_thr->th.th_current_task)); 1213 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1214 this_thr->th.th_current_task->td_flags.executing = 0; 1215 1216 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1217 1218 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1219 implicit task for each serialized task represented by 1220 team->t.t_serialized? */ 1221 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1222 &this_thr->th.th_current_task->td_parent->td_icvs); 1223 1224 // Thread value exists in the nested nthreads array for the next nested 1225 // level 1226 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1227 this_thr->th.th_current_task->td_icvs.nproc = 1228 __kmp_nested_nth.nth[level + 1]; 1229 } 1230 1231 if (__kmp_nested_proc_bind.used && 1232 (level + 1 < __kmp_nested_proc_bind.used)) { 1233 this_thr->th.th_current_task->td_icvs.proc_bind = 1234 __kmp_nested_proc_bind.bind_types[level + 1]; 1235 } 1236 1237 #if USE_DEBUGGER 1238 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1239 #endif 1240 this_thr->th.th_info.ds.ds_tid = 0; 1241 1242 /* set thread cache values */ 1243 this_thr->th.th_team_nproc = 1; 1244 this_thr->th.th_team_master = this_thr; 1245 this_thr->th.th_team_serialized = 1; 1246 1247 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1248 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1249 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1250 1251 propagateFPControl(serial_team); 1252 1253 /* check if we need to allocate dispatch buffers stack */ 1254 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1255 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1256 serial_team->t.t_dispatch->th_disp_buffer = 1257 (dispatch_private_info_t *)__kmp_allocate( 1258 sizeof(dispatch_private_info_t)); 1259 } 1260 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1261 1262 KMP_MB(); 1263 1264 } else { 1265 /* this serialized team is already being used, 1266 * that's fine, just add another nested level */ 1267 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1268 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1269 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1270 ++serial_team->t.t_serialized; 1271 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1272 1273 // Nested level will be an index in the nested nthreads array 1274 int level = this_thr->th.th_team->t.t_level; 1275 // Thread value exists in the nested nthreads array for the next nested 1276 // level 1277 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1278 this_thr->th.th_current_task->td_icvs.nproc = 1279 __kmp_nested_nth.nth[level + 1]; 1280 } 1281 serial_team->t.t_level++; 1282 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1283 "of serial team %p to %d\n", 1284 global_tid, serial_team, serial_team->t.t_level)); 1285 1286 /* allocate/push dispatch buffers stack */ 1287 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1288 { 1289 dispatch_private_info_t *disp_buffer = 1290 (dispatch_private_info_t *)__kmp_allocate( 1291 sizeof(dispatch_private_info_t)); 1292 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1293 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1294 } 1295 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1296 1297 KMP_MB(); 1298 } 1299 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1300 1301 // Perform the display affinity functionality for 1302 // serialized parallel regions 1303 if (__kmp_display_affinity) { 1304 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1305 this_thr->th.th_prev_num_threads != 1) { 1306 // NULL means use the affinity-format-var ICV 1307 __kmp_aux_display_affinity(global_tid, NULL); 1308 this_thr->th.th_prev_level = serial_team->t.t_level; 1309 this_thr->th.th_prev_num_threads = 1; 1310 } 1311 } 1312 1313 if (__kmp_env_consistency_check) 1314 __kmp_push_parallel(global_tid, NULL); 1315 #if OMPT_SUPPORT 1316 serial_team->t.ompt_team_info.master_return_address = codeptr; 1317 if (ompt_enabled.enabled && 1318 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1319 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1320 OMPT_GET_FRAME_ADDRESS(0); 1321 1322 ompt_lw_taskteam_t lw_taskteam; 1323 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1324 &ompt_parallel_data, codeptr); 1325 1326 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1327 // don't use lw_taskteam after linking. 
content was swaped 1328 1329 /* OMPT implicit task begin */ 1330 if (ompt_enabled.ompt_callback_implicit_task) { 1331 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1332 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1333 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1334 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1335 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1336 __kmp_tid_from_gtid(global_tid); 1337 } 1338 1339 /* OMPT state */ 1340 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1341 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1342 OMPT_GET_FRAME_ADDRESS(0); 1343 } 1344 #endif 1345 } 1346 1347 /* most of the work for a fork */ 1348 /* return true if we really went parallel, false if serialized */ 1349 int __kmp_fork_call(ident_t *loc, int gtid, 1350 enum fork_context_e call_context, // Intel, GNU, ... 1351 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1352 kmp_va_list ap) { 1353 void **argv; 1354 int i; 1355 int master_tid; 1356 int master_this_cons; 1357 kmp_team_t *team; 1358 kmp_team_t *parent_team; 1359 kmp_info_t *master_th; 1360 kmp_root_t *root; 1361 int nthreads; 1362 int master_active; 1363 int master_set_numthreads; 1364 int level; 1365 int active_level; 1366 int teams_level; 1367 #if KMP_NESTED_HOT_TEAMS 1368 kmp_hot_team_ptr_t **p_hot_teams; 1369 #endif 1370 { // KMP_TIME_BLOCK 1371 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1372 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1373 1374 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1375 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1376 /* Some systems prefer the stack for the root thread(s) to start with */ 1377 /* some gap from the parent stack to prevent false sharing. */ 1378 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1379 /* These 2 lines below are so this does not get optimized out */ 1380 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1381 __kmp_stkpadding += (short)((kmp_int64)dummy); 1382 } 1383 1384 /* initialize if needed */ 1385 KMP_DEBUG_ASSERT( 1386 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1387 if (!TCR_4(__kmp_init_parallel)) 1388 __kmp_parallel_initialize(); 1389 __kmp_resume_if_soft_paused(); 1390 1391 /* setup current data */ 1392 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1393 // shutdown 1394 parent_team = master_th->th.th_team; 1395 master_tid = master_th->th.th_info.ds.ds_tid; 1396 master_this_cons = master_th->th.th_local.this_construct; 1397 root = master_th->th.th_root; 1398 master_active = root->r.r_active; 1399 master_set_numthreads = master_th->th.th_set_nproc; 1400 1401 #if OMPT_SUPPORT 1402 ompt_data_t ompt_parallel_data = ompt_data_none; 1403 ompt_data_t *parent_task_data; 1404 ompt_frame_t *ompt_frame; 1405 ompt_data_t *implicit_task_data; 1406 void *return_address = NULL; 1407 1408 if (ompt_enabled.enabled) { 1409 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1410 NULL, NULL); 1411 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1412 } 1413 #endif 1414 1415 // Assign affinity to root thread if it hasn't happened yet 1416 __kmp_assign_root_init_mask(); 1417 1418 // Nested level will be an index in the nested nthreads array 1419 level = parent_team->t.t_level; 1420 // used to launch non-serial teams even if nested is not allowed 1421 active_level = parent_team->t.t_active_level; 1422 // needed to check nesting inside the teams 1423 teams_level = master_th->th.th_teams_level; 1424 #if 
KMP_NESTED_HOT_TEAMS 1425 p_hot_teams = &master_th->th.th_hot_teams; 1426 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1427 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1428 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1429 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1430 // it is either actual or not needed (when active_level > 0) 1431 (*p_hot_teams)[0].hot_team_nth = 1; 1432 } 1433 #endif 1434 1435 #if OMPT_SUPPORT 1436 if (ompt_enabled.enabled) { 1437 if (ompt_enabled.ompt_callback_parallel_begin) { 1438 int team_size = master_set_numthreads 1439 ? master_set_numthreads 1440 : get__nproc_2(parent_team, master_tid); 1441 int flags = OMPT_INVOKER(call_context) | 1442 ((microtask == (microtask_t)__kmp_teams_master) 1443 ? ompt_parallel_league 1444 : ompt_parallel_team); 1445 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1446 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1447 return_address); 1448 } 1449 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1450 } 1451 #endif 1452 1453 master_th->th.th_ident = loc; 1454 1455 if (master_th->th.th_teams_microtask && ap && 1456 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1457 // AC: This is start of parallel that is nested inside teams construct. 1458 // The team is actual (hot), all workers are ready at the fork barrier. 1459 // No lock needed to initialize the team a bit, then free workers. 1460 parent_team->t.t_ident = loc; 1461 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1462 parent_team->t.t_argc = argc; 1463 argv = (void **)parent_team->t.t_argv; 1464 for (i = argc - 1; i >= 0; --i) 1465 *argv++ = va_arg(kmp_va_deref(ap), void *); 1466 // Increment our nested depth levels, but not increase the serialization 1467 if (parent_team == master_th->th.th_serial_team) { 1468 // AC: we are in serialized parallel 1469 __kmpc_serialized_parallel(loc, gtid); 1470 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1471 1472 if (call_context == fork_context_gnu) { 1473 // AC: need to decrement t_serialized for enquiry functions to work 1474 // correctly, will restore at join time 1475 parent_team->t.t_serialized--; 1476 return TRUE; 1477 } 1478 1479 #if OMPD_SUPPORT 1480 parent_team->t.t_pkfn = microtask; 1481 #endif 1482 1483 #if OMPT_SUPPORT 1484 void *dummy; 1485 void **exit_frame_p; 1486 1487 ompt_lw_taskteam_t lw_taskteam; 1488 1489 if (ompt_enabled.enabled) { 1490 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1491 &ompt_parallel_data, return_address); 1492 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1493 1494 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1495 // don't use lw_taskteam after linking. 
content was swaped 1496 1497 /* OMPT implicit task begin */ 1498 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1499 if (ompt_enabled.ompt_callback_implicit_task) { 1500 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1501 __kmp_tid_from_gtid(gtid); 1502 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1503 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1504 implicit_task_data, 1, 1505 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1506 } 1507 1508 /* OMPT state */ 1509 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1510 } else { 1511 exit_frame_p = &dummy; 1512 } 1513 #endif 1514 // AC: need to decrement t_serialized for enquiry functions to work 1515 // correctly, will restore at join time 1516 parent_team->t.t_serialized--; 1517 1518 { 1519 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1520 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1521 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1522 #if OMPT_SUPPORT 1523 , 1524 exit_frame_p 1525 #endif 1526 ); 1527 } 1528 1529 #if OMPT_SUPPORT 1530 if (ompt_enabled.enabled) { 1531 *exit_frame_p = NULL; 1532 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1533 if (ompt_enabled.ompt_callback_implicit_task) { 1534 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1535 ompt_scope_end, NULL, implicit_task_data, 1, 1536 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1537 } 1538 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1539 __ompt_lw_taskteam_unlink(master_th); 1540 if (ompt_enabled.ompt_callback_parallel_end) { 1541 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1542 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1543 OMPT_INVOKER(call_context) | ompt_parallel_team, 1544 return_address); 1545 } 1546 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1547 } 1548 #endif 1549 return TRUE; 1550 } 1551 1552 parent_team->t.t_pkfn = microtask; 1553 parent_team->t.t_invoke = invoker; 1554 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1555 parent_team->t.t_active_level++; 1556 parent_team->t.t_level++; 1557 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1558 1559 #if OMPT_SUPPORT 1560 if (ompt_enabled.enabled) { 1561 ompt_lw_taskteam_t lw_taskteam; 1562 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1563 &ompt_parallel_data, return_address); 1564 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1565 } 1566 #endif 1567 1568 /* Change number of threads in the team if requested */ 1569 if (master_set_numthreads) { // The parallel has num_threads clause 1570 if (master_set_numthreads <= master_th->th.th_teams_size.nth) { 1571 // AC: only can reduce number of threads dynamically, can't increase 1572 kmp_info_t **other_threads = parent_team->t.t_threads; 1573 // NOTE: if using distributed barrier, we need to run this code block 1574 // even when the team size appears not to have changed from the max. 
1575 int old_proc = master_th->th.th_teams_size.nth; 1576 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 1577 bp_dist_bar) { 1578 __kmp_resize_dist_barrier(parent_team, old_proc, 1579 master_set_numthreads); 1580 __kmp_add_threads_to_team(parent_team, master_set_numthreads); 1581 } 1582 parent_team->t.t_nproc = master_set_numthreads; 1583 for (i = 0; i < master_set_numthreads; ++i) { 1584 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1585 } 1586 } 1587 // Keep extra threads hot in the team for possible next parallels 1588 master_th->th.th_set_nproc = 0; 1589 } 1590 1591 #if USE_DEBUGGER 1592 if (__kmp_debugging) { // Let debugger override number of threads. 1593 int nth = __kmp_omp_num_threads(loc); 1594 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1595 master_set_numthreads = nth; 1596 } 1597 } 1598 #endif 1599 1600 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1601 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1602 KMP_ITT_DEBUG) && 1603 __kmp_forkjoin_frames_mode == 3 && 1604 parent_team->t.t_active_level == 1 // only report frames at level 1 1605 && master_th->th.th_teams_size.nteams == 1) { 1606 kmp_uint64 tmp_time = __itt_get_timestamp(); 1607 master_th->th.th_frame_time = tmp_time; 1608 parent_team->t.t_region_time = tmp_time; 1609 } 1610 if (__itt_stack_caller_create_ptr) { 1611 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1612 // create new stack stitching id before entering fork barrier 1613 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1614 } 1615 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1616 1617 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1618 "master_th=%p, gtid=%d\n", 1619 root, parent_team, master_th, gtid)); 1620 __kmp_internal_fork(loc, gtid, parent_team); 1621 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1622 "master_th=%p, gtid=%d\n", 1623 root, parent_team, master_th, gtid)); 1624 1625 if (call_context == fork_context_gnu) 1626 return TRUE; 1627 1628 /* Invoke microtask for PRIMARY thread */ 1629 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1630 parent_team->t.t_id, parent_team->t.t_pkfn)); 1631 1632 if (!parent_team->t.t_invoke(gtid)) { 1633 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1634 } 1635 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1636 parent_team->t.t_id, parent_team->t.t_pkfn)); 1637 KMP_MB(); /* Flush all pending memory write invalidates. */ 1638 1639 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1640 1641 return TRUE; 1642 } // Parallel closely nested in teams construct 1643 1644 #if KMP_DEBUG 1645 if (__kmp_tasking_mode != tskm_immediate_exec) { 1646 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1647 parent_team->t.t_task_team[master_th->th.th_task_state]); 1648 } 1649 #endif 1650 1651 // Need this to happen before we determine the number of threads, not while 1652 // we are allocating the team 1653 //__kmp_push_current_task_to_thread(master_th, parent_team, 0); 1654 int enter_teams = 0; 1655 if (parent_team->t.t_active_level >= 1656 master_th->th.th_current_task->td_icvs.max_active_levels) { 1657 nthreads = 1; 1658 } else { 1659 enter_teams = ((ap == NULL && active_level == 0) || 1660 (ap && teams_level > 0 && teams_level == level)); 1661 nthreads = master_set_numthreads 1662 ? 
master_set_numthreads 1663 // TODO: get nproc directly from current task 1664 : get__nproc_2(parent_team, master_tid); 1665 // Check if we need to take forkjoin lock? (no need for serialized 1666 // parallel out of teams construct). This code moved here from 1667 // __kmp_reserve_threads() to speedup nested serialized parallels. 1668 if (nthreads > 1) { 1669 if ((get__max_active_levels(master_th) == 1 && 1670 (root->r.r_in_parallel && !enter_teams)) || 1671 (__kmp_library == library_serial)) { 1672 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1673 " threads\n", 1674 gtid, nthreads)); 1675 nthreads = 1; 1676 } 1677 } 1678 if (nthreads > 1) { 1679 /* determine how many new threads we can use */ 1680 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1681 /* AC: If we execute teams from parallel region (on host), then teams 1682 should be created but each can only have 1 thread if nesting is 1683 disabled. If teams called from serial region, then teams and their 1684 threads should be created regardless of the nesting setting. */ 1685 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1686 nthreads, enter_teams); 1687 if (nthreads == 1) { 1688 // Free lock for single thread execution here; for multi-thread 1689 // execution it will be freed later after team of threads created 1690 // and initialized 1691 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1692 } 1693 } 1694 } 1695 KMP_DEBUG_ASSERT(nthreads > 0); 1696 1697 // If we temporarily changed the set number of threads then restore it now 1698 master_th->th.th_set_nproc = 0; 1699 1700 /* create a serialized parallel region? */ 1701 if (nthreads == 1) { 1702 /* josh todo: hypothetical question: what do we do for OS X*? */ 1703 #if KMP_OS_LINUX && \ 1704 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1705 void *args[argc]; 1706 #else 1707 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1708 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1709 KMP_ARCH_AARCH64) */ 1710 1711 KA_TRACE(20, 1712 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1713 1714 __kmpc_serialized_parallel(loc, gtid); 1715 1716 #if OMPD_SUPPORT 1717 master_th->th.th_serial_team->t.t_pkfn = microtask; 1718 #endif 1719 1720 if (call_context == fork_context_intel) { 1721 /* TODO this sucks, use the compiler itself to pass args! :) */ 1722 master_th->th.th_serial_team->t.t_ident = loc; 1723 if (!ap) { 1724 // revert change made in __kmpc_serialized_parallel() 1725 master_th->th.th_serial_team->t.t_level--; 1726 // Get args from parent team for teams construct 1727 1728 #if OMPT_SUPPORT 1729 void *dummy; 1730 void **exit_frame_p; 1731 ompt_task_info_t *task_info; 1732 1733 ompt_lw_taskteam_t lw_taskteam; 1734 1735 if (ompt_enabled.enabled) { 1736 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1737 &ompt_parallel_data, return_address); 1738 1739 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1740 // don't use lw_taskteam after linking. 
content was swaped 1741 1742 task_info = OMPT_CUR_TASK_INFO(master_th); 1743 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1744 if (ompt_enabled.ompt_callback_implicit_task) { 1745 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1746 __kmp_tid_from_gtid(gtid); 1747 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1748 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1749 &(task_info->task_data), 1, 1750 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1751 ompt_task_implicit); 1752 } 1753 1754 /* OMPT state */ 1755 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1756 } else { 1757 exit_frame_p = &dummy; 1758 } 1759 #endif 1760 1761 { 1762 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1763 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1764 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1765 parent_team->t.t_argv 1766 #if OMPT_SUPPORT 1767 , 1768 exit_frame_p 1769 #endif 1770 ); 1771 } 1772 1773 #if OMPT_SUPPORT 1774 if (ompt_enabled.enabled) { 1775 *exit_frame_p = NULL; 1776 if (ompt_enabled.ompt_callback_implicit_task) { 1777 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1778 ompt_scope_end, NULL, &(task_info->task_data), 1, 1779 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1780 ompt_task_implicit); 1781 } 1782 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1783 __ompt_lw_taskteam_unlink(master_th); 1784 if (ompt_enabled.ompt_callback_parallel_end) { 1785 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1786 &ompt_parallel_data, parent_task_data, 1787 OMPT_INVOKER(call_context) | ompt_parallel_team, 1788 return_address); 1789 } 1790 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1791 } 1792 #endif 1793 } else if (microtask == (microtask_t)__kmp_teams_master) { 1794 KMP_DEBUG_ASSERT(master_th->th.th_team == 1795 master_th->th.th_serial_team); 1796 team = master_th->th.th_team; 1797 // team->t.t_pkfn = microtask; 1798 team->t.t_invoke = invoker; 1799 __kmp_alloc_argv_entries(argc, team, TRUE); 1800 team->t.t_argc = argc; 1801 argv = (void **)team->t.t_argv; 1802 if (ap) { 1803 for (i = argc - 1; i >= 0; --i) 1804 *argv++ = va_arg(kmp_va_deref(ap), void *); 1805 } else { 1806 for (i = 0; i < argc; ++i) 1807 // Get args from parent team for teams construct 1808 argv[i] = parent_team->t.t_argv[i]; 1809 } 1810 // AC: revert change made in __kmpc_serialized_parallel() 1811 // because initial code in teams should have level=0 1812 team->t.t_level--; 1813 // AC: call special invoker for outer "parallel" of teams construct 1814 invoker(gtid); 1815 #if OMPT_SUPPORT 1816 if (ompt_enabled.enabled) { 1817 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1818 if (ompt_enabled.ompt_callback_implicit_task) { 1819 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1820 ompt_scope_end, NULL, &(task_info->task_data), 0, 1821 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1822 } 1823 if (ompt_enabled.ompt_callback_parallel_end) { 1824 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1825 &ompt_parallel_data, parent_task_data, 1826 OMPT_INVOKER(call_context) | ompt_parallel_league, 1827 return_address); 1828 } 1829 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1830 } 1831 #endif 1832 } else { 1833 argv = args; 1834 for (i = argc - 1; i >= 0; --i) 1835 *argv++ = va_arg(kmp_va_deref(ap), void *); 1836 KMP_MB(); 1837 1838 #if OMPT_SUPPORT 1839 void *dummy; 1840 void **exit_frame_p; 1841 ompt_task_info_t *task_info; 1842 1843 ompt_lw_taskteam_t lw_taskteam; 1844 1845 if (ompt_enabled.enabled) { 1846 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1847 &ompt_parallel_data, return_address); 1848 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1849 // don't use lw_taskteam after linking. content was swaped 1850 task_info = OMPT_CUR_TASK_INFO(master_th); 1851 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1852 1853 /* OMPT implicit task begin */ 1854 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1855 if (ompt_enabled.ompt_callback_implicit_task) { 1856 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1857 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1858 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1859 ompt_task_implicit); 1860 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1861 __kmp_tid_from_gtid(gtid); 1862 } 1863 1864 /* OMPT state */ 1865 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1866 } else { 1867 exit_frame_p = &dummy; 1868 } 1869 #endif 1870 1871 { 1872 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1873 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1874 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1875 #if OMPT_SUPPORT 1876 , 1877 exit_frame_p 1878 #endif 1879 ); 1880 } 1881 1882 #if OMPT_SUPPORT 1883 if (ompt_enabled.enabled) { 1884 *exit_frame_p = NULL; 1885 if (ompt_enabled.ompt_callback_implicit_task) { 1886 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1887 ompt_scope_end, NULL, &(task_info->task_data), 1, 1888 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1889 ompt_task_implicit); 1890 } 1891 1892 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1893 __ompt_lw_taskteam_unlink(master_th); 1894 if (ompt_enabled.ompt_callback_parallel_end) { 1895 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1896 &ompt_parallel_data, parent_task_data, 1897 OMPT_INVOKER(call_context) | ompt_parallel_team, 1898 return_address); 1899 } 1900 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1901 } 1902 #endif 1903 } 1904 } else if (call_context == fork_context_gnu) { 1905 #if OMPT_SUPPORT 1906 ompt_lw_taskteam_t lwt; 1907 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1908 return_address); 1909 1910 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1911 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1912 // don't use lw_taskteam after linking. 
content was swaped 1913 #endif 1914 1915 // we were called from GNU native code 1916 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1917 return FALSE; 1918 } else { 1919 KMP_ASSERT2(call_context < fork_context_last, 1920 "__kmp_fork_call: unknown fork_context parameter"); 1921 } 1922 1923 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1924 KMP_MB(); 1925 return FALSE; 1926 } // if (nthreads == 1) 1927 1928 // GEH: only modify the executing flag in the case when not serialized 1929 // serialized case is handled in kmpc_serialized_parallel 1930 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1931 "curtask=%p, curtask_max_aclevel=%d\n", 1932 parent_team->t.t_active_level, master_th, 1933 master_th->th.th_current_task, 1934 master_th->th.th_current_task->td_icvs.max_active_levels)); 1935 // TODO: GEH - cannot do this assertion because root thread not set up as 1936 // executing 1937 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1938 master_th->th.th_current_task->td_flags.executing = 0; 1939 1940 if (!master_th->th.th_teams_microtask || level > teams_level) { 1941 /* Increment our nested depth level */ 1942 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1943 } 1944 1945 // See if we need to make a copy of the ICVs. 1946 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1947 if ((level + 1 < __kmp_nested_nth.used) && 1948 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1949 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1950 } else { 1951 nthreads_icv = 0; // don't update 1952 } 1953 1954 // Figure out the proc_bind_policy for the new team. 1955 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1956 kmp_proc_bind_t proc_bind_icv = 1957 proc_bind_default; // proc_bind_default means don't update 1958 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1959 proc_bind = proc_bind_false; 1960 } else { 1961 if (proc_bind == proc_bind_default) { 1962 // No proc_bind clause specified; use current proc-bind-var for this 1963 // parallel region 1964 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1965 } 1966 /* else: The proc_bind policy was specified explicitly on parallel clause. 1967 This overrides proc-bind-var for this parallel region, but does not 1968 change proc-bind-var. */ 1969 // Figure the value of proc-bind-var for the child threads. 
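      // Illustration (assumes OMP_PROC_BIND="spread,close" was parsed into
      // __kmp_nested_proc_bind): the outer parallel binds with "spread" while
      // bind_types[level + 1] supplies "close" as the proc-bind-var that the
      // child threads will inherit.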
1970 if ((level + 1 < __kmp_nested_proc_bind.used) && 1971 (__kmp_nested_proc_bind.bind_types[level + 1] != 1972 master_th->th.th_current_task->td_icvs.proc_bind)) { 1973 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1974 } 1975 } 1976 1977 // Reset for next parallel region 1978 master_th->th.th_set_proc_bind = proc_bind_default; 1979 1980 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1981 kmp_internal_control_t new_icvs; 1982 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1983 new_icvs.next = NULL; 1984 if (nthreads_icv > 0) { 1985 new_icvs.nproc = nthreads_icv; 1986 } 1987 if (proc_bind_icv != proc_bind_default) { 1988 new_icvs.proc_bind = proc_bind_icv; 1989 } 1990 1991 /* allocate a new parallel team */ 1992 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1993 team = __kmp_allocate_team(root, nthreads, nthreads, 1994 #if OMPT_SUPPORT 1995 ompt_parallel_data, 1996 #endif 1997 proc_bind, &new_icvs, 1998 argc USE_NESTED_HOT_ARG(master_th)); 1999 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2000 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); 2001 } else { 2002 /* allocate a new parallel team */ 2003 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2004 team = __kmp_allocate_team(root, nthreads, nthreads, 2005 #if OMPT_SUPPORT 2006 ompt_parallel_data, 2007 #endif 2008 proc_bind, 2009 &master_th->th.th_current_task->td_icvs, 2010 argc USE_NESTED_HOT_ARG(master_th)); 2011 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2012 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, 2013 &master_th->th.th_current_task->td_icvs); 2014 } 2015 KF_TRACE( 2016 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2017 2018 /* setup the new team */ 2019 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2020 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2021 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2022 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2023 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2024 #if OMPT_SUPPORT 2025 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2026 return_address); 2027 #endif 2028 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2029 // TODO: parent_team->t.t_level == INT_MAX ??? 2030 if (!master_th->th.th_teams_microtask || level > teams_level) { 2031 int new_level = parent_team->t.t_level + 1; 2032 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2033 new_level = parent_team->t.t_active_level + 1; 2034 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2035 } else { 2036 // AC: Do not increase parallel level at start of the teams construct 2037 int new_level = parent_team->t.t_level; 2038 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2039 new_level = parent_team->t.t_active_level; 2040 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2041 } 2042 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2043 // set primary thread's schedule as new run-time schedule 2044 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2045 2046 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2047 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2048 2049 // Update the floating point rounding in the team if required. 
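    // (When FP-control inheritance is enabled, propagateFPControl records the
    // primary thread's FP control state -- x87 control word and MXCSR on x86 --
    // in the team so workers can adopt it; see updateHWFPControl at join time.)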
2050 propagateFPControl(team); 2051 #if OMPD_SUPPORT 2052 if (ompd_state & OMPD_ENABLE_BP) 2053 ompd_bp_parallel_begin(); 2054 #endif 2055 2056 if (__kmp_tasking_mode != tskm_immediate_exec) { 2057 // Set primary thread's task team to team's task team. Unless this is hot 2058 // team, it should be NULL. 2059 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2060 parent_team->t.t_task_team[master_th->th.th_task_state]); 2061 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2062 "%p, new task_team %p / team %p\n", 2063 __kmp_gtid_from_thread(master_th), 2064 master_th->th.th_task_team, parent_team, 2065 team->t.t_task_team[master_th->th.th_task_state], team)); 2066 2067 if (active_level || master_th->th.th_task_team) { 2068 // Take a memo of primary thread's task_state 2069 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2070 if (master_th->th.th_task_state_top >= 2071 master_th->th.th_task_state_stack_sz) { // increase size 2072 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2073 kmp_uint8 *old_stack, *new_stack; 2074 kmp_uint32 i; 2075 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2076 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2077 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2078 } 2079 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2080 ++i) { // zero-init rest of stack 2081 new_stack[i] = 0; 2082 } 2083 old_stack = master_th->th.th_task_state_memo_stack; 2084 master_th->th.th_task_state_memo_stack = new_stack; 2085 master_th->th.th_task_state_stack_sz = new_size; 2086 __kmp_free(old_stack); 2087 } 2088 // Store primary thread's task_state on stack 2089 master_th->th 2090 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2091 master_th->th.th_task_state; 2092 master_th->th.th_task_state_top++; 2093 #if KMP_NESTED_HOT_TEAMS 2094 if (master_th->th.th_hot_teams && 2095 active_level < __kmp_hot_teams_max_level && 2096 team == master_th->th.th_hot_teams[active_level].hot_team) { 2097 // Restore primary thread's nested state if nested hot team 2098 master_th->th.th_task_state = 2099 master_th->th 2100 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2101 } else { 2102 #endif 2103 master_th->th.th_task_state = 0; 2104 #if KMP_NESTED_HOT_TEAMS 2105 } 2106 #endif 2107 } 2108 #if !KMP_NESTED_HOT_TEAMS 2109 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2110 (team == root->r.r_hot_team)); 2111 #endif 2112 } 2113 2114 KA_TRACE( 2115 20, 2116 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2117 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2118 team->t.t_nproc)); 2119 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2120 (team->t.t_master_tid == 0 && 2121 (team->t.t_parent == root->r.r_root_team || 2122 team->t.t_parent->t.t_serialized))); 2123 KMP_MB(); 2124 2125 /* now, setup the arguments */ 2126 argv = (void **)team->t.t_argv; 2127 if (ap) { 2128 for (i = argc - 1; i >= 0; --i) { 2129 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2130 KMP_CHECK_UPDATE(*argv, new_argv); 2131 argv++; 2132 } 2133 } else { 2134 for (i = 0; i < argc; ++i) { 2135 // Get args from parent team for teams construct 2136 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2137 } 2138 } 2139 2140 /* now actually fork the threads */ 2141 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2142 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2143 root->r.r_active = TRUE; 2144 2145 __kmp_fork_team_threads(root, 
team, master_th, gtid); 2146 __kmp_setup_icv_copy(team, nthreads, 2147 &master_th->th.th_current_task->td_icvs, loc); 2148 2149 #if OMPT_SUPPORT 2150 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2151 #endif 2152 2153 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2154 2155 #if USE_ITT_BUILD 2156 if (team->t.t_active_level == 1 // only report frames at level 1 2157 && !master_th->th.th_teams_microtask) { // not in teams construct 2158 #if USE_ITT_NOTIFY 2159 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2160 (__kmp_forkjoin_frames_mode == 3 || 2161 __kmp_forkjoin_frames_mode == 1)) { 2162 kmp_uint64 tmp_time = 0; 2163 if (__itt_get_timestamp_ptr) 2164 tmp_time = __itt_get_timestamp(); 2165 // Internal fork - report frame begin 2166 master_th->th.th_frame_time = tmp_time; 2167 if (__kmp_forkjoin_frames_mode == 3) 2168 team->t.t_region_time = tmp_time; 2169 } else 2170 // only one notification scheme (either "submit" or "forking/joined", not both) 2171 #endif /* USE_ITT_NOTIFY */ 2172 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2173 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2174 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2175 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2176 } 2177 } 2178 #endif /* USE_ITT_BUILD */ 2179 2180 /* now go on and do the work */ 2181 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2182 KMP_MB(); 2183 KF_TRACE(10, 2184 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2185 root, team, master_th, gtid)); 2186 2187 #if USE_ITT_BUILD 2188 if (__itt_stack_caller_create_ptr) { 2189 // create new stack stitching id before entering fork barrier 2190 if (!enter_teams) { 2191 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2192 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2193 } else if (parent_team->t.t_serialized) { 2194 // keep stack stitching id in the serialized parent_team; 2195 // current team will be used for parallel inside the teams; 2196 // if parent_team is active, then it already keeps stack stitching id 2197 // for the league of teams 2198 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2199 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2200 } 2201 } 2202 #endif /* USE_ITT_BUILD */ 2203 2204 // AC: skip __kmp_internal_fork at teams construct, let only primary 2205 // threads execute 2206 if (ap) { 2207 __kmp_internal_fork(loc, gtid, team); 2208 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2209 "master_th=%p, gtid=%d\n", 2210 root, team, master_th, gtid)); 2211 } 2212 2213 if (call_context == fork_context_gnu) { 2214 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2215 return TRUE; 2216 } 2217 2218 /* Invoke microtask for PRIMARY thread */ 2219 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2220 team->t.t_id, team->t.t_pkfn)); 2221 } // END of timer KMP_fork_call block 2222 2223 #if KMP_STATS_ENABLED 2224 // If beginning a teams construct, then change thread state 2225 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2226 if (!ap) { 2227 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2228 } 2229 #endif 2230 2231 if (!team->t.t_invoke(gtid)) { 2232 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2233 } 2234 2235 #if KMP_STATS_ENABLED 2236 // If was beginning of a teams construct, then reset thread state 2237 if (!ap) { 2238 KMP_SET_THREAD_STATE(previous_state); 2239 } 2240 #endif 2241 2242 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done 
microtask = %p\n", gtid, 2243 team->t.t_id, team->t.t_pkfn)); 2244 KMP_MB(); /* Flush all pending memory write invalidates. */ 2245 2246 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2247 #if OMPT_SUPPORT 2248 if (ompt_enabled.enabled) { 2249 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2250 } 2251 #endif 2252 2253 return TRUE; 2254 } 2255 2256 #if OMPT_SUPPORT 2257 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2258 kmp_team_t *team) { 2259 // restore state outside the region 2260 thread->th.ompt_thread_info.state = 2261 ((team->t.t_serialized) ? ompt_state_work_serial 2262 : ompt_state_work_parallel); 2263 } 2264 2265 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2266 kmp_team_t *team, ompt_data_t *parallel_data, 2267 int flags, void *codeptr) { 2268 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2269 if (ompt_enabled.ompt_callback_parallel_end) { 2270 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2271 parallel_data, &(task_info->task_data), flags, codeptr); 2272 } 2273 2274 task_info->frame.enter_frame = ompt_data_none; 2275 __kmp_join_restore_state(thread, team); 2276 } 2277 #endif 2278 2279 void __kmp_join_call(ident_t *loc, int gtid 2280 #if OMPT_SUPPORT 2281 , 2282 enum fork_context_e fork_context 2283 #endif 2284 , 2285 int exit_teams) { 2286 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2287 kmp_team_t *team; 2288 kmp_team_t *parent_team; 2289 kmp_info_t *master_th; 2290 kmp_root_t *root; 2291 int master_active; 2292 2293 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2294 2295 /* setup current data */ 2296 master_th = __kmp_threads[gtid]; 2297 root = master_th->th.th_root; 2298 team = master_th->th.th_team; 2299 parent_team = team->t.t_parent; 2300 2301 master_th->th.th_ident = loc; 2302 2303 #if OMPT_SUPPORT 2304 void *team_microtask = (void *)team->t.t_pkfn; 2305 // For GOMP interface with serialized parallel, need the 2306 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2307 // and end-parallel events. 
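  // In that GOMP + serialized case we therefore skip the state change below
  // and leave the OMPT bookkeeping to __kmpc_end_serialized_parallel.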
2308 if (ompt_enabled.enabled && 2309 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2310 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2311 } 2312 #endif 2313 2314 #if KMP_DEBUG 2315 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2316 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2317 "th_task_team = %p\n", 2318 __kmp_gtid_from_thread(master_th), team, 2319 team->t.t_task_team[master_th->th.th_task_state], 2320 master_th->th.th_task_team)); 2321 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2322 team->t.t_task_team[master_th->th.th_task_state]); 2323 } 2324 #endif 2325 2326 if (team->t.t_serialized) { 2327 if (master_th->th.th_teams_microtask) { 2328 // We are in teams construct 2329 int level = team->t.t_level; 2330 int tlevel = master_th->th.th_teams_level; 2331 if (level == tlevel) { 2332 // AC: we haven't incremented it earlier at start of teams construct, 2333 // so do it here - at the end of teams construct 2334 team->t.t_level++; 2335 } else if (level == tlevel + 1) { 2336 // AC: we are exiting parallel inside teams, need to increment 2337 // serialization in order to restore it in the next call to 2338 // __kmpc_end_serialized_parallel 2339 team->t.t_serialized++; 2340 } 2341 } 2342 __kmpc_end_serialized_parallel(loc, gtid); 2343 2344 #if OMPT_SUPPORT 2345 if (ompt_enabled.enabled) { 2346 __kmp_join_restore_state(master_th, parent_team); 2347 } 2348 #endif 2349 2350 return; 2351 } 2352 2353 master_active = team->t.t_master_active; 2354 2355 if (!exit_teams) { 2356 // AC: No barrier for internal teams at exit from teams construct. 2357 // But there is barrier for external team (league). 2358 __kmp_internal_join(loc, gtid, team); 2359 #if USE_ITT_BUILD 2360 if (__itt_stack_caller_create_ptr) { 2361 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2362 // destroy the stack stitching id after join barrier 2363 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2364 team->t.t_stack_id = NULL; 2365 } 2366 #endif 2367 } else { 2368 master_th->th.th_task_state = 2369 0; // AC: no tasking in teams (out of any parallel) 2370 #if USE_ITT_BUILD 2371 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2372 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2373 // destroy the stack stitching id on exit from the teams construct 2374 // if parent_team is active, then the id will be destroyed later on 2375 // by master of the league of teams 2376 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2377 parent_team->t.t_stack_id = NULL; 2378 } 2379 #endif 2380 2381 if (team->t.t_nproc > 1 && 2382 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2383 team->t.b->update_num_threads(team->t.t_nproc); 2384 __kmp_add_threads_to_team(team, team->t.t_nproc); 2385 } 2386 } 2387 2388 KMP_MB(); 2389 2390 #if OMPT_SUPPORT 2391 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2392 void *codeptr = team->t.ompt_team_info.master_return_address; 2393 #endif 2394 2395 #if USE_ITT_BUILD 2396 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
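  // Only level-1 regions outside a multi-team teams construct are reported:
  // __kmp_forkjoin_frames_mode == 3 submits the region frame via
  // __kmp_itt_frame_submit using the timestamps recorded at fork, while mode 0
  // with __kmp_forkjoin_frames set emits __kmp_itt_region_joined (paired with
  // __kmp_itt_region_forking at fork time).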
2397 if (team->t.t_active_level == 1 && 2398 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2399 master_th->th.th_teams_size.nteams == 1)) { 2400 master_th->th.th_ident = loc; 2401 // only one notification scheme (either "submit" or "forking/joined", not 2402 // both) 2403 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2404 __kmp_forkjoin_frames_mode == 3) 2405 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2406 master_th->th.th_frame_time, 0, loc, 2407 master_th->th.th_team_nproc, 1); 2408 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2409 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2410 __kmp_itt_region_joined(gtid); 2411 } // active_level == 1 2412 #endif /* USE_ITT_BUILD */ 2413 2414 if (master_th->th.th_teams_microtask && !exit_teams && 2415 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2416 team->t.t_level == master_th->th.th_teams_level + 1) { 2417 // AC: We need to leave the team structure intact at the end of parallel 2418 // inside the teams construct, so that at the next parallel same (hot) team 2419 // works, only adjust nesting levels 2420 #if OMPT_SUPPORT 2421 ompt_data_t ompt_parallel_data = ompt_data_none; 2422 if (ompt_enabled.enabled) { 2423 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2424 if (ompt_enabled.ompt_callback_implicit_task) { 2425 int ompt_team_size = team->t.t_nproc; 2426 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2427 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2428 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2429 } 2430 task_info->frame.exit_frame = ompt_data_none; 2431 task_info->task_data = ompt_data_none; 2432 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2433 __ompt_lw_taskteam_unlink(master_th); 2434 } 2435 #endif 2436 /* Decrement our nested depth level */ 2437 team->t.t_level--; 2438 team->t.t_active_level--; 2439 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2440 2441 // Restore number of threads in the team if needed. This code relies on 2442 // the proper adjustment of th_teams_size.nth after the fork in 2443 // __kmp_teams_master on each teams primary thread in the case that 2444 // __kmp_reserve_threads reduced it. 2445 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2446 int old_num = master_th->th.th_team_nproc; 2447 int new_num = master_th->th.th_teams_size.nth; 2448 kmp_info_t **other_threads = team->t.t_threads; 2449 team->t.t_nproc = new_num; 2450 for (int i = 0; i < old_num; ++i) { 2451 other_threads[i]->th.th_team_nproc = new_num; 2452 } 2453 // Adjust states of non-used threads of the team 2454 for (int i = old_num; i < new_num; ++i) { 2455 // Re-initialize thread's barrier data. 
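          // Copying the team's current b_arrived keeps the re-activated
          // thread's barrier epoch in phase with the team, so it will not trip
          // the next fork/join or plain barrier.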
2456 KMP_DEBUG_ASSERT(other_threads[i]); 2457 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2458 for (int b = 0; b < bs_last_barrier; ++b) { 2459 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2460 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2461 #if USE_DEBUGGER 2462 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2463 #endif 2464 } 2465 if (__kmp_tasking_mode != tskm_immediate_exec) { 2466 // Synchronize thread's task state 2467 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2468 } 2469 } 2470 } 2471 2472 #if OMPT_SUPPORT 2473 if (ompt_enabled.enabled) { 2474 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2475 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2476 } 2477 #endif 2478 2479 return; 2480 } 2481 2482 /* do cleanup and restore the parent team */ 2483 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2484 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2485 2486 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2487 2488 /* jc: The following lock has instructions with REL and ACQ semantics, 2489 separating the parallel user code called in this parallel region 2490 from the serial user code called after this function returns. */ 2491 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2492 2493 if (!master_th->th.th_teams_microtask || 2494 team->t.t_level > master_th->th.th_teams_level) { 2495 /* Decrement our nested depth level */ 2496 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2497 } 2498 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2499 2500 #if OMPT_SUPPORT 2501 if (ompt_enabled.enabled) { 2502 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2503 if (ompt_enabled.ompt_callback_implicit_task) { 2504 int flags = (team_microtask == (void *)__kmp_teams_master) 2505 ? ompt_task_initial 2506 : ompt_task_implicit; 2507 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2508 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2509 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2510 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2511 } 2512 task_info->frame.exit_frame = ompt_data_none; 2513 task_info->task_data = ompt_data_none; 2514 } 2515 #endif 2516 2517 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2518 master_th, team)); 2519 __kmp_pop_current_task_from_thread(master_th); 2520 2521 #if KMP_AFFINITY_SUPPORTED 2522 // Restore master thread's partition. 2523 master_th->th.th_first_place = team->t.t_first_place; 2524 master_th->th.th_last_place = team->t.t_last_place; 2525 #endif // KMP_AFFINITY_SUPPORTED 2526 master_th->th.th_def_allocator = team->t.t_def_allocator; 2527 2528 #if OMPD_SUPPORT 2529 if (ompd_state & OMPD_ENABLE_BP) 2530 ompd_bp_parallel_end(); 2531 #endif 2532 updateHWFPControl(team); 2533 2534 if (root->r.r_active != master_active) 2535 root->r.r_active = master_active; 2536 2537 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2538 master_th)); // this will free worker threads 2539 2540 /* this race was fun to find. make sure the following is in the critical 2541 region otherwise assertions may fail occasionally since the old team may be 2542 reallocated and the hierarchy appears inconsistent. it is actually safe to 2543 run and won't cause any bugs, but will cause those assertion failures. 
it's 2544 only one deref&assign so might as well put this in the critical region */ 2545 master_th->th.th_team = parent_team; 2546 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2547 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2548 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2549 2550 /* restore serialized team, if need be */ 2551 if (parent_team->t.t_serialized && 2552 parent_team != master_th->th.th_serial_team && 2553 parent_team != root->r.r_root_team) { 2554 __kmp_free_team(root, 2555 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2556 master_th->th.th_serial_team = parent_team; 2557 } 2558 2559 if (__kmp_tasking_mode != tskm_immediate_exec) { 2560 if (master_th->th.th_task_state_top > 2561 0) { // Restore task state from memo stack 2562 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2563 // Remember primary thread's state if we re-use this nested hot team 2564 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2565 master_th->th.th_task_state; 2566 --master_th->th.th_task_state_top; // pop 2567 // Now restore state at this level 2568 master_th->th.th_task_state = 2569 master_th->th 2570 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2571 } 2572 // Copy the task team from the parent team to the primary thread 2573 master_th->th.th_task_team = 2574 parent_team->t.t_task_team[master_th->th.th_task_state]; 2575 KA_TRACE(20, 2576 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2577 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2578 parent_team)); 2579 } 2580 2581 // TODO: GEH - cannot do this assertion because root thread not set up as 2582 // executing 2583 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2584 master_th->th.th_current_task->td_flags.executing = 1; 2585 2586 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2587 2588 #if OMPT_SUPPORT 2589 int flags = 2590 OMPT_INVOKER(fork_context) | 2591 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2592 : ompt_parallel_team); 2593 if (ompt_enabled.enabled) { 2594 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2595 codeptr); 2596 } 2597 #endif 2598 2599 KMP_MB(); 2600 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2601 } 2602 2603 /* Check whether we should push an internal control record onto the 2604 serial team stack. If so, do it. 
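   At most one record is kept per serialized nesting level (keyed by
   serial_nesting_level), so repeated ICV changes at the same level do not push
   additional records.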
*/ 2605 void __kmp_save_internal_controls(kmp_info_t *thread) { 2606 2607 if (thread->th.th_team != thread->th.th_serial_team) { 2608 return; 2609 } 2610 if (thread->th.th_team->t.t_serialized > 1) { 2611 int push = 0; 2612 2613 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2614 push = 1; 2615 } else { 2616 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2617 thread->th.th_team->t.t_serialized) { 2618 push = 1; 2619 } 2620 } 2621 if (push) { /* push a record on the serial team's stack */ 2622 kmp_internal_control_t *control = 2623 (kmp_internal_control_t *)__kmp_allocate( 2624 sizeof(kmp_internal_control_t)); 2625 2626 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2627 2628 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2629 2630 control->next = thread->th.th_team->t.t_control_stack_top; 2631 thread->th.th_team->t.t_control_stack_top = control; 2632 } 2633 } 2634 } 2635 2636 /* Changes set_nproc */ 2637 void __kmp_set_num_threads(int new_nth, int gtid) { 2638 kmp_info_t *thread; 2639 kmp_root_t *root; 2640 2641 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2642 KMP_DEBUG_ASSERT(__kmp_init_serial); 2643 2644 if (new_nth < 1) 2645 new_nth = 1; 2646 else if (new_nth > __kmp_max_nth) 2647 new_nth = __kmp_max_nth; 2648 2649 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2650 thread = __kmp_threads[gtid]; 2651 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2652 return; // nothing to do 2653 2654 __kmp_save_internal_controls(thread); 2655 2656 set__nproc(thread, new_nth); 2657 2658 // If this omp_set_num_threads() call will cause the hot team size to be 2659 // reduced (in the absence of a num_threads clause), then reduce it now, 2660 // rather than waiting for the next parallel region. 2661 root = thread->th.th_root; 2662 if (__kmp_init_parallel && (!root->r.r_active) && 2663 (root->r.r_hot_team->t.t_nproc > new_nth) 2664 #if KMP_NESTED_HOT_TEAMS 2665 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2666 #endif 2667 ) { 2668 kmp_team_t *hot_team = root->r.r_hot_team; 2669 int f; 2670 2671 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2672 2673 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2674 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); 2675 } 2676 // Release the extra threads we don't need any more. 2677 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2678 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2679 if (__kmp_tasking_mode != tskm_immediate_exec) { 2680 // When decreasing team size, threads no longer in the team should unref 2681 // task team. 2682 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2683 } 2684 __kmp_free_thread(hot_team->t.t_threads[f]); 2685 hot_team->t.t_threads[f] = NULL; 2686 } 2687 hot_team->t.t_nproc = new_nth; 2688 #if KMP_NESTED_HOT_TEAMS 2689 if (thread->th.th_hot_teams) { 2690 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2691 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2692 } 2693 #endif 2694 2695 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2696 hot_team->t.b->update_num_threads(new_nth); 2697 __kmp_add_threads_to_team(hot_team, new_nth); 2698 } 2699 2700 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2701 2702 // Update the t_nproc field in the threads that are still active. 
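    // (This refreshes each remaining thread's cached th_team_nproc; the team's
    // own t_nproc was already set to new_nth above.)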
2703 for (f = 0; f < new_nth; f++) { 2704 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2705 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2706 } 2707 // Special flag in case omp_set_num_threads() call 2708 hot_team->t.t_size_changed = -1; 2709 } 2710 } 2711 2712 /* Changes max_active_levels */ 2713 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2714 kmp_info_t *thread; 2715 2716 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2717 "%d = (%d)\n", 2718 gtid, max_active_levels)); 2719 KMP_DEBUG_ASSERT(__kmp_init_serial); 2720 2721 // validate max_active_levels 2722 if (max_active_levels < 0) { 2723 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2724 // We ignore this call if the user has specified a negative value. 2725 // The current setting won't be changed. The last valid setting will be 2726 // used. A warning will be issued (if warnings are allowed as controlled by 2727 // the KMP_WARNINGS env var). 2728 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2729 "max_active_levels for thread %d = (%d)\n", 2730 gtid, max_active_levels)); 2731 return; 2732 } 2733 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2734 // it's OK, the max_active_levels is within the valid range: [ 0; 2735 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2736 // We allow a zero value. (implementation defined behavior) 2737 } else { 2738 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2739 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2740 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2741 // Current upper limit is MAX_INT. (implementation defined behavior) 2742 // If the input exceeds the upper limit, we correct the input to be the 2743 // upper limit. (implementation defined behavior) 2744 // Actually, the flow should never get here until we use MAX_INT limit. 
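    // (Reaching this branch means the request exceeded
    // KMP_MAX_ACTIVE_LEVELS_LIMIT; the clamped value is applied below via
    // set__max_active_levels just like any valid request.)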
2745 } 2746 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2747 "max_active_levels for thread %d = (%d)\n", 2748 gtid, max_active_levels)); 2749 2750 thread = __kmp_threads[gtid]; 2751 2752 __kmp_save_internal_controls(thread); 2753 2754 set__max_active_levels(thread, max_active_levels); 2755 } 2756 2757 /* Gets max_active_levels */ 2758 int __kmp_get_max_active_levels(int gtid) { 2759 kmp_info_t *thread; 2760 2761 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2762 KMP_DEBUG_ASSERT(__kmp_init_serial); 2763 2764 thread = __kmp_threads[gtid]; 2765 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2766 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2767 "curtask_maxaclevel=%d\n", 2768 gtid, thread->th.th_current_task, 2769 thread->th.th_current_task->td_icvs.max_active_levels)); 2770 return thread->th.th_current_task->td_icvs.max_active_levels; 2771 } 2772 2773 // nteams-var per-device ICV 2774 void __kmp_set_num_teams(int num_teams) { 2775 if (num_teams > 0) 2776 __kmp_nteams = num_teams; 2777 } 2778 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2779 // teams-thread-limit-var per-device ICV 2780 void __kmp_set_teams_thread_limit(int limit) { 2781 if (limit > 0) 2782 __kmp_teams_thread_limit = limit; 2783 } 2784 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2785 2786 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2787 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2788 2789 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2790 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2791 kmp_info_t *thread; 2792 kmp_sched_t orig_kind; 2793 // kmp_team_t *team; 2794 2795 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2796 gtid, (int)kind, chunk)); 2797 KMP_DEBUG_ASSERT(__kmp_init_serial); 2798 2799 // Check if the kind parameter is valid, correct if needed. 2800 // Valid parameters should fit in one of two intervals - standard or extended: 2801 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2802 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2803 orig_kind = kind; 2804 kind = __kmp_sched_without_mods(kind); 2805 2806 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2807 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2808 // TODO: Hint needs attention in case we change the default schedule. 2809 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2810 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2811 __kmp_msg_null); 2812 kind = kmp_sched_default; 2813 chunk = 0; // ignore chunk value in case of bad kind 2814 } 2815 2816 thread = __kmp_threads[gtid]; 2817 2818 __kmp_save_internal_controls(thread); 2819 2820 if (kind < kmp_sched_upper_std) { 2821 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2822 // differ static chunked vs. 
unchunked: chunk should be invalid to 2823 // indicate unchunked schedule (which is the default) 2824 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2825 } else { 2826 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2827 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2828 } 2829 } else { 2830 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2831 // kmp_sched_lower - 2 ]; 2832 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2833 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2834 kmp_sched_lower - 2]; 2835 } 2836 __kmp_sched_apply_mods_intkind( 2837 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2838 if (kind == kmp_sched_auto || chunk < 1) { 2839 // ignore parameter chunk for schedule auto 2840 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2841 } else { 2842 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2843 } 2844 } 2845 2846 /* Gets def_sched_var ICV values */ 2847 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2848 kmp_info_t *thread; 2849 enum sched_type th_type; 2850 2851 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2852 KMP_DEBUG_ASSERT(__kmp_init_serial); 2853 2854 thread = __kmp_threads[gtid]; 2855 2856 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2857 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2858 case kmp_sch_static: 2859 case kmp_sch_static_greedy: 2860 case kmp_sch_static_balanced: 2861 *kind = kmp_sched_static; 2862 __kmp_sched_apply_mods_stdkind(kind, th_type); 2863 *chunk = 0; // chunk was not set, try to show this fact via zero value 2864 return; 2865 case kmp_sch_static_chunked: 2866 *kind = kmp_sched_static; 2867 break; 2868 case kmp_sch_dynamic_chunked: 2869 *kind = kmp_sched_dynamic; 2870 break; 2871 case kmp_sch_guided_chunked: 2872 case kmp_sch_guided_iterative_chunked: 2873 case kmp_sch_guided_analytical_chunked: 2874 *kind = kmp_sched_guided; 2875 break; 2876 case kmp_sch_auto: 2877 *kind = kmp_sched_auto; 2878 break; 2879 case kmp_sch_trapezoidal: 2880 *kind = kmp_sched_trapezoidal; 2881 break; 2882 #if KMP_STATIC_STEAL_ENABLED 2883 case kmp_sch_static_steal: 2884 *kind = kmp_sched_static_steal; 2885 break; 2886 #endif 2887 default: 2888 KMP_FATAL(UnknownSchedulingType, th_type); 2889 } 2890 2891 __kmp_sched_apply_mods_stdkind(kind, th_type); 2892 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2893 } 2894 2895 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2896 2897 int ii, dd; 2898 kmp_team_t *team; 2899 kmp_info_t *thr; 2900 2901 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2902 KMP_DEBUG_ASSERT(__kmp_init_serial); 2903 2904 // validate level 2905 if (level == 0) 2906 return 0; 2907 if (level < 0) 2908 return -1; 2909 thr = __kmp_threads[gtid]; 2910 team = thr->th.th_team; 2911 ii = team->t.t_level; 2912 if (level > ii) 2913 return -1; 2914 2915 if (thr->th.th_teams_microtask) { 2916 // AC: we are in teams region where multiple nested teams have same level 2917 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2918 if (level <= 2919 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2920 KMP_DEBUG_ASSERT(ii >= tlevel); 2921 // AC: As we need to pass by the teams league, we need to artificially 2922 // increase ii 2923 if (ii == tlevel) { 2924 ii += 2; // three teams have same level 2925 } else { 2926 ii++; // two teams have same level 2927 } 2928 } 2929 } 2930 2931 if (ii == 
level) 2932 return __kmp_tid_from_gtid(gtid); 2933 2934 dd = team->t.t_serialized; 2935 level++; 2936 while (ii > level) { 2937 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2938 } 2939 if ((team->t.t_serialized) && (!dd)) { 2940 team = team->t.t_parent; 2941 continue; 2942 } 2943 if (ii > level) { 2944 team = team->t.t_parent; 2945 dd = team->t.t_serialized; 2946 ii--; 2947 } 2948 } 2949 2950 return (dd > 1) ? (0) : (team->t.t_master_tid); 2951 } 2952 2953 int __kmp_get_team_size(int gtid, int level) { 2954 2955 int ii, dd; 2956 kmp_team_t *team; 2957 kmp_info_t *thr; 2958 2959 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2960 KMP_DEBUG_ASSERT(__kmp_init_serial); 2961 2962 // validate level 2963 if (level == 0) 2964 return 1; 2965 if (level < 0) 2966 return -1; 2967 thr = __kmp_threads[gtid]; 2968 team = thr->th.th_team; 2969 ii = team->t.t_level; 2970 if (level > ii) 2971 return -1; 2972 2973 if (thr->th.th_teams_microtask) { 2974 // AC: we are in teams region where multiple nested teams have same level 2975 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2976 if (level <= 2977 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2978 KMP_DEBUG_ASSERT(ii >= tlevel); 2979 // AC: As we need to pass by the teams league, we need to artificially 2980 // increase ii 2981 if (ii == tlevel) { 2982 ii += 2; // three teams have same level 2983 } else { 2984 ii++; // two teams have same level 2985 } 2986 } 2987 } 2988 2989 while (ii > level) { 2990 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2991 } 2992 if (team->t.t_serialized && (!dd)) { 2993 team = team->t.t_parent; 2994 continue; 2995 } 2996 if (ii > level) { 2997 team = team->t.t_parent; 2998 ii--; 2999 } 3000 } 3001 3002 return team->t.t_nproc; 3003 } 3004 3005 kmp_r_sched_t __kmp_get_schedule_global() { 3006 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3007 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3008 // independently. So one can get the updated schedule here. 3009 3010 kmp_r_sched_t r_sched; 3011 3012 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3013 // __kmp_guided. __kmp_sched should keep original value, so that user can set 3014 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3015 // different roots (even in OMP 2.5) 3016 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 3017 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 3018 if (s == kmp_sch_static) { 3019 // replace STATIC with more detailed schedule (balanced or greedy) 3020 r_sched.r_sched_type = __kmp_static; 3021 } else if (s == kmp_sch_guided_chunked) { 3022 // replace GUIDED with more detailed schedule (iterative or analytical) 3023 r_sched.r_sched_type = __kmp_guided; 3024 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3025 r_sched.r_sched_type = __kmp_sched; 3026 } 3027 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3028 3029 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3030 // __kmp_chunk may be wrong here (if it was not ever set) 3031 r_sched.chunk = KMP_DEFAULT_CHUNK; 3032 } else { 3033 r_sched.chunk = __kmp_chunk; 3034 } 3035 3036 return r_sched; 3037 } 3038 3039 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3040 at least argc number of *t_argv entries for the requested team. 
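   Small argument lists (argc <= KMP_INLINE_ARGV_ENTRIES) reuse the team's
   inline t_inline_argv buffer; larger ones get a heap allocation of at least
   KMP_MIN_MALLOC_ARGV_ENTRIES (or 2 * argc) entries.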
*/ 3041 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3042 3043 KMP_DEBUG_ASSERT(team); 3044 if (!realloc || argc > team->t.t_max_argc) { 3045 3046 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3047 "current entries=%d\n", 3048 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3049 /* if previously allocated heap space for args, free them */ 3050 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3051 __kmp_free((void *)team->t.t_argv); 3052 3053 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3054 /* use unused space in the cache line for arguments */ 3055 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3056 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3057 "argv entries\n", 3058 team->t.t_id, team->t.t_max_argc)); 3059 team->t.t_argv = &team->t.t_inline_argv[0]; 3060 if (__kmp_storage_map) { 3061 __kmp_print_storage_map_gtid( 3062 -1, &team->t.t_inline_argv[0], 3063 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3064 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3065 team->t.t_id); 3066 } 3067 } else { 3068 /* allocate space for arguments in the heap */ 3069 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3070 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3071 : 2 * argc; 3072 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3073 "argv entries\n", 3074 team->t.t_id, team->t.t_max_argc)); 3075 team->t.t_argv = 3076 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3077 if (__kmp_storage_map) { 3078 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3079 &team->t.t_argv[team->t.t_max_argc], 3080 sizeof(void *) * team->t.t_max_argc, 3081 "team_%d.t_argv", team->t.t_id); 3082 } 3083 } 3084 } 3085 } 3086 3087 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3088 int i; 3089 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3090 team->t.t_threads = 3091 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3092 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3093 sizeof(dispatch_shared_info_t) * num_disp_buff); 3094 team->t.t_dispatch = 3095 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3096 team->t.t_implicit_task_taskdata = 3097 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3098 team->t.t_max_nproc = max_nth; 3099 3100 /* setup dispatch buffers */ 3101 for (i = 0; i < num_disp_buff; ++i) { 3102 team->t.t_disp_buffer[i].buffer_index = i; 3103 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3104 } 3105 } 3106 3107 static void __kmp_free_team_arrays(kmp_team_t *team) { 3108 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3109 int i; 3110 for (i = 0; i < team->t.t_max_nproc; ++i) { 3111 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3112 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3113 team->t.t_dispatch[i].th_disp_buffer = NULL; 3114 } 3115 } 3116 #if KMP_USE_HIER_SCHED 3117 __kmp_dispatch_free_hierarchies(team); 3118 #endif 3119 __kmp_free(team->t.t_threads); 3120 __kmp_free(team->t.t_disp_buffer); 3121 __kmp_free(team->t.t_dispatch); 3122 __kmp_free(team->t.t_implicit_task_taskdata); 3123 team->t.t_threads = NULL; 3124 team->t.t_disp_buffer = NULL; 3125 team->t.t_dispatch = NULL; 3126 team->t.t_implicit_task_taskdata = 0; 3127 } 3128 3129 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3130 kmp_info_t **oldThreads = team->t.t_threads; 3131 3132 __kmp_free(team->t.t_disp_buffer); 3133 __kmp_free(team->t.t_dispatch); 3134 __kmp_free(team->t.t_implicit_task_taskdata); 3135 __kmp_allocate_team_arrays(team, max_nth); 3136 3137 KMP_MEMCPY(team->t.t_threads, oldThreads, 3138 team->t.t_nproc * sizeof(kmp_info_t *)); 3139 3140 __kmp_free(oldThreads); 3141 } 3142 3143 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3144 3145 kmp_r_sched_t r_sched = 3146 __kmp_get_schedule_global(); // get current state of scheduling globals 3147 3148 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3149 3150 kmp_internal_control_t g_icvs = { 3151 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3152 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3153 // adjustment of threads (per thread) 3154 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3155 // whether blocktime is explicitly set 3156 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3157 #if KMP_USE_MONITOR 3158 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3159 // intervals 3160 #endif 3161 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3162 // next parallel region (per thread) 3163 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3164 __kmp_cg_max_nth, // int thread_limit; 3165 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3166 // for max_active_levels 3167 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3168 // {sched,chunk} pair 3169 __kmp_nested_proc_bind.bind_types[0], 3170 __kmp_default_device, 3171 NULL // struct kmp_internal_control *next; 3172 }; 3173 3174 return g_icvs; 3175 } 3176 3177 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3178 3179 kmp_internal_control_t gx_icvs; 3180 gx_icvs.serial_nesting_level = 3181 0; // probably =team->t.t_serial 
like in save_inter_controls 3182 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3183 gx_icvs.next = NULL; 3184 3185 return gx_icvs; 3186 } 3187 3188 static void __kmp_initialize_root(kmp_root_t *root) { 3189 int f; 3190 kmp_team_t *root_team; 3191 kmp_team_t *hot_team; 3192 int hot_team_max_nth; 3193 kmp_r_sched_t r_sched = 3194 __kmp_get_schedule_global(); // get current state of scheduling globals 3195 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3196 KMP_DEBUG_ASSERT(root); 3197 KMP_ASSERT(!root->r.r_begin); 3198 3199 /* setup the root state structure */ 3200 __kmp_init_lock(&root->r.r_begin_lock); 3201 root->r.r_begin = FALSE; 3202 root->r.r_active = FALSE; 3203 root->r.r_in_parallel = 0; 3204 root->r.r_blocktime = __kmp_dflt_blocktime; 3205 #if KMP_AFFINITY_SUPPORTED 3206 root->r.r_affinity_assigned = FALSE; 3207 #endif 3208 3209 /* setup the root team for this task */ 3210 /* allocate the root team structure */ 3211 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3212 3213 root_team = 3214 __kmp_allocate_team(root, 3215 1, // new_nproc 3216 1, // max_nproc 3217 #if OMPT_SUPPORT 3218 ompt_data_none, // root parallel id 3219 #endif 3220 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3221 0 // argc 3222 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3223 ); 3224 #if USE_DEBUGGER 3225 // Non-NULL value should be assigned to make the debugger display the root 3226 // team. 3227 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3228 #endif 3229 3230 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3231 3232 root->r.r_root_team = root_team; 3233 root_team->t.t_control_stack_top = NULL; 3234 3235 /* initialize root team */ 3236 root_team->t.t_threads[0] = NULL; 3237 root_team->t.t_nproc = 1; 3238 root_team->t.t_serialized = 1; 3239 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3240 root_team->t.t_sched.sched = r_sched.sched; 3241 KA_TRACE( 3242 20, 3243 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3244 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3245 3246 /* setup the hot team for this task */ 3247 /* allocate the hot team structure */ 3248 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3249 3250 hot_team = 3251 __kmp_allocate_team(root, 3252 1, // new_nproc 3253 __kmp_dflt_team_nth_ub * 2, // max_nproc 3254 #if OMPT_SUPPORT 3255 ompt_data_none, // root parallel id 3256 #endif 3257 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3258 0 // argc 3259 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3260 ); 3261 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3262 3263 root->r.r_hot_team = hot_team; 3264 root_team->t.t_control_stack_top = NULL; 3265 3266 /* first-time initialization */ 3267 hot_team->t.t_parent = root_team; 3268 3269 /* initialize hot team */ 3270 hot_team_max_nth = hot_team->t.t_max_nproc; 3271 for (f = 0; f < hot_team_max_nth; ++f) { 3272 hot_team->t.t_threads[f] = NULL; 3273 } 3274 hot_team->t.t_nproc = 1; 3275 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3276 hot_team->t.t_sched.sched = r_sched.sched; 3277 hot_team->t.t_size_changed = 0; 3278 } 3279 3280 #ifdef KMP_DEBUG 3281 3282 typedef struct kmp_team_list_item { 3283 kmp_team_p const *entry; 3284 struct kmp_team_list_item *next; 3285 } kmp_team_list_item_t; 3286 typedef kmp_team_list_item_t *kmp_team_list_t; 3287 3288 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3289 kmp_team_list_t list, // List of teams. 3290 kmp_team_p const *team // Team to add. 3291 ) { 3292 3293 // List must terminate with item where both entry and next are NULL. 3294 // Team is added to the list only once. 3295 // List is sorted in ascending order by team id. 3296 // Team id is *not* a key. 3297 3298 kmp_team_list_t l; 3299 3300 KMP_DEBUG_ASSERT(list != NULL); 3301 if (team == NULL) { 3302 return; 3303 } 3304 3305 __kmp_print_structure_team_accum(list, team->t.t_parent); 3306 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3307 3308 // Search list for the team. 3309 l = list; 3310 while (l->next != NULL && l->entry != team) { 3311 l = l->next; 3312 } 3313 if (l->next != NULL) { 3314 return; // Team has been added before, exit. 3315 } 3316 3317 // Team is not found. Search list again for insertion point. 3318 l = list; 3319 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3320 l = l->next; 3321 } 3322 3323 // Insert team. 3324 { 3325 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3326 sizeof(kmp_team_list_item_t)); 3327 *item = *l; 3328 l->entry = team; 3329 l->next = item; 3330 } 3331 } 3332 3333 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3334 3335 ) { 3336 __kmp_printf("%s", title); 3337 if (team != NULL) { 3338 __kmp_printf("%2x %p\n", team->t.t_id, team); 3339 } else { 3340 __kmp_printf(" - (nil)\n"); 3341 } 3342 } 3343 3344 static void __kmp_print_structure_thread(char const *title, 3345 kmp_info_p const *thread) { 3346 __kmp_printf("%s", title); 3347 if (thread != NULL) { 3348 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3349 } else { 3350 __kmp_printf(" - (nil)\n"); 3351 } 3352 } 3353 3354 void __kmp_print_structure(void) { 3355 3356 kmp_team_list_t list; 3357 3358 // Initialize list of teams. 3359 list = 3360 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3361 list->entry = NULL; 3362 list->next = NULL; 3363 3364 __kmp_printf("\n------------------------------\nGlobal Thread " 3365 "Table\n------------------------------\n"); 3366 { 3367 int gtid; 3368 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3369 __kmp_printf("%2d", gtid); 3370 if (__kmp_threads != NULL) { 3371 __kmp_printf(" %p", __kmp_threads[gtid]); 3372 } 3373 if (__kmp_root != NULL) { 3374 __kmp_printf(" %p", __kmp_root[gtid]); 3375 } 3376 __kmp_printf("\n"); 3377 } 3378 } 3379 3380 // Print out __kmp_threads array. 
3381 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3382 "----------\n"); 3383 if (__kmp_threads != NULL) { 3384 int gtid; 3385 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3386 kmp_info_t const *thread = __kmp_threads[gtid]; 3387 if (thread != NULL) { 3388 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3389 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3390 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3391 __kmp_print_structure_team(" Serial Team: ", 3392 thread->th.th_serial_team); 3393 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3394 __kmp_print_structure_thread(" Primary: ", 3395 thread->th.th_team_master); 3396 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3397 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3398 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3399 __kmp_print_structure_thread(" Next in pool: ", 3400 thread->th.th_next_pool); 3401 __kmp_printf("\n"); 3402 __kmp_print_structure_team_accum(list, thread->th.th_team); 3403 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3404 } 3405 } 3406 } else { 3407 __kmp_printf("Threads array is not allocated.\n"); 3408 } 3409 3410 // Print out __kmp_root array. 3411 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3412 "--------\n"); 3413 if (__kmp_root != NULL) { 3414 int gtid; 3415 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3416 kmp_root_t const *root = __kmp_root[gtid]; 3417 if (root != NULL) { 3418 __kmp_printf("GTID %2d %p:\n", gtid, root); 3419 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3420 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3421 __kmp_print_structure_thread(" Uber Thread: ", 3422 root->r.r_uber_thread); 3423 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3424 __kmp_printf(" In Parallel: %2d\n", 3425 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3426 __kmp_printf("\n"); 3427 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3428 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3429 } 3430 } 3431 } else { 3432 __kmp_printf("Ubers array is not allocated.\n"); 3433 } 3434 3435 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3436 "--------\n"); 3437 while (list->next != NULL) { 3438 kmp_team_p const *team = list->entry; 3439 int i; 3440 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3441 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3442 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3443 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3444 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3445 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3446 for (i = 0; i < team->t.t_nproc; ++i) { 3447 __kmp_printf(" Thread %2d: ", i); 3448 __kmp_print_structure_thread("", team->t.t_threads[i]); 3449 } 3450 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3451 __kmp_printf("\n"); 3452 list = list->next; 3453 } 3454 3455 // Print out __kmp_thread_pool and __kmp_team_pool. 3456 __kmp_printf("\n------------------------------\nPools\n----------------------" 3457 "--------\n"); 3458 __kmp_print_structure_thread("Thread pool: ", 3459 CCAST(kmp_info_t *, __kmp_thread_pool)); 3460 __kmp_print_structure_team("Team pool: ", 3461 CCAST(kmp_team_t *, __kmp_team_pool)); 3462 __kmp_printf("\n"); 3463 3464 // Free team list. 
3465 while (list != NULL) { 3466 kmp_team_list_item_t *item = list; 3467 list = list->next; 3468 KMP_INTERNAL_FREE(item); 3469 } 3470 } 3471 3472 #endif 3473 3474 //--------------------------------------------------------------------------- 3475 // Stuff for per-thread fast random number generator 3476 // Table of primes 3477 static const unsigned __kmp_primes[] = { 3478 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3479 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3480 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3481 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3482 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3483 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3484 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3485 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3486 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3487 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3488 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3489 3490 //--------------------------------------------------------------------------- 3491 // __kmp_get_random: Get a random number using a linear congruential method. 3492 unsigned short __kmp_get_random(kmp_info_t *thread) { 3493 unsigned x = thread->th.th_x; 3494 unsigned short r = (unsigned short)(x >> 16); 3495 3496 thread->th.th_x = x * thread->th.th_a + 1; 3497 3498 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3499 thread->th.th_info.ds.ds_tid, r)); 3500 3501 return r; 3502 } 3503 //-------------------------------------------------------- 3504 // __kmp_init_random: Initialize a random number generator 3505 void __kmp_init_random(kmp_info_t *thread) { 3506 unsigned seed = thread->th.th_info.ds.ds_tid; 3507 3508 thread->th.th_a = 3509 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3510 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3511 KA_TRACE(30, 3512 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3513 } 3514 3515 #if KMP_OS_WINDOWS 3516 /* reclaim array entries for root threads that are already dead, returns number 3517 * reclaimed */ 3518 static int __kmp_reclaim_dead_roots(void) { 3519 int i, r = 0; 3520 3521 for (i = 0; i < __kmp_threads_capacity; ++i) { 3522 if (KMP_UBER_GTID(i) && 3523 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3524 !__kmp_root[i] 3525 ->r.r_active) { // AC: reclaim only roots died in non-active state 3526 r += __kmp_unregister_root_other_thread(i); 3527 } 3528 } 3529 return r; 3530 } 3531 #endif 3532 3533 /* This function attempts to create free entries in __kmp_threads and 3534 __kmp_root, and returns the number of free entries generated. 3535 3536 For Windows* OS static library, the first mechanism used is to reclaim array 3537 entries for root threads that are already dead. 3538 3539 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3540 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3541 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3542 threadprivate cache array has been created. Synchronization with 3543 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3544 3545 After any dead root reclamation, if the clipping value allows array expansion 3546 to result in the generation of a total of nNeed free slots, the function does 3547 that expansion. If not, nothing is done beyond the possible initial root 3548 thread reclamation. 3549 3550 If any argument is negative, the behavior is undefined. */ 3551 static int __kmp_expand_threads(int nNeed) { 3552 int added = 0; 3553 int minimumRequiredCapacity; 3554 int newCapacity; 3555 kmp_info_t **newThreads; 3556 kmp_root_t **newRoot; 3557 3558 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3559 // resizing __kmp_threads does not need additional protection if foreign 3560 // threads are present 3561 3562 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3563 /* only for Windows static library */ 3564 /* reclaim array entries for root threads that are already dead */ 3565 added = __kmp_reclaim_dead_roots(); 3566 3567 if (nNeed) { 3568 nNeed -= added; 3569 if (nNeed < 0) 3570 nNeed = 0; 3571 } 3572 #endif 3573 if (nNeed <= 0) 3574 return added; 3575 3576 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3577 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3578 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3579 // > __kmp_max_nth in one of two ways: 3580 // 3581 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3582 // may not be reused by another thread, so we may need to increase 3583 // __kmp_threads_capacity to __kmp_max_nth + 1. 3584 // 3585 // 2) New foreign root(s) are encountered. We always register new foreign 3586 // roots. This may cause a smaller # of threads to be allocated at 3587 // subsequent parallel regions, but the worker threads hang around (and 3588 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3589 // 3590 // Anyway, that is the reason for moving the check to see if 3591 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3592 // instead of having it performed here. -BB 3593 3594 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3595 3596 /* compute expansion headroom to check if we can expand */ 3597 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3598 /* possible expansion too small -- give up */ 3599 return added; 3600 } 3601 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3602 3603 newCapacity = __kmp_threads_capacity; 3604 do { 3605 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3606 : __kmp_sys_max_nth; 3607 } while (newCapacity < minimumRequiredCapacity); 3608 newThreads = (kmp_info_t **)__kmp_allocate( 3609 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3610 newRoot = 3611 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3612 KMP_MEMCPY(newThreads, __kmp_threads, 3613 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3614 KMP_MEMCPY(newRoot, __kmp_root, 3615 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3616 3617 kmp_info_t **temp_threads = __kmp_threads; 3618 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3619 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3620 __kmp_free(temp_threads); 3621 added += newCapacity - __kmp_threads_capacity; 3622 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3623 3624 if (newCapacity > __kmp_tp_capacity) { 3625 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3626 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3627 __kmp_threadprivate_resize_cache(newCapacity); 3628 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3629 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3630 } 3631 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3632 } 3633 3634 return added; 3635 } 3636 3637 /* Register the current thread as a root thread and obtain our gtid. We must 3638 have the __kmp_initz_lock held at this point. Argument TRUE only if we are the 3639 thread that calls from __kmp_do_serial_initialize() */ 3640 int __kmp_register_root(int initial_thread) { 3641 kmp_info_t *root_thread; 3642 kmp_root_t *root; 3643 int gtid; 3644 int capacity; 3645 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3646 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3647 KMP_MB(); 3648 3649 /* 2007-03-02: 3650 If the initial thread did not invoke the OpenMP RTL yet, and this thread is not 3651 an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3652 work as expected -- it may return false (that means there is at least one 3653 empty slot in the __kmp_threads array), but it is possible the only free slot 3654 is #0, which is reserved for the initial thread and so cannot be used for this 3655 one. The following code works around this bug. 3656 3657 However, the right solution seems to be not reserving slot #0 for the initial 3658 thread, because: 3659 (1) there is no magic in slot #0, 3660 (2) we cannot detect the initial thread reliably (the first thread that does 3661 serial initialization may not be a real initial thread). 3662 */ 3663 capacity = __kmp_threads_capacity; 3664 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3665 --capacity; 3666 } 3667 3668 // If this call is not for initializing the hidden helper team, we need to take 3669 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3670 // in __kmp_threads_capacity.
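// Illustrative sketch (not part of the runtime and never compiled): the
// capacity accounting around this point, using hypothetical values. Assume
// __kmp_threads_capacity == 64, __kmp_hidden_helper_threads_num == 8, and a
// non-initial thread registering while slot 0 is still reserved for the
// initial thread.
#if 0
  int example_capacity = 64; // hypothetical __kmp_threads_capacity
  example_capacity -= 1; // slot 0 stays reserved for the initial thread
  example_capacity -= 8; // hidden helper slots are subtracted just below
  // With these numbers, 55 regular slots remain, so __kmp_expand_threads(1)
  // is attempted once __kmp_all_nth reaches 55.
#endif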
3671 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3672 capacity -= __kmp_hidden_helper_threads_num; 3673 } 3674 3675 /* see if there are too many threads */ 3676 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3677 if (__kmp_tp_cached) { 3678 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3679 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3680 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3681 } else { 3682 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3683 __kmp_msg_null); 3684 } 3685 } 3686 3687 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3688 // 0: initial thread, also a regular OpenMP thread. 3689 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3690 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3691 // regular OpenMP threads. 3692 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3693 // Find an available thread slot for hidden helper thread. Slots for hidden 3694 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3695 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3696 gtid <= __kmp_hidden_helper_threads_num; 3697 gtid++) 3698 ; 3699 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3700 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3701 "hidden helper thread: T#%d\n", 3702 gtid)); 3703 } else { 3704 /* find an available thread slot */ 3705 // Don't reassign the zero slot since we need that to only be used by 3706 // initial thread. Slots for hidden helper threads should also be skipped. 3707 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3708 gtid = 0; 3709 } else { 3710 for (gtid = __kmp_hidden_helper_threads_num + 1; 3711 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3712 ; 3713 } 3714 KA_TRACE( 3715 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3716 KMP_ASSERT(gtid < __kmp_threads_capacity); 3717 } 3718 3719 /* update global accounting */ 3720 __kmp_all_nth++; 3721 TCW_4(__kmp_nth, __kmp_nth + 1); 3722 3723 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3724 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3725 if (__kmp_adjust_gtid_mode) { 3726 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3727 if (TCR_4(__kmp_gtid_mode) != 2) { 3728 TCW_4(__kmp_gtid_mode, 2); 3729 } 3730 } else { 3731 if (TCR_4(__kmp_gtid_mode) != 1) { 3732 TCW_4(__kmp_gtid_mode, 1); 3733 } 3734 } 3735 } 3736 3737 #ifdef KMP_ADJUST_BLOCKTIME 3738 /* Adjust blocktime to zero if necessary */ 3739 /* Middle initialization might not have occurred yet */ 3740 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3741 if (__kmp_nth > __kmp_avail_proc) { 3742 __kmp_zero_bt = TRUE; 3743 } 3744 } 3745 #endif /* KMP_ADJUST_BLOCKTIME */ 3746 3747 /* setup this new hierarchy */ 3748 if (!(root = __kmp_root[gtid])) { 3749 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3750 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3751 } 3752 3753 #if KMP_STATS_ENABLED 3754 // Initialize stats as soon as possible (right after gtid assignment). 
3755 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3756 __kmp_stats_thread_ptr->startLife(); 3757 KMP_SET_THREAD_STATE(SERIAL_REGION); 3758 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3759 #endif 3760 __kmp_initialize_root(root); 3761 3762 /* setup new root thread structure */ 3763 if (root->r.r_uber_thread) { 3764 root_thread = root->r.r_uber_thread; 3765 } else { 3766 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3767 if (__kmp_storage_map) { 3768 __kmp_print_thread_storage_map(root_thread, gtid); 3769 } 3770 root_thread->th.th_info.ds.ds_gtid = gtid; 3771 #if OMPT_SUPPORT 3772 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3773 #endif 3774 root_thread->th.th_root = root; 3775 if (__kmp_env_consistency_check) { 3776 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3777 } 3778 #if USE_FAST_MEMORY 3779 __kmp_initialize_fast_memory(root_thread); 3780 #endif /* USE_FAST_MEMORY */ 3781 3782 #if KMP_USE_BGET 3783 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3784 __kmp_initialize_bget(root_thread); 3785 #endif 3786 __kmp_init_random(root_thread); // Initialize random number generator 3787 } 3788 3789 /* setup the serial team held in reserve by the root thread */ 3790 if (!root_thread->th.th_serial_team) { 3791 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3792 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3793 root_thread->th.th_serial_team = __kmp_allocate_team( 3794 root, 1, 1, 3795 #if OMPT_SUPPORT 3796 ompt_data_none, // root parallel id 3797 #endif 3798 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3799 } 3800 KMP_ASSERT(root_thread->th.th_serial_team); 3801 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3802 root_thread->th.th_serial_team)); 3803 3804 /* drop root_thread into place */ 3805 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3806 3807 root->r.r_root_team->t.t_threads[0] = root_thread; 3808 root->r.r_hot_team->t.t_threads[0] = root_thread; 3809 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3810 // AC: the team created in reserve, not for execution (it is unused for now). 3811 root_thread->th.th_serial_team->t.t_serialized = 0; 3812 root->r.r_uber_thread = root_thread; 3813 3814 /* initialize the thread, get it ready to go */ 3815 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3816 TCW_4(__kmp_init_gtid, TRUE); 3817 3818 /* prepare the primary thread for get_gtid() */ 3819 __kmp_gtid_set_specific(gtid); 3820 3821 #if USE_ITT_BUILD 3822 __kmp_itt_thread_name(gtid); 3823 #endif /* USE_ITT_BUILD */ 3824 3825 #ifdef KMP_TDATA_GTID 3826 __kmp_gtid = gtid; 3827 #endif 3828 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3829 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3830 3831 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3832 "plain=%u\n", 3833 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3834 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3835 KMP_INIT_BARRIER_STATE)); 3836 { // Initialize barrier data. 
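// Each barrier type on the root thread starts from KMP_INIT_BARRIER_STATE,
// so its arrival counters start out consistent with the freshly allocated
// hot team (see the KMP_DEBUG_ASSERT on b_arrived right after this block).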
3837 int b; 3838 for (b = 0; b < bs_last_barrier; ++b) { 3839 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3840 #if USE_DEBUGGER 3841 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3842 #endif 3843 } 3844 } 3845 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3846 KMP_INIT_BARRIER_STATE); 3847 3848 #if KMP_AFFINITY_SUPPORTED 3849 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3850 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3851 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3852 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3853 #endif /* KMP_AFFINITY_SUPPORTED */ 3854 root_thread->th.th_def_allocator = __kmp_def_allocator; 3855 root_thread->th.th_prev_level = 0; 3856 root_thread->th.th_prev_num_threads = 1; 3857 3858 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3859 tmp->cg_root = root_thread; 3860 tmp->cg_thread_limit = __kmp_cg_max_nth; 3861 tmp->cg_nthreads = 1; 3862 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3863 " cg_nthreads init to 1\n", 3864 root_thread, tmp)); 3865 tmp->up = NULL; 3866 root_thread->th.th_cg_roots = tmp; 3867 3868 __kmp_root_counter++; 3869 3870 #if OMPT_SUPPORT 3871 if (!initial_thread && ompt_enabled.enabled) { 3872 3873 kmp_info_t *root_thread = ompt_get_thread(); 3874 3875 ompt_set_thread_state(root_thread, ompt_state_overhead); 3876 3877 if (ompt_enabled.ompt_callback_thread_begin) { 3878 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3879 ompt_thread_initial, __ompt_get_thread_data_internal()); 3880 } 3881 ompt_data_t *task_data; 3882 ompt_data_t *parallel_data; 3883 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3884 NULL); 3885 if (ompt_enabled.ompt_callback_implicit_task) { 3886 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3887 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3888 } 3889 3890 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3891 } 3892 #endif 3893 #if OMPD_SUPPORT 3894 if (ompd_state & OMPD_ENABLE_BP) 3895 ompd_bp_thread_begin(); 3896 #endif 3897 3898 KMP_MB(); 3899 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3900 3901 return gtid; 3902 } 3903 3904 #if KMP_NESTED_HOT_TEAMS 3905 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3906 const int max_level) { 3907 int i, n, nth; 3908 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3909 if (!hot_teams || !hot_teams[level].hot_team) { 3910 return 0; 3911 } 3912 KMP_DEBUG_ASSERT(level < max_level); 3913 kmp_team_t *team = hot_teams[level].hot_team; 3914 nth = hot_teams[level].hot_team_nth; 3915 n = nth - 1; // primary thread is not freed 3916 if (level < max_level - 1) { 3917 for (i = 0; i < nth; ++i) { 3918 kmp_info_t *th = team->t.t_threads[i]; 3919 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3920 if (i > 0 && th->th.th_hot_teams) { 3921 __kmp_free(th->th.th_hot_teams); 3922 th->th.th_hot_teams = NULL; 3923 } 3924 } 3925 } 3926 __kmp_free_team(root, team, NULL); 3927 return n; 3928 } 3929 #endif 3930 3931 // Resets a root thread and clear its root and hot teams. 3932 // Returns the number of __kmp_threads entries directly and indirectly freed. 
3933 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3934 kmp_team_t *root_team = root->r.r_root_team; 3935 kmp_team_t *hot_team = root->r.r_hot_team; 3936 int n = hot_team->t.t_nproc; 3937 int i; 3938 3939 KMP_DEBUG_ASSERT(!root->r.r_active); 3940 3941 root->r.r_root_team = NULL; 3942 root->r.r_hot_team = NULL; 3943 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3944 // before call to __kmp_free_team(). 3945 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3946 #if KMP_NESTED_HOT_TEAMS 3947 if (__kmp_hot_teams_max_level > 3948 0) { // need to free nested hot teams and their threads if any 3949 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3950 kmp_info_t *th = hot_team->t.t_threads[i]; 3951 if (__kmp_hot_teams_max_level > 1) { 3952 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3953 } 3954 if (th->th.th_hot_teams) { 3955 __kmp_free(th->th.th_hot_teams); 3956 th->th.th_hot_teams = NULL; 3957 } 3958 } 3959 } 3960 #endif 3961 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3962 3963 // Before we can reap the thread, we need to make certain that all other 3964 // threads in the teams that had this root as ancestor have stopped trying to 3965 // steal tasks. 3966 if (__kmp_tasking_mode != tskm_immediate_exec) { 3967 __kmp_wait_to_unref_task_teams(); 3968 } 3969 3970 #if KMP_OS_WINDOWS 3971 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3972 KA_TRACE( 3973 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3974 "\n", 3975 (LPVOID) & (root->r.r_uber_thread->th), 3976 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3977 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3978 #endif /* KMP_OS_WINDOWS */ 3979 3980 #if OMPD_SUPPORT 3981 if (ompd_state & OMPD_ENABLE_BP) 3982 ompd_bp_thread_end(); 3983 #endif 3984 3985 #if OMPT_SUPPORT 3986 ompt_data_t *task_data; 3987 ompt_data_t *parallel_data; 3988 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, 3989 NULL); 3990 if (ompt_enabled.ompt_callback_implicit_task) { 3991 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3992 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3993 } 3994 if (ompt_enabled.ompt_callback_thread_end) { 3995 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3996 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3997 } 3998 #endif 3999 4000 TCW_4(__kmp_nth, 4001 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 4002 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 4003 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 4004 " to %d\n", 4005 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 4006 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 4007 if (i == 1) { 4008 // need to free contention group structure 4009 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 4010 root->r.r_uber_thread->th.th_cg_roots->cg_root); 4011 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 4012 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 4013 root->r.r_uber_thread->th.th_cg_roots = NULL; 4014 } 4015 __kmp_reap_thread(root->r.r_uber_thread, 1); 4016 4017 // We canot put root thread to __kmp_thread_pool, so we have to reap it 4018 // instead of freeing. 
4019 root->r.r_uber_thread = NULL; 4020 /* mark root as no longer in use */ 4021 root->r.r_begin = FALSE; 4022 4023 return n; 4024 } 4025 4026 void __kmp_unregister_root_current_thread(int gtid) { 4027 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4028 /* this lock should be ok, since unregister_root_current_thread is never 4029 called during an abort, only during a normal close. furthermore, if you 4030 have the forkjoin lock, you should never try to get the initz lock */ 4031 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4032 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4033 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4034 "exiting T#%d\n", 4035 gtid)); 4036 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4037 return; 4038 } 4039 kmp_root_t *root = __kmp_root[gtid]; 4040 4041 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4042 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4043 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4044 KMP_ASSERT(root->r.r_active == FALSE); 4045 4046 KMP_MB(); 4047 4048 kmp_info_t *thread = __kmp_threads[gtid]; 4049 kmp_team_t *team = thread->th.th_team; 4050 kmp_task_team_t *task_team = thread->th.th_task_team; 4051 4052 // we need to wait for the proxy tasks before finishing the thread 4053 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4054 #if OMPT_SUPPORT 4055 // the runtime is shutting down so we won't report any events 4056 thread->th.ompt_thread_info.state = ompt_state_undefined; 4057 #endif 4058 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4059 } 4060 4061 __kmp_reset_root(gtid, root); 4062 4063 KMP_MB(); 4064 KC_TRACE(10, 4065 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4066 4067 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4068 } 4069 4070 #if KMP_OS_WINDOWS 4071 /* __kmp_forkjoin_lock must be already held 4072 Unregisters a root thread that is not the current thread. Returns the number 4073 of __kmp_threads entries freed as a result. 
*/ 4074 static int __kmp_unregister_root_other_thread(int gtid) { 4075 kmp_root_t *root = __kmp_root[gtid]; 4076 int r; 4077 4078 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4079 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4080 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4081 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4082 KMP_ASSERT(root->r.r_active == FALSE); 4083 4084 r = __kmp_reset_root(gtid, root); 4085 KC_TRACE(10, 4086 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4087 return r; 4088 } 4089 #endif 4090 4091 #if KMP_DEBUG 4092 void __kmp_task_info() { 4093 4094 kmp_int32 gtid = __kmp_entry_gtid(); 4095 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4096 kmp_info_t *this_thr = __kmp_threads[gtid]; 4097 kmp_team_t *steam = this_thr->th.th_serial_team; 4098 kmp_team_t *team = this_thr->th.th_team; 4099 4100 __kmp_printf( 4101 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4102 "ptask=%p\n", 4103 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4104 team->t.t_implicit_task_taskdata[tid].td_parent); 4105 } 4106 #endif // KMP_DEBUG 4107 4108 /* TODO optimize with one big memclr, take out what isn't needed, split 4109 responsibility to workers as much as possible, and delay initialization of 4110 features as much as possible */ 4111 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4112 int tid, int gtid) { 4113 /* this_thr->th.th_info.ds.ds_gtid is setup in 4114 kmp_allocate_thread/create_worker. 4115 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4116 KMP_DEBUG_ASSERT(this_thr != NULL); 4117 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4118 KMP_DEBUG_ASSERT(team); 4119 KMP_DEBUG_ASSERT(team->t.t_threads); 4120 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4121 kmp_info_t *master = team->t.t_threads[0]; 4122 KMP_DEBUG_ASSERT(master); 4123 KMP_DEBUG_ASSERT(master->th.th_root); 4124 4125 KMP_MB(); 4126 4127 TCW_SYNC_PTR(this_thr->th.th_team, team); 4128 4129 this_thr->th.th_info.ds.ds_tid = tid; 4130 this_thr->th.th_set_nproc = 0; 4131 if (__kmp_tasking_mode != tskm_immediate_exec) 4132 // When tasking is possible, threads are not safe to reap until they are 4133 // done tasking; this will be set when tasking code is exited in wait 4134 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4135 else // no tasking --> always safe to reap 4136 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4137 this_thr->th.th_set_proc_bind = proc_bind_default; 4138 #if KMP_AFFINITY_SUPPORTED 4139 this_thr->th.th_new_place = this_thr->th.th_current_place; 4140 #endif 4141 this_thr->th.th_root = master->th.th_root; 4142 4143 /* setup the thread's cache of the team structure */ 4144 this_thr->th.th_team_nproc = team->t.t_nproc; 4145 this_thr->th.th_team_master = master; 4146 this_thr->th.th_team_serialized = team->t.t_serialized; 4147 4148 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4149 4150 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4151 tid, gtid, this_thr, this_thr->th.th_current_task)); 4152 4153 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4154 team, tid, TRUE); 4155 4156 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4157 tid, gtid, this_thr, this_thr->th.th_current_task)); 4158 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4159 // __kmp_initialize_team()? 
4160 4161 /* TODO no worksharing in speculative threads */ 4162 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4163 4164 this_thr->th.th_local.this_construct = 0; 4165 4166 if (!this_thr->th.th_pri_common) { 4167 this_thr->th.th_pri_common = 4168 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4169 if (__kmp_storage_map) { 4170 __kmp_print_storage_map_gtid( 4171 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4172 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4173 } 4174 this_thr->th.th_pri_head = NULL; 4175 } 4176 4177 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4178 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4179 // Make new thread's CG root same as primary thread's 4180 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4181 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4182 if (tmp) { 4183 // worker changes CG, need to check if old CG should be freed 4184 int i = tmp->cg_nthreads--; 4185 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4186 " on node %p of thread %p to %d\n", 4187 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4188 if (i == 1) { 4189 __kmp_free(tmp); // last thread left CG --> free it 4190 } 4191 } 4192 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4193 // Increment new thread's CG root's counter to add the new thread 4194 this_thr->th.th_cg_roots->cg_nthreads++; 4195 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4196 " node %p of thread %p to %d\n", 4197 this_thr, this_thr->th.th_cg_roots, 4198 this_thr->th.th_cg_roots->cg_root, 4199 this_thr->th.th_cg_roots->cg_nthreads)); 4200 this_thr->th.th_current_task->td_icvs.thread_limit = 4201 this_thr->th.th_cg_roots->cg_thread_limit; 4202 } 4203 4204 /* Initialize dynamic dispatch */ 4205 { 4206 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4207 // Use team max_nproc since this will never change for the team. 4208 size_t disp_size = 4209 sizeof(dispatch_private_info_t) * 4210 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4211 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4212 team->t.t_max_nproc)); 4213 KMP_ASSERT(dispatch); 4214 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4215 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4216 4217 dispatch->th_disp_index = 0; 4218 dispatch->th_doacross_buf_idx = 0; 4219 if (!dispatch->th_disp_buffer) { 4220 dispatch->th_disp_buffer = 4221 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4222 4223 if (__kmp_storage_map) { 4224 __kmp_print_storage_map_gtid( 4225 gtid, &dispatch->th_disp_buffer[0], 4226 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4227 ? 
1 4228 : __kmp_dispatch_num_buffers], 4229 disp_size, 4230 "th_%d.th_dispatch.th_disp_buffer " 4231 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4232 gtid, team->t.t_id, gtid); 4233 } 4234 } else { 4235 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4236 } 4237 4238 dispatch->th_dispatch_pr_current = 0; 4239 dispatch->th_dispatch_sh_current = 0; 4240 4241 dispatch->th_deo_fcn = 0; /* ORDERED */ 4242 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4243 } 4244 4245 this_thr->th.th_next_pool = NULL; 4246 4247 if (!this_thr->th.th_task_state_memo_stack) { 4248 size_t i; 4249 this_thr->th.th_task_state_memo_stack = 4250 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4251 this_thr->th.th_task_state_top = 0; 4252 this_thr->th.th_task_state_stack_sz = 4; 4253 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4254 ++i) // zero init the stack 4255 this_thr->th.th_task_state_memo_stack[i] = 0; 4256 } 4257 4258 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4259 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4260 4261 KMP_MB(); 4262 } 4263 4264 /* allocate a new thread for the requesting team. this is only called from 4265 within a forkjoin critical section. we will first try to get an available 4266 thread from the thread pool. if none is available, we will fork a new one 4267 assuming we are able to create a new one. this should be assured, as the 4268 caller should check on this first. */ 4269 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4270 int new_tid) { 4271 kmp_team_t *serial_team; 4272 kmp_info_t *new_thr; 4273 int new_gtid; 4274 4275 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4276 KMP_DEBUG_ASSERT(root && team); 4277 #if !KMP_NESTED_HOT_TEAMS 4278 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4279 #endif 4280 KMP_MB(); 4281 4282 /* first, try to get one from the thread pool */ 4283 if (__kmp_thread_pool) { 4284 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4285 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4286 if (new_thr == __kmp_thread_pool_insert_pt) { 4287 __kmp_thread_pool_insert_pt = NULL; 4288 } 4289 TCW_4(new_thr->th.th_in_pool, FALSE); 4290 __kmp_suspend_initialize_thread(new_thr); 4291 __kmp_lock_suspend_mx(new_thr); 4292 if (new_thr->th.th_active_in_pool == TRUE) { 4293 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4294 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4295 new_thr->th.th_active_in_pool = FALSE; 4296 } 4297 __kmp_unlock_suspend_mx(new_thr); 4298 4299 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4300 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4301 KMP_ASSERT(!new_thr->th.th_team); 4302 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4303 4304 /* setup the thread structure */ 4305 __kmp_initialize_info(new_thr, team, new_tid, 4306 new_thr->th.th_info.ds.ds_gtid); 4307 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4308 4309 TCW_4(__kmp_nth, __kmp_nth + 1); 4310 4311 new_thr->th.th_task_state = 0; 4312 new_thr->th.th_task_state_top = 0; 4313 new_thr->th.th_task_state_stack_sz = 4; 4314 4315 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 4316 // Make sure pool thread has transitioned to waiting on own thread struct 4317 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); 4318 // Thread activated in __kmp_allocate_team when increasing team size 4319 } 4320 4321 #ifdef KMP_ADJUST_BLOCKTIME 4322 /* Adjust blocktime back to zero if necessary */ 4323 /* Middle initialization might not have occurred yet */ 4324 
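// For illustration (hypothetical numbers): if KMP_BLOCKTIME was not set in
// the environment and __kmp_avail_proc == 8, then taking a 9th thread makes
// __kmp_nth exceed __kmp_avail_proc, so __kmp_zero_bt is set and idle
// workers go to sleep immediately instead of spinning for the blocktime
// interval.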
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4325 if (__kmp_nth > __kmp_avail_proc) { 4326 __kmp_zero_bt = TRUE; 4327 } 4328 } 4329 #endif /* KMP_ADJUST_BLOCKTIME */ 4330 4331 #if KMP_DEBUG 4332 // If thread entered pool via __kmp_free_thread, wait_flag should != 4333 // KMP_BARRIER_PARENT_FLAG. 4334 int b; 4335 kmp_balign_t *balign = new_thr->th.th_bar; 4336 for (b = 0; b < bs_last_barrier; ++b) 4337 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4338 #endif 4339 4340 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4341 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4342 4343 KMP_MB(); 4344 return new_thr; 4345 } 4346 4347 /* no, well fork a new one */ 4348 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4349 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4350 4351 #if KMP_USE_MONITOR 4352 // If this is the first worker thread the RTL is creating, then also 4353 // launch the monitor thread. We try to do this as early as possible. 4354 if (!TCR_4(__kmp_init_monitor)) { 4355 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4356 if (!TCR_4(__kmp_init_monitor)) { 4357 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4358 TCW_4(__kmp_init_monitor, 1); 4359 __kmp_create_monitor(&__kmp_monitor); 4360 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4361 #if KMP_OS_WINDOWS 4362 // AC: wait until monitor has started. This is a fix for CQ232808. 4363 // The reason is that if the library is loaded/unloaded in a loop with 4364 // small (parallel) work in between, then there is high probability that 4365 // monitor thread started after the library shutdown. At shutdown it is 4366 // too late to cope with the problem, because when the primary thread is 4367 // in DllMain (process detach) the monitor has no chances to start (it is 4368 // blocked), and primary thread has no means to inform the monitor that 4369 // the library has gone, because all the memory which the monitor can 4370 // access is going to be released/reset. 4371 while (TCR_4(__kmp_init_monitor) < 2) { 4372 KMP_YIELD(TRUE); 4373 } 4374 KF_TRACE(10, ("after monitor thread has started\n")); 4375 #endif 4376 } 4377 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4378 } 4379 #endif 4380 4381 KMP_MB(); 4382 4383 { 4384 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4385 ? 1 4386 : __kmp_hidden_helper_threads_num + 1; 4387 4388 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4389 ++new_gtid) { 4390 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4391 } 4392 4393 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4394 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4395 } 4396 } 4397 4398 /* allocate space for it. 
*/ 4399 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4400 4401 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4402 4403 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4404 // suppress race conditions detection on synchronization flags in debug mode 4405 // this helps to analyze library internals eliminating false positives 4406 __itt_suppress_mark_range( 4407 __itt_suppress_range, __itt_suppress_threading_errors, 4408 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4409 __itt_suppress_mark_range( 4410 __itt_suppress_range, __itt_suppress_threading_errors, 4411 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4412 #if KMP_OS_WINDOWS 4413 __itt_suppress_mark_range( 4414 __itt_suppress_range, __itt_suppress_threading_errors, 4415 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4416 #else 4417 __itt_suppress_mark_range(__itt_suppress_range, 4418 __itt_suppress_threading_errors, 4419 &new_thr->th.th_suspend_init_count, 4420 sizeof(new_thr->th.th_suspend_init_count)); 4421 #endif 4422 // TODO: check if we need to also suppress b_arrived flags 4423 __itt_suppress_mark_range(__itt_suppress_range, 4424 __itt_suppress_threading_errors, 4425 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4426 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4427 __itt_suppress_mark_range(__itt_suppress_range, 4428 __itt_suppress_threading_errors, 4429 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4430 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4431 __itt_suppress_mark_range(__itt_suppress_range, 4432 __itt_suppress_threading_errors, 4433 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4434 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4435 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4436 if (__kmp_storage_map) { 4437 __kmp_print_thread_storage_map(new_thr, new_gtid); 4438 } 4439 4440 // add the reserve serialized team, initialized from the team's primary thread 4441 { 4442 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4443 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4444 new_thr->th.th_serial_team = serial_team = 4445 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4446 #if OMPT_SUPPORT 4447 ompt_data_none, // root parallel id 4448 #endif 4449 proc_bind_default, &r_icvs, 4450 0 USE_NESTED_HOT_ARG(NULL)); 4451 } 4452 KMP_ASSERT(serial_team); 4453 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4454 // execution (it is unused for now). 
4455 serial_team->t.t_threads[0] = new_thr; 4456 KF_TRACE(10, 4457 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4458 new_thr)); 4459 4460 /* setup the thread structures */ 4461 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4462 4463 #if USE_FAST_MEMORY 4464 __kmp_initialize_fast_memory(new_thr); 4465 #endif /* USE_FAST_MEMORY */ 4466 4467 #if KMP_USE_BGET 4468 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4469 __kmp_initialize_bget(new_thr); 4470 #endif 4471 4472 __kmp_init_random(new_thr); // Initialize random number generator 4473 4474 /* Initialize these only once when thread is grabbed for a team allocation */ 4475 KA_TRACE(20, 4476 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4477 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4478 4479 int b; 4480 kmp_balign_t *balign = new_thr->th.th_bar; 4481 for (b = 0; b < bs_last_barrier; ++b) { 4482 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4483 balign[b].bb.team = NULL; 4484 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4485 balign[b].bb.use_oncore_barrier = 0; 4486 } 4487 4488 TCW_PTR(new_thr->th.th_sleep_loc, NULL); 4489 new_thr->th.th_sleep_loc_type = flag_unset; 4490 4491 new_thr->th.th_spin_here = FALSE; 4492 new_thr->th.th_next_waiting = 0; 4493 #if KMP_OS_UNIX 4494 new_thr->th.th_blocking = false; 4495 #endif 4496 4497 #if KMP_AFFINITY_SUPPORTED 4498 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4499 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4500 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4501 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4502 #endif 4503 new_thr->th.th_def_allocator = __kmp_def_allocator; 4504 new_thr->th.th_prev_level = 0; 4505 new_thr->th.th_prev_num_threads = 1; 4506 4507 TCW_4(new_thr->th.th_in_pool, FALSE); 4508 new_thr->th.th_active_in_pool = FALSE; 4509 TCW_4(new_thr->th.th_active, TRUE); 4510 4511 /* adjust the global counters */ 4512 __kmp_all_nth++; 4513 __kmp_nth++; 4514 4515 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4516 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4517 if (__kmp_adjust_gtid_mode) { 4518 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4519 if (TCR_4(__kmp_gtid_mode) != 2) { 4520 TCW_4(__kmp_gtid_mode, 2); 4521 } 4522 } else { 4523 if (TCR_4(__kmp_gtid_mode) != 1) { 4524 TCW_4(__kmp_gtid_mode, 1); 4525 } 4526 } 4527 } 4528 4529 #ifdef KMP_ADJUST_BLOCKTIME 4530 /* Adjust blocktime back to zero if necessary */ 4531 /* Middle initialization might not have occurred yet */ 4532 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4533 if (__kmp_nth > __kmp_avail_proc) { 4534 __kmp_zero_bt = TRUE; 4535 } 4536 } 4537 #endif /* KMP_ADJUST_BLOCKTIME */ 4538 4539 /* actually fork it and create the new worker thread */ 4540 KF_TRACE( 4541 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4542 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4543 KF_TRACE(10, 4544 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4545 4546 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4547 new_gtid)); 4548 KMP_MB(); 4549 return new_thr; 4550 } 4551 4552 /* Reinitialize team for reuse. 4553 The hot team code calls this case at every fork barrier, so EPCC barrier 4554 test are extremely sensitive to changes in it, esp. writes to the team 4555 struct, which cause a cache invalidation in all threads. 
4556 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ 4557 static void __kmp_reinitialize_team(kmp_team_t *team, 4558 kmp_internal_control_t *new_icvs, 4559 ident_t *loc) { 4560 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4561 team->t.t_threads[0], team)); 4562 KMP_DEBUG_ASSERT(team && new_icvs); 4563 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4564 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4565 4566 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4567 // Copy ICVs to the primary thread's implicit taskdata 4568 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4569 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4570 4571 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4572 team->t.t_threads[0], team)); 4573 } 4574 4575 /* Initialize the team data structure. 4576 This assumes the t_threads and t_max_nproc are already set. 4577 Also, we don't touch the arguments */ 4578 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4579 kmp_internal_control_t *new_icvs, 4580 ident_t *loc) { 4581 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4582 4583 /* verify */ 4584 KMP_DEBUG_ASSERT(team); 4585 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4586 KMP_DEBUG_ASSERT(team->t.t_threads); 4587 KMP_MB(); 4588 4589 team->t.t_master_tid = 0; /* not needed */ 4590 /* team->t.t_master_bar; not needed */ 4591 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4592 team->t.t_nproc = new_nproc; 4593 4594 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4595 team->t.t_next_pool = NULL; 4596 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4597 * up hot team */ 4598 4599 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4600 team->t.t_invoke = NULL; /* not needed */ 4601 4602 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4603 team->t.t_sched.sched = new_icvs->sched.sched; 4604 4605 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4606 team->t.t_fp_control_saved = FALSE; /* not needed */ 4607 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4608 team->t.t_mxcsr = 0; /* not needed */ 4609 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4610 4611 team->t.t_construct = 0; 4612 4613 team->t.t_ordered.dt.t_value = 0; 4614 team->t.t_master_active = FALSE; 4615 4616 #ifdef KMP_DEBUG 4617 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4618 #endif 4619 #if KMP_OS_WINDOWS 4620 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4621 #endif 4622 4623 team->t.t_control_stack_top = NULL; 4624 4625 __kmp_reinitialize_team(team, new_icvs, loc); 4626 4627 KMP_MB(); 4628 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4629 } 4630 4631 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4632 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/ 4633 static void 4634 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4635 if (KMP_AFFINITY_CAPABLE()) { 4636 int status; 4637 if (old_mask != NULL) { 4638 status = __kmp_get_system_affinity(old_mask, TRUE); 4639 int error = errno; 4640 if (status != 0) { 4641 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4642 __kmp_msg_null); 4643 } 4644 } 4645 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4646 } 4647 } 4648 #endif 4649 4650 #if KMP_AFFINITY_SUPPORTED 4651 4652 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4653 // It calculates the worker + primary thread's partition based upon the parent 4654 // thread's partition, and binds each worker to a thread in their partition. 4655 // The primary thread's partition should already include its current binding. 4656 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4657 // Do not partition places for the hidden helper team 4658 if (KMP_HIDDEN_HELPER_TEAM(team)) 4659 return; 4660 // Copy the primary thread's place partition to the team struct 4661 kmp_info_t *master_th = team->t.t_threads[0]; 4662 KMP_DEBUG_ASSERT(master_th != NULL); 4663 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4664 int first_place = master_th->th.th_first_place; 4665 int last_place = master_th->th.th_last_place; 4666 int masters_place = master_th->th.th_current_place; 4667 team->t.t_first_place = first_place; 4668 team->t.t_last_place = last_place; 4669 4670 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4671 "bound to place %d partition = [%d,%d]\n", 4672 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4673 team->t.t_id, masters_place, first_place, last_place)); 4674 4675 switch (proc_bind) { 4676 4677 case proc_bind_default: 4678 // Serial teams might have the proc_bind policy set to proc_bind_default. 4679 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 
4680 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4681 break; 4682 4683 case proc_bind_primary: { 4684 int f; 4685 int n_th = team->t.t_nproc; 4686 for (f = 1; f < n_th; f++) { 4687 kmp_info_t *th = team->t.t_threads[f]; 4688 KMP_DEBUG_ASSERT(th != NULL); 4689 th->th.th_first_place = first_place; 4690 th->th.th_last_place = last_place; 4691 th->th.th_new_place = masters_place; 4692 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4693 team->t.t_display_affinity != 1) { 4694 team->t.t_display_affinity = 1; 4695 } 4696 4697 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4698 "partition = [%d,%d]\n", 4699 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4700 f, masters_place, first_place, last_place)); 4701 } 4702 } break; 4703 4704 case proc_bind_close: { 4705 int f; 4706 int n_th = team->t.t_nproc; 4707 int n_places; 4708 if (first_place <= last_place) { 4709 n_places = last_place - first_place + 1; 4710 } else { 4711 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4712 } 4713 if (n_th <= n_places) { 4714 int place = masters_place; 4715 for (f = 1; f < n_th; f++) { 4716 kmp_info_t *th = team->t.t_threads[f]; 4717 KMP_DEBUG_ASSERT(th != NULL); 4718 4719 if (place == last_place) { 4720 place = first_place; 4721 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4722 place = 0; 4723 } else { 4724 place++; 4725 } 4726 th->th.th_first_place = first_place; 4727 th->th.th_last_place = last_place; 4728 th->th.th_new_place = place; 4729 if (__kmp_display_affinity && place != th->th.th_current_place && 4730 team->t.t_display_affinity != 1) { 4731 team->t.t_display_affinity = 1; 4732 } 4733 4734 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4735 "partition = [%d,%d]\n", 4736 __kmp_gtid_from_thread(team->t.t_threads[f]), 4737 team->t.t_id, f, place, first_place, last_place)); 4738 } 4739 } else { 4740 int S, rem, gap, s_count; 4741 S = n_th / n_places; 4742 s_count = 0; 4743 rem = n_th - (S * n_places); 4744 gap = rem > 0 ? 
n_places / rem : n_places; 4745 int place = masters_place; 4746 int gap_ct = gap; 4747 for (f = 0; f < n_th; f++) { 4748 kmp_info_t *th = team->t.t_threads[f]; 4749 KMP_DEBUG_ASSERT(th != NULL); 4750 4751 th->th.th_first_place = first_place; 4752 th->th.th_last_place = last_place; 4753 th->th.th_new_place = place; 4754 if (__kmp_display_affinity && place != th->th.th_current_place && 4755 team->t.t_display_affinity != 1) { 4756 team->t.t_display_affinity = 1; 4757 } 4758 s_count++; 4759 4760 if ((s_count == S) && rem && (gap_ct == gap)) { 4761 // do nothing, add an extra thread to place on next iteration 4762 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4763 // we added an extra thread to this place; move to next place 4764 if (place == last_place) { 4765 place = first_place; 4766 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4767 place = 0; 4768 } else { 4769 place++; 4770 } 4771 s_count = 0; 4772 gap_ct = 1; 4773 rem--; 4774 } else if (s_count == S) { // place full; don't add extra 4775 if (place == last_place) { 4776 place = first_place; 4777 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4778 place = 0; 4779 } else { 4780 place++; 4781 } 4782 gap_ct++; 4783 s_count = 0; 4784 } 4785 4786 KA_TRACE(100, 4787 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4788 "partition = [%d,%d]\n", 4789 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4790 th->th.th_new_place, first_place, last_place)); 4791 } 4792 KMP_DEBUG_ASSERT(place == masters_place); 4793 } 4794 } break; 4795 4796 case proc_bind_spread: { 4797 int f; 4798 int n_th = team->t.t_nproc; 4799 int n_places; 4800 int thidx; 4801 if (first_place <= last_place) { 4802 n_places = last_place - first_place + 1; 4803 } else { 4804 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4805 } 4806 if (n_th <= n_places) { 4807 int place = -1; 4808 4809 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4810 int S = n_places / n_th; 4811 int s_count, rem, gap, gap_ct; 4812 4813 place = masters_place; 4814 rem = n_places - n_th * S; 4815 gap = rem ? 
n_th / rem : 1; 4816 gap_ct = gap; 4817 thidx = n_th; 4818 if (update_master_only == 1) 4819 thidx = 1; 4820 for (f = 0; f < thidx; f++) { 4821 kmp_info_t *th = team->t.t_threads[f]; 4822 KMP_DEBUG_ASSERT(th != NULL); 4823 4824 th->th.th_first_place = place; 4825 th->th.th_new_place = place; 4826 if (__kmp_display_affinity && place != th->th.th_current_place && 4827 team->t.t_display_affinity != 1) { 4828 team->t.t_display_affinity = 1; 4829 } 4830 s_count = 1; 4831 while (s_count < S) { 4832 if (place == last_place) { 4833 place = first_place; 4834 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4835 place = 0; 4836 } else { 4837 place++; 4838 } 4839 s_count++; 4840 } 4841 if (rem && (gap_ct == gap)) { 4842 if (place == last_place) { 4843 place = first_place; 4844 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4845 place = 0; 4846 } else { 4847 place++; 4848 } 4849 rem--; 4850 gap_ct = 0; 4851 } 4852 th->th.th_last_place = place; 4853 gap_ct++; 4854 4855 if (place == last_place) { 4856 place = first_place; 4857 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4858 place = 0; 4859 } else { 4860 place++; 4861 } 4862 4863 KA_TRACE(100, 4864 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4865 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4866 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4867 f, th->th.th_new_place, th->th.th_first_place, 4868 th->th.th_last_place, __kmp_affinity_num_masks)); 4869 } 4870 } else { 4871 /* Having uniform space of available computation places I can create 4872 T partitions of round(P/T) size and put threads into the first 4873 place of each partition. */ 4874 double current = static_cast<double>(masters_place); 4875 double spacing = 4876 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4877 int first, last; 4878 kmp_info_t *th; 4879 4880 thidx = n_th + 1; 4881 if (update_master_only == 1) 4882 thidx = 1; 4883 for (f = 0; f < thidx; f++) { 4884 first = static_cast<int>(current); 4885 last = static_cast<int>(current + spacing) - 1; 4886 KMP_DEBUG_ASSERT(last >= first); 4887 if (first >= n_places) { 4888 if (masters_place) { 4889 first -= n_places; 4890 last -= n_places; 4891 if (first == (masters_place + 1)) { 4892 KMP_DEBUG_ASSERT(f == n_th); 4893 first--; 4894 } 4895 if (last == masters_place) { 4896 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4897 last--; 4898 } 4899 } else { 4900 KMP_DEBUG_ASSERT(f == n_th); 4901 first = 0; 4902 last = 0; 4903 } 4904 } 4905 if (last >= n_places) { 4906 last = (n_places - 1); 4907 } 4908 place = first; 4909 current += spacing; 4910 if (f < n_th) { 4911 KMP_DEBUG_ASSERT(0 <= first); 4912 KMP_DEBUG_ASSERT(n_places > first); 4913 KMP_DEBUG_ASSERT(0 <= last); 4914 KMP_DEBUG_ASSERT(n_places > last); 4915 KMP_DEBUG_ASSERT(last_place >= first_place); 4916 th = team->t.t_threads[f]; 4917 KMP_DEBUG_ASSERT(th); 4918 th->th.th_first_place = first; 4919 th->th.th_new_place = place; 4920 th->th.th_last_place = last; 4921 if (__kmp_display_affinity && place != th->th.th_current_place && 4922 team->t.t_display_affinity != 1) { 4923 team->t.t_display_affinity = 1; 4924 } 4925 KA_TRACE(100, 4926 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4927 "partition = [%d,%d], spacing = %.4f\n", 4928 __kmp_gtid_from_thread(team->t.t_threads[f]), 4929 team->t.t_id, f, th->th.th_new_place, 4930 th->th.th_first_place, th->th.th_last_place, spacing)); 4931 } 4932 } 4933 } 4934 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4935 } else { 4936 int S, rem, gap, 
s_count; 4937 S = n_th / n_places; 4938 s_count = 0; 4939 rem = n_th - (S * n_places); 4940 gap = rem > 0 ? n_places / rem : n_places; 4941 int place = masters_place; 4942 int gap_ct = gap; 4943 thidx = n_th; 4944 if (update_master_only == 1) 4945 thidx = 1; 4946 for (f = 0; f < thidx; f++) { 4947 kmp_info_t *th = team->t.t_threads[f]; 4948 KMP_DEBUG_ASSERT(th != NULL); 4949 4950 th->th.th_first_place = place; 4951 th->th.th_last_place = place; 4952 th->th.th_new_place = place; 4953 if (__kmp_display_affinity && place != th->th.th_current_place && 4954 team->t.t_display_affinity != 1) { 4955 team->t.t_display_affinity = 1; 4956 } 4957 s_count++; 4958 4959 if ((s_count == S) && rem && (gap_ct == gap)) { 4960 // do nothing, add an extra thread to place on next iteration 4961 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4962 // we added an extra thread to this place; move on to next place 4963 if (place == last_place) { 4964 place = first_place; 4965 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4966 place = 0; 4967 } else { 4968 place++; 4969 } 4970 s_count = 0; 4971 gap_ct = 1; 4972 rem--; 4973 } else if (s_count == S) { // place is full; don't add extra thread 4974 if (place == last_place) { 4975 place = first_place; 4976 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4977 place = 0; 4978 } else { 4979 place++; 4980 } 4981 gap_ct++; 4982 s_count = 0; 4983 } 4984 4985 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4986 "partition = [%d,%d]\n", 4987 __kmp_gtid_from_thread(team->t.t_threads[f]), 4988 team->t.t_id, f, th->th.th_new_place, 4989 th->th.th_first_place, th->th.th_last_place)); 4990 } 4991 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4992 } 4993 } break; 4994 4995 default: 4996 break; 4997 } 4998 4999 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 5000 } 5001 5002 #endif // KMP_AFFINITY_SUPPORTED 5003 5004 /* allocate a new team data structure to use. take one off of the free pool if 5005 available */ 5006 kmp_team_t * 5007 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 5008 #if OMPT_SUPPORT 5009 ompt_data_t ompt_parallel_data, 5010 #endif 5011 kmp_proc_bind_t new_proc_bind, 5012 kmp_internal_control_t *new_icvs, 5013 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5014 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 5015 int f; 5016 kmp_team_t *team; 5017 int use_hot_team = !root->r.r_active; 5018 int level = 0; 5019 5020 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 5021 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 5022 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 5023 KMP_MB(); 5024 5025 #if KMP_NESTED_HOT_TEAMS 5026 kmp_hot_team_ptr_t *hot_teams; 5027 if (master) { 5028 team = master->th.th_team; 5029 level = team->t.t_active_level; 5030 if (master->th.th_teams_microtask) { // in teams construct? 
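// The hot-team level is bumped only when the teams construct really adds a
// nesting level: #teams > 1 and either this is the inner fork spawned by
// __kmp_teams_master or a parallel region nested inside the teams region.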
5031 if (master->th.th_teams_size.nteams > 1 && 5032 ( // #teams > 1 5033 team->t.t_pkfn == 5034 (microtask_t)__kmp_teams_master || // inner fork of the teams 5035 master->th.th_teams_level < 5036 team->t.t_level)) { // or nested parallel inside the teams 5037 ++level; // not increment if #teams==1, or for outer fork of the teams; 5038 // increment otherwise 5039 } 5040 } 5041 hot_teams = master->th.th_hot_teams; 5042 if (level < __kmp_hot_teams_max_level && hot_teams && 5043 hot_teams[level].hot_team) { 5044 // hot team has already been allocated for given level 5045 use_hot_team = 1; 5046 } else { 5047 use_hot_team = 0; 5048 } 5049 } else { 5050 // check we won't access uninitialized hot_teams, just in case 5051 KMP_DEBUG_ASSERT(new_nproc == 1); 5052 } 5053 #endif 5054 // Optimization to use a "hot" team 5055 if (use_hot_team && new_nproc > 1) { 5056 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5057 #if KMP_NESTED_HOT_TEAMS 5058 team = hot_teams[level].hot_team; 5059 #else 5060 team = root->r.r_hot_team; 5061 #endif 5062 #if KMP_DEBUG 5063 if (__kmp_tasking_mode != tskm_immediate_exec) { 5064 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5065 "task_team[1] = %p before reinit\n", 5066 team->t.t_task_team[0], team->t.t_task_team[1])); 5067 } 5068 #endif 5069 5070 if (team->t.t_nproc != new_nproc && 5071 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5072 // Distributed barrier may need a resize 5073 int old_nthr = team->t.t_nproc; 5074 __kmp_resize_dist_barrier(team, old_nthr, new_nproc); 5075 } 5076 5077 // Has the number of threads changed? 5078 /* Let's assume the most common case is that the number of threads is 5079 unchanged, and put that case first. */ 5080 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5081 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5082 // This case can mean that omp_set_num_threads() was called and the hot 5083 // team size was already reduced, so we check the special flag 5084 if (team->t.t_size_changed == -1) { 5085 team->t.t_size_changed = 1; 5086 } else { 5087 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5088 } 5089 5090 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5091 kmp_r_sched_t new_sched = new_icvs->sched; 5092 // set primary thread's schedule as new run-time schedule 5093 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5094 5095 __kmp_reinitialize_team(team, new_icvs, 5096 root->r.r_uber_thread->th.th_ident); 5097 5098 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5099 team->t.t_threads[0], team)); 5100 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5101 5102 #if KMP_AFFINITY_SUPPORTED 5103 if ((team->t.t_size_changed == 0) && 5104 (team->t.t_proc_bind == new_proc_bind)) { 5105 if (new_proc_bind == proc_bind_spread) { 5106 __kmp_partition_places( 5107 team, 1); // add flag to update only master for spread 5108 } 5109 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5110 "proc_bind = %d, partition = [%d,%d]\n", 5111 team->t.t_id, new_proc_bind, team->t.t_first_place, 5112 team->t.t_last_place)); 5113 } else { 5114 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5115 __kmp_partition_places(team); 5116 } 5117 #else 5118 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5119 #endif /* KMP_AFFINITY_SUPPORTED */ 5120 } else if (team->t.t_nproc > new_nproc) { 5121 KA_TRACE(20, 5122 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5123 new_nproc)); 5124 5125 
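// Shrinking the hot team: depending on __kmp_hot_teams_mode, the surplus
// workers handled below are either returned to the thread pool (mode 0) or
// kept in the team as reserve, parked on their own b_go flag (mode 1).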
team->t.t_size_changed = 1; 5126 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5127 // Barrier size already reduced earlier in this function 5128 // Activate team threads via th_used_in_team 5129 __kmp_add_threads_to_team(team, new_nproc); 5130 } 5131 #if KMP_NESTED_HOT_TEAMS 5132 if (__kmp_hot_teams_mode == 0) { 5133 // AC: saved number of threads should correspond to team's value in this 5134 // mode, can be bigger in mode 1, when hot team has threads in reserve 5135 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5136 hot_teams[level].hot_team_nth = new_nproc; 5137 #endif // KMP_NESTED_HOT_TEAMS 5138 /* release the extra threads we don't need any more */ 5139 for (f = new_nproc; f < team->t.t_nproc; f++) { 5140 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5141 if (__kmp_tasking_mode != tskm_immediate_exec) { 5142 // When decreasing team size, threads no longer in the team should 5143 // unref task team. 5144 team->t.t_threads[f]->th.th_task_team = NULL; 5145 } 5146 __kmp_free_thread(team->t.t_threads[f]); 5147 team->t.t_threads[f] = NULL; 5148 } 5149 #if KMP_NESTED_HOT_TEAMS 5150 } // (__kmp_hot_teams_mode == 0) 5151 else { 5152 // When keeping extra threads in team, switch threads to wait on own 5153 // b_go flag 5154 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5155 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5156 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5157 for (int b = 0; b < bs_last_barrier; ++b) { 5158 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5159 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5160 } 5161 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5162 } 5163 } 5164 } 5165 #endif // KMP_NESTED_HOT_TEAMS 5166 team->t.t_nproc = new_nproc; 5167 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5168 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5169 __kmp_reinitialize_team(team, new_icvs, 5170 root->r.r_uber_thread->th.th_ident); 5171 5172 // Update remaining threads 5173 for (f = 0; f < new_nproc; ++f) { 5174 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5175 } 5176 5177 // restore the current task state of the primary thread: should be the 5178 // implicit task 5179 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5180 team->t.t_threads[0], team)); 5181 5182 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5183 5184 #ifdef KMP_DEBUG 5185 for (f = 0; f < team->t.t_nproc; f++) { 5186 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5187 team->t.t_threads[f]->th.th_team_nproc == 5188 team->t.t_nproc); 5189 } 5190 #endif 5191 5192 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5193 #if KMP_AFFINITY_SUPPORTED 5194 __kmp_partition_places(team); 5195 #endif 5196 } else { // team->t.t_nproc < new_nproc 5197 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5198 kmp_affin_mask_t *old_mask; 5199 if (KMP_AFFINITY_CAPABLE()) { 5200 KMP_CPU_ALLOC(old_mask); 5201 } 5202 #endif 5203 5204 KA_TRACE(20, 5205 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5206 new_nproc)); 5207 int old_nproc = team->t.t_nproc; // save old value and use to update only 5208 team->t.t_size_changed = 1; 5209 5210 #if KMP_NESTED_HOT_TEAMS 5211 int avail_threads = hot_teams[level].hot_team_nth; 5212 if (new_nproc < avail_threads) 5213 avail_threads = new_nproc; 5214 kmp_info_t **other_threads = team->t.t_threads; 5215 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5216 // Adjust barrier data of reserved threads (if any) of the 
team 5217 // Other data will be set in __kmp_initialize_info() below. 5218 int b; 5219 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5220 for (b = 0; b < bs_last_barrier; ++b) { 5221 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5222 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5223 #if USE_DEBUGGER 5224 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5225 #endif 5226 } 5227 } 5228 if (hot_teams[level].hot_team_nth >= new_nproc) { 5229 // we have all needed threads in reserve, no need to allocate any 5230 // this only possible in mode 1, cannot have reserved threads in mode 0 5231 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5232 team->t.t_nproc = new_nproc; // just get reserved threads involved 5233 } else { 5234 // We may have some threads in reserve, but not enough; 5235 // get reserved threads involved if any. 5236 team->t.t_nproc = hot_teams[level].hot_team_nth; 5237 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5238 #endif // KMP_NESTED_HOT_TEAMS 5239 if (team->t.t_max_nproc < new_nproc) { 5240 /* reallocate larger arrays */ 5241 __kmp_reallocate_team_arrays(team, new_nproc); 5242 __kmp_reinitialize_team(team, new_icvs, NULL); 5243 } 5244 5245 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5246 /* Temporarily set full mask for primary thread before creation of 5247 workers. The reason is that workers inherit the affinity from the 5248 primary thread, so if a lot of workers are created on the single 5249 core quickly, they don't get a chance to set their own affinity for 5250 a long time. */ 5251 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5252 #endif 5253 5254 /* allocate new threads for the hot team */ 5255 for (f = team->t.t_nproc; f < new_nproc; f++) { 5256 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5257 KMP_DEBUG_ASSERT(new_worker); 5258 team->t.t_threads[f] = new_worker; 5259 5260 KA_TRACE(20, 5261 ("__kmp_allocate_team: team %d init T#%d arrived: " 5262 "join=%llu, plain=%llu\n", 5263 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5264 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5265 team->t.t_bar[bs_plain_barrier].b_arrived)); 5266 5267 { // Initialize barrier data for new threads. 5268 int b; 5269 kmp_balign_t *balign = new_worker->th.th_bar; 5270 for (b = 0; b < bs_last_barrier; ++b) { 5271 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5272 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5273 KMP_BARRIER_PARENT_FLAG); 5274 #if USE_DEBUGGER 5275 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5276 #endif 5277 } 5278 } 5279 } 5280 5281 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5282 if (KMP_AFFINITY_CAPABLE()) { 5283 /* Restore initial primary thread's affinity mask */ 5284 __kmp_set_system_affinity(old_mask, TRUE); 5285 KMP_CPU_FREE(old_mask); 5286 } 5287 #endif 5288 #if KMP_NESTED_HOT_TEAMS 5289 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5290 #endif // KMP_NESTED_HOT_TEAMS 5291 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5292 // Barrier size already increased earlier in this function 5293 // Activate team threads via th_used_in_team 5294 __kmp_add_threads_to_team(team, new_nproc); 5295 } 5296 /* make sure everyone is syncronized */ 5297 // new threads below 5298 __kmp_initialize_team(team, new_nproc, new_icvs, 5299 root->r.r_uber_thread->th.th_ident); 5300 5301 /* reinitialize the threads */ 5302 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5303 for (f = 0; f < team->t.t_nproc; ++f) 5304 __kmp_initialize_info(team->t.t_threads[f], team, f, 5305 __kmp_gtid_from_tid(f, team)); 5306 5307 if (level) { // set th_task_state for new threads in nested hot team 5308 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5309 // only need to set the th_task_state for the new threads. th_task_state 5310 // for primary thread will not be accurate until after this in 5311 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5312 // get the correct value. 5313 for (f = old_nproc; f < team->t.t_nproc; ++f) 5314 team->t.t_threads[f]->th.th_task_state = 5315 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5316 } else { // set th_task_state for new threads in non-nested hot team 5317 // copy primary thread's state 5318 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5319 for (f = old_nproc; f < team->t.t_nproc; ++f) 5320 team->t.t_threads[f]->th.th_task_state = old_state; 5321 } 5322 5323 #ifdef KMP_DEBUG 5324 for (f = 0; f < team->t.t_nproc; ++f) { 5325 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5326 team->t.t_threads[f]->th.th_team_nproc == 5327 team->t.t_nproc); 5328 } 5329 #endif 5330 5331 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5332 #if KMP_AFFINITY_SUPPORTED 5333 __kmp_partition_places(team); 5334 #endif 5335 } // Check changes in number of threads 5336 5337 kmp_info_t *master = team->t.t_threads[0]; 5338 if (master->th.th_teams_microtask) { 5339 for (f = 1; f < new_nproc; ++f) { 5340 // propagate teams construct specific info to workers 5341 kmp_info_t *thr = team->t.t_threads[f]; 5342 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5343 thr->th.th_teams_level = master->th.th_teams_level; 5344 thr->th.th_teams_size = master->th.th_teams_size; 5345 } 5346 } 5347 #if KMP_NESTED_HOT_TEAMS 5348 if (level) { 5349 // Sync barrier state for nested hot teams, not needed for outermost hot 5350 // team. 5351 for (f = 1; f < new_nproc; ++f) { 5352 kmp_info_t *thr = team->t.t_threads[f]; 5353 int b; 5354 kmp_balign_t *balign = thr->th.th_bar; 5355 for (b = 0; b < bs_last_barrier; ++b) { 5356 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5357 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5358 #if USE_DEBUGGER 5359 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5360 #endif 5361 } 5362 } 5363 } 5364 #endif // KMP_NESTED_HOT_TEAMS 5365 5366 /* reallocate space for arguments if necessary */ 5367 __kmp_alloc_argv_entries(argc, team, TRUE); 5368 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5369 // The hot team re-uses the previous task team, 5370 // if untouched during the previous release->gather phase. 
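// At this point the hot team has been resized, rebound and re-initialized as
// needed, and is ready to be returned for reuse.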
5371 5372 KF_TRACE(10, (" hot_team = %p\n", team)); 5373 5374 #if KMP_DEBUG 5375 if (__kmp_tasking_mode != tskm_immediate_exec) { 5376 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5377 "task_team[1] = %p after reinit\n", 5378 team->t.t_task_team[0], team->t.t_task_team[1])); 5379 } 5380 #endif 5381 5382 #if OMPT_SUPPORT 5383 __ompt_team_assign_id(team, ompt_parallel_data); 5384 #endif 5385 5386 KMP_MB(); 5387 5388 return team; 5389 } 5390 5391 /* next, let's try to take one from the team pool */ 5392 KMP_MB(); 5393 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5394 /* TODO: consider resizing undersized teams instead of reaping them, now 5395 that we have a resizing mechanism */ 5396 if (team->t.t_max_nproc >= max_nproc) { 5397 /* take this team from the team pool */ 5398 __kmp_team_pool = team->t.t_next_pool; 5399 5400 if (max_nproc > 1 && 5401 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5402 if (!team->t.b) { // Allocate barrier structure 5403 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5404 } 5405 } 5406 5407 /* setup the team for fresh use */ 5408 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5409 5410 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5411 "task_team[1] %p to NULL\n", 5412 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5413 team->t.t_task_team[0] = NULL; 5414 team->t.t_task_team[1] = NULL; 5415 5416 /* reallocate space for arguments if necessary */ 5417 __kmp_alloc_argv_entries(argc, team, TRUE); 5418 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5419 5420 KA_TRACE( 5421 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5422 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5423 { // Initialize barrier data. 5424 int b; 5425 for (b = 0; b < bs_last_barrier; ++b) { 5426 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5427 #if USE_DEBUGGER 5428 team->t.t_bar[b].b_master_arrived = 0; 5429 team->t.t_bar[b].b_team_arrived = 0; 5430 #endif 5431 } 5432 } 5433 5434 team->t.t_proc_bind = new_proc_bind; 5435 5436 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5437 team->t.t_id)); 5438 5439 #if OMPT_SUPPORT 5440 __ompt_team_assign_id(team, ompt_parallel_data); 5441 #endif 5442 5443 KMP_MB(); 5444 5445 return team; 5446 } 5447 5448 /* reap team if it is too small, then loop back and check the next one */ 5449 // not sure if this is wise, but, will be redone during the hot-teams 5450 // rewrite. 5451 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5452 team = __kmp_reap_team(team); 5453 __kmp_team_pool = team; 5454 } 5455 5456 /* nothing available in the pool, no matter, make a new team! 
*/ 5457 KMP_MB(); 5458 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5459 5460 /* and set it up */ 5461 team->t.t_max_nproc = max_nproc; 5462 if (max_nproc > 1 && 5463 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5464 // Allocate barrier structure 5465 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5466 } 5467 5468 /* NOTE well, for some reason allocating one big buffer and dividing it up 5469 seems to really hurt performance a lot on the P4, so, let's not use this */ 5470 __kmp_allocate_team_arrays(team, max_nproc); 5471 5472 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5473 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5474 5475 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5476 "%p to NULL\n", 5477 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5478 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5479 // memory, no need to duplicate 5480 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5481 // memory, no need to duplicate 5482 5483 if (__kmp_storage_map) { 5484 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5485 } 5486 5487 /* allocate space for arguments */ 5488 __kmp_alloc_argv_entries(argc, team, FALSE); 5489 team->t.t_argc = argc; 5490 5491 KA_TRACE(20, 5492 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5493 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5494 { // Initialize barrier data. 5495 int b; 5496 for (b = 0; b < bs_last_barrier; ++b) { 5497 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5498 #if USE_DEBUGGER 5499 team->t.t_bar[b].b_master_arrived = 0; 5500 team->t.t_bar[b].b_team_arrived = 0; 5501 #endif 5502 } 5503 } 5504 5505 team->t.t_proc_bind = new_proc_bind; 5506 5507 #if OMPT_SUPPORT 5508 __ompt_team_assign_id(team, ompt_parallel_data); 5509 team->t.ompt_serialized_team_info = NULL; 5510 #endif 5511 5512 KMP_MB(); 5513 5514 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5515 team->t.t_id)); 5516 5517 return team; 5518 } 5519 5520 /* TODO implement hot-teams at all levels */ 5521 /* TODO implement lazy thread release on demand (disband request) */ 5522 5523 /* free the team. return it to the team pool. release all the threads 5524 * associated with it */ 5525 void __kmp_free_team(kmp_root_t *root, 5526 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5527 int f; 5528 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5529 team->t.t_id)); 5530 5531 /* verify state */ 5532 KMP_DEBUG_ASSERT(root); 5533 KMP_DEBUG_ASSERT(team); 5534 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5535 KMP_DEBUG_ASSERT(team->t.t_threads); 5536 5537 int use_hot_team = team == root->r.r_hot_team; 5538 #if KMP_NESTED_HOT_TEAMS 5539 int level; 5540 if (master) { 5541 level = team->t.t_active_level - 1; 5542 if (master->th.th_teams_microtask) { // in teams construct? 
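// Adjust the nesting level for teams constructs so the team is matched
// against the correct hot_teams[level] entry below.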
5543 if (master->th.th_teams_size.nteams > 1) { 5544 ++level; // level was not increased in teams construct for 5545 // team_of_masters 5546 } 5547 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5548 master->th.th_teams_level == team->t.t_level) { 5549 ++level; // level was not increased in teams construct for 5550 // team_of_workers before the parallel 5551 } // team->t.t_level will be increased inside parallel 5552 } 5553 #if KMP_DEBUG 5554 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; 5555 #endif 5556 if (level < __kmp_hot_teams_max_level) { 5557 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5558 use_hot_team = 1; 5559 } 5560 } 5561 #endif // KMP_NESTED_HOT_TEAMS 5562 5563 /* team is done working */ 5564 TCW_SYNC_PTR(team->t.t_pkfn, 5565 NULL); // Important for Debugging Support Library. 5566 #if KMP_OS_WINDOWS 5567 team->t.t_copyin_counter = 0; // init counter for possible reuse 5568 #endif 5569 // Do not reset pointer to parent team to NULL for hot teams. 5570 5571 /* if we are non-hot team, release our threads */ 5572 if (!use_hot_team) { 5573 if (__kmp_tasking_mode != tskm_immediate_exec) { 5574 // Wait for threads to reach reapable state 5575 for (f = 1; f < team->t.t_nproc; ++f) { 5576 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5577 kmp_info_t *th = team->t.t_threads[f]; 5578 volatile kmp_uint32 *state = &th->th.th_reap_state; 5579 while (*state != KMP_SAFE_TO_REAP) { 5580 #if KMP_OS_WINDOWS 5581 // On Windows a thread can be killed at any time, check this 5582 DWORD ecode; 5583 if (!__kmp_is_thread_alive(th, &ecode)) { 5584 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5585 break; 5586 } 5587 #endif 5588 // first check if thread is sleeping 5589 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5590 if (fl.is_sleeping()) 5591 fl.resume(__kmp_gtid_from_thread(th)); 5592 KMP_CPU_PAUSE(); 5593 } 5594 } 5595 5596 // Delete task teams 5597 int tt_idx; 5598 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5599 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5600 if (task_team != NULL) { 5601 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5602 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5603 team->t.t_threads[f]->th.th_task_team = NULL; 5604 } 5605 KA_TRACE( 5606 20, 5607 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5608 __kmp_get_gtid(), task_team, team->t.t_id)); 5609 #if KMP_NESTED_HOT_TEAMS 5610 __kmp_free_task_team(master, task_team); 5611 #endif 5612 team->t.t_task_team[tt_idx] = NULL; 5613 } 5614 } 5615 } 5616 5617 // Reset pointer to parent team only for non-hot teams. 
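// (Hot teams skip this whole block and keep t_parent, t_level and
// t_active_level across reuse.)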
5618 team->t.t_parent = NULL; 5619 team->t.t_level = 0; 5620 team->t.t_active_level = 0; 5621 5622 /* free the worker threads */ 5623 for (f = 1; f < team->t.t_nproc; ++f) { 5624 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5625 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5626 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 5627 1, 2); 5628 } 5629 __kmp_free_thread(team->t.t_threads[f]); 5630 } 5631 5632 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5633 if (team->t.b) { 5634 // wake up thread at old location 5635 team->t.b->go_release(); 5636 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5637 for (f = 1; f < team->t.t_nproc; ++f) { 5638 if (team->t.b->sleep[f].sleep) { 5639 __kmp_atomic_resume_64( 5640 team->t.t_threads[f]->th.th_info.ds.ds_gtid, 5641 (kmp_atomic_flag_64<> *)NULL); 5642 } 5643 } 5644 } 5645 // Wait for threads to be removed from team 5646 for (int f = 1; f < team->t.t_nproc; ++f) { 5647 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) 5648 KMP_CPU_PAUSE(); 5649 } 5650 } 5651 } 5652 5653 for (f = 1; f < team->t.t_nproc; ++f) { 5654 team->t.t_threads[f] = NULL; 5655 } 5656 5657 if (team->t.t_max_nproc > 1 && 5658 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5659 distributedBarrier::deallocate(team->t.b); 5660 team->t.b = NULL; 5661 } 5662 /* put the team back in the team pool */ 5663 /* TODO limit size of team pool, call reap_team if pool too large */ 5664 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5665 __kmp_team_pool = (volatile kmp_team_t *)team; 5666 } else { // Check if team was created for primary threads in teams construct 5667 // See if first worker is a CG root 5668 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5669 team->t.t_threads[1]->th.th_cg_roots); 5670 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5671 // Clean up the CG root nodes on workers so that this team can be re-used 5672 for (f = 1; f < team->t.t_nproc; ++f) { 5673 kmp_info_t *thr = team->t.t_threads[f]; 5674 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5675 thr->th.th_cg_roots->cg_root == thr); 5676 // Pop current CG root off list 5677 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5678 thr->th.th_cg_roots = tmp->up; 5679 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5680 " up to node %p. cg_nthreads was %d\n", 5681 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5682 int i = tmp->cg_nthreads--; 5683 if (i == 1) { 5684 __kmp_free(tmp); // free CG if we are the last thread in it 5685 } 5686 // Restore current task's thread_limit from CG root 5687 if (thr->th.th_cg_roots) 5688 thr->th.th_current_task->td_icvs.thread_limit = 5689 thr->th.th_cg_roots->cg_thread_limit; 5690 } 5691 } 5692 } 5693 5694 KMP_MB(); 5695 } 5696 5697 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5698 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5699 kmp_team_t *next_pool = team->t.t_next_pool; 5700 5701 KMP_DEBUG_ASSERT(team); 5702 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5703 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5704 KMP_DEBUG_ASSERT(team->t.t_threads); 5705 KMP_DEBUG_ASSERT(team->t.t_argv); 5706 5707 /* TODO clean the threads that are a part of this? */ 5708 5709 /* free stuff */ 5710 __kmp_free_team_arrays(team); 5711 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5712 __kmp_free((void *)team->t.t_argv); 5713 __kmp_free(team); 5714 5715 KMP_MB(); 5716 return next_pool; 5717 } 5718 5719 // Free the thread. 
Don't reap it, just place it on the pool of available 5720 // threads. 5721 // 5722 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5723 // binding for the affinity mechanism to be useful. 5724 // 5725 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5726 // However, we want to avoid a potential performance problem by always 5727 // scanning through the list to find the correct point at which to insert 5728 // the thread (potential N**2 behavior). To do this we keep track of the 5729 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5730 // With single-level parallelism, threads will always be added to the tail 5731 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5732 // parallelism, all bets are off and we may need to scan through the entire 5733 // free list. 5734 // 5735 // This change also has a potentially large performance benefit, for some 5736 // applications. Previously, as threads were freed from the hot team, they 5737 // would be placed back on the free list in inverse order. If the hot team 5738 // grew back to it's original size, then the freed thread would be placed 5739 // back on the hot team in reverse order. This could cause bad cache 5740 // locality problems on programs where the size of the hot team regularly 5741 // grew and shrunk. 5742 // 5743 // Now, for single-level parallelism, the OMP tid is always == gtid. 5744 void __kmp_free_thread(kmp_info_t *this_th) { 5745 int gtid; 5746 kmp_info_t **scan; 5747 5748 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5749 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5750 5751 KMP_DEBUG_ASSERT(this_th); 5752 5753 // When moving thread to pool, switch thread to wait on own b_go flag, and 5754 // uninitialized (NULL team). 5755 int b; 5756 kmp_balign_t *balign = this_th->th.th_bar; 5757 for (b = 0; b < bs_last_barrier; ++b) { 5758 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5759 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5760 balign[b].bb.team = NULL; 5761 balign[b].bb.leaf_kids = 0; 5762 } 5763 this_th->th.th_task_state = 0; 5764 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5765 5766 /* put thread back on the free pool */ 5767 TCW_PTR(this_th->th.th_team, NULL); 5768 TCW_PTR(this_th->th.th_root, NULL); 5769 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5770 5771 while (this_th->th.th_cg_roots) { 5772 this_th->th.th_cg_roots->cg_nthreads--; 5773 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5774 " %p of thread %p to %d\n", 5775 this_th, this_th->th.th_cg_roots, 5776 this_th->th.th_cg_roots->cg_root, 5777 this_th->th.th_cg_roots->cg_nthreads)); 5778 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5779 if (tmp->cg_root == this_th) { // Thread is a cg_root 5780 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5781 KA_TRACE( 5782 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5783 this_th->th.th_cg_roots = tmp->up; 5784 __kmp_free(tmp); 5785 } else { // Worker thread 5786 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5787 __kmp_free(tmp); 5788 } 5789 this_th->th.th_cg_roots = NULL; 5790 break; 5791 } 5792 } 5793 5794 /* If the implicit task assigned to this thread can be used by other threads 5795 * -> multiple threads can share the data and try to free the task at 5796 * __kmp_reap_thread at exit. 
This duplicate use of the task data can happen 5797 * with higher probability when hot team is disabled but can occurs even when 5798 * the hot team is enabled */ 5799 __kmp_free_implicit_task(this_th); 5800 this_th->th.th_current_task = NULL; 5801 5802 // If the __kmp_thread_pool_insert_pt is already past the new insert 5803 // point, then we need to re-scan the entire list. 5804 gtid = this_th->th.th_info.ds.ds_gtid; 5805 if (__kmp_thread_pool_insert_pt != NULL) { 5806 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5807 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5808 __kmp_thread_pool_insert_pt = NULL; 5809 } 5810 } 5811 5812 // Scan down the list to find the place to insert the thread. 5813 // scan is the address of a link in the list, possibly the address of 5814 // __kmp_thread_pool itself. 5815 // 5816 // In the absence of nested parallelism, the for loop will have 0 iterations. 5817 if (__kmp_thread_pool_insert_pt != NULL) { 5818 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5819 } else { 5820 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5821 } 5822 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5823 scan = &((*scan)->th.th_next_pool)) 5824 ; 5825 5826 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5827 // to its address. 5828 TCW_PTR(this_th->th.th_next_pool, *scan); 5829 __kmp_thread_pool_insert_pt = *scan = this_th; 5830 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5831 (this_th->th.th_info.ds.ds_gtid < 5832 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5833 TCW_4(this_th->th.th_in_pool, TRUE); 5834 __kmp_suspend_initialize_thread(this_th); 5835 __kmp_lock_suspend_mx(this_th); 5836 if (this_th->th.th_active == TRUE) { 5837 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5838 this_th->th.th_active_in_pool = TRUE; 5839 } 5840 #if KMP_DEBUG 5841 else { 5842 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5843 } 5844 #endif 5845 __kmp_unlock_suspend_mx(this_th); 5846 5847 TCW_4(__kmp_nth, __kmp_nth - 1); 5848 5849 #ifdef KMP_ADJUST_BLOCKTIME 5850 /* Adjust blocktime back to user setting or default if necessary */ 5851 /* Middle initialization might never have occurred */ 5852 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5853 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5854 if (__kmp_nth <= __kmp_avail_proc) { 5855 __kmp_zero_bt = FALSE; 5856 } 5857 } 5858 #endif /* KMP_ADJUST_BLOCKTIME */ 5859 5860 KMP_MB(); 5861 } 5862 5863 /* ------------------------------------------------------------------------ */ 5864 5865 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5866 #if OMP_PROFILING_SUPPORT 5867 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5868 // TODO: add a configuration option for time granularity 5869 if (ProfileTraceFile) 5870 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5871 #endif 5872 5873 int gtid = this_thr->th.th_info.ds.ds_gtid; 5874 /* void *stack_data;*/ 5875 kmp_team_t **volatile pteam; 5876 5877 KMP_MB(); 5878 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5879 5880 if (__kmp_env_consistency_check) { 5881 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
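// (th_cons is released via __kmp_free_cons_stack in __kmp_reap_thread when
// the thread is eventually reaped; see below.)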
5882 } 5883 5884 #if OMPD_SUPPORT 5885 if (ompd_state & OMPD_ENABLE_BP) 5886 ompd_bp_thread_begin(); 5887 #endif 5888 5889 #if OMPT_SUPPORT 5890 ompt_data_t *thread_data = nullptr; 5891 if (ompt_enabled.enabled) { 5892 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5893 *thread_data = ompt_data_none; 5894 5895 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5896 this_thr->th.ompt_thread_info.wait_id = 0; 5897 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5898 this_thr->th.ompt_thread_info.parallel_flags = 0; 5899 if (ompt_enabled.ompt_callback_thread_begin) { 5900 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5901 ompt_thread_worker, thread_data); 5902 } 5903 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5904 } 5905 #endif 5906 5907 /* This is the place where threads wait for work */ 5908 while (!TCR_4(__kmp_global.g.g_done)) { 5909 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5910 KMP_MB(); 5911 5912 /* wait for work to do */ 5913 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5914 5915 /* No tid yet since not part of a team */ 5916 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5917 5918 #if OMPT_SUPPORT 5919 if (ompt_enabled.enabled) { 5920 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5921 } 5922 #endif 5923 5924 pteam = &this_thr->th.th_team; 5925 5926 /* have we been allocated? */ 5927 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5928 /* we were just woken up, so run our new task */ 5929 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5930 int rc; 5931 KA_TRACE(20, 5932 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5933 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5934 (*pteam)->t.t_pkfn)); 5935 5936 updateHWFPControl(*pteam); 5937 5938 #if OMPT_SUPPORT 5939 if (ompt_enabled.enabled) { 5940 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5941 } 5942 #endif 5943 5944 rc = (*pteam)->t.t_invoke(gtid); 5945 KMP_ASSERT(rc); 5946 5947 KMP_MB(); 5948 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5949 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5950 (*pteam)->t.t_pkfn)); 5951 } 5952 #if OMPT_SUPPORT 5953 if (ompt_enabled.enabled) { 5954 /* no frame set while outside task */ 5955 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5956 5957 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5958 } 5959 #endif 5960 /* join barrier after parallel region */ 5961 __kmp_join_barrier(gtid); 5962 } 5963 } 5964 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5965 5966 #if OMPD_SUPPORT 5967 if (ompd_state & OMPD_ENABLE_BP) 5968 ompd_bp_thread_end(); 5969 #endif 5970 5971 #if OMPT_SUPPORT 5972 if (ompt_enabled.ompt_callback_thread_end) { 5973 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5974 } 5975 #endif 5976 5977 this_thr->th.th_task_team = NULL; 5978 /* run the destructors for the threadprivate data for this thread */ 5979 __kmp_common_destroy_gtid(gtid); 5980 5981 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5982 KMP_MB(); 5983 5984 #if OMP_PROFILING_SUPPORT 5985 llvm::timeTraceProfilerFinishThread(); 5986 #endif 5987 return this_thr; 5988 } 5989 5990 /* ------------------------------------------------------------------------ */ 5991 5992 void __kmp_internal_end_dest(void *specific_gtid) { 5993 // Make sure no significant bits are lost 5994 int gtid; 5995 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5996 5997 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 5998 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5999 * this is because 0 is reserved for the nothing-stored case */ 6000 6001 __kmp_internal_end_thread(gtid); 6002 } 6003 6004 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 6005 6006 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 6007 __kmp_internal_end_atexit(); 6008 } 6009 6010 #endif 6011 6012 /* [Windows] josh: when the atexit handler is called, there may still be more 6013 than one thread alive */ 6014 void __kmp_internal_end_atexit(void) { 6015 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 6016 /* [Windows] 6017 josh: ideally, we want to completely shutdown the library in this atexit 6018 handler, but stat code that depends on thread specific data for gtid fails 6019 because that data becomes unavailable at some point during the shutdown, so 6020 we call __kmp_internal_end_thread instead. We should eventually remove the 6021 dependency on __kmp_get_specific_gtid in the stat code and use 6022 __kmp_internal_end_library to cleanly shutdown the library. 6023 6024 // TODO: Can some of this comment about GVS be removed? 6025 I suspect that the offending stat code is executed when the calling thread 6026 tries to clean up a dead root thread's data structures, resulting in GVS 6027 code trying to close the GVS structures for that thread, but since the stat 6028 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 6029 the calling thread is cleaning up itself instead of another thread, it get 6030 confused. This happens because allowing a thread to unregister and cleanup 6031 another thread is a recent modification for addressing an issue. 6032 Based on the current design (20050722), a thread may end up 6033 trying to unregister another thread only if thread death does not trigger 6034 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 6035 thread specific data destructor function to detect thread death. For 6036 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 6037 is nothing. Thus, the workaround is applicable only for Windows static 6038 stat library. */ 6039 __kmp_internal_end_library(-1); 6040 #if KMP_OS_WINDOWS 6041 __kmp_close_console(); 6042 #endif 6043 } 6044 6045 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 6046 // It is assumed __kmp_forkjoin_lock is acquired. 6047 6048 int gtid; 6049 6050 KMP_DEBUG_ASSERT(thread != NULL); 6051 6052 gtid = thread->th.th_info.ds.ds_gtid; 6053 6054 if (!is_root) { 6055 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 6056 /* Assume the threads are at the fork barrier here */ 6057 KA_TRACE( 6058 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 6059 gtid)); 6060 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 6061 while ( 6062 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) 6063 KMP_CPU_PAUSE(); 6064 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); 6065 } else { 6066 /* Need release fence here to prevent seg faults for tree forkjoin 6067 barrier (GEH) */ 6068 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 6069 thread); 6070 __kmp_release_64(&flag); 6071 } 6072 } 6073 6074 // Terminate OS thread. 6075 __kmp_reap_worker(thread); 6076 6077 // The thread was killed asynchronously. If it was actively 6078 // spinning in the thread pool, decrement the global count. 
6079 // 6080 // There is a small timing hole here - if the worker thread was just waking 6081 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 6082 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 6083 // the global counter might not get updated. 6084 // 6085 // Currently, this can only happen as the library is unloaded, 6086 // so there are no harmful side effects. 6087 if (thread->th.th_active_in_pool) { 6088 thread->th.th_active_in_pool = FALSE; 6089 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 6090 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 6091 } 6092 } 6093 6094 __kmp_free_implicit_task(thread); 6095 6096 // Free the fast memory for tasking 6097 #if USE_FAST_MEMORY 6098 __kmp_free_fast_memory(thread); 6099 #endif /* USE_FAST_MEMORY */ 6100 6101 __kmp_suspend_uninitialize_thread(thread); 6102 6103 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6104 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6105 6106 --__kmp_all_nth; 6107 // __kmp_nth was decremented when thread is added to the pool. 6108 6109 #ifdef KMP_ADJUST_BLOCKTIME 6110 /* Adjust blocktime back to user setting or default if necessary */ 6111 /* Middle initialization might never have occurred */ 6112 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6113 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6114 if (__kmp_nth <= __kmp_avail_proc) { 6115 __kmp_zero_bt = FALSE; 6116 } 6117 } 6118 #endif /* KMP_ADJUST_BLOCKTIME */ 6119 6120 /* free the memory being used */ 6121 if (__kmp_env_consistency_check) { 6122 if (thread->th.th_cons) { 6123 __kmp_free_cons_stack(thread->th.th_cons); 6124 thread->th.th_cons = NULL; 6125 } 6126 } 6127 6128 if (thread->th.th_pri_common != NULL) { 6129 __kmp_free(thread->th.th_pri_common); 6130 thread->th.th_pri_common = NULL; 6131 } 6132 6133 if (thread->th.th_task_state_memo_stack != NULL) { 6134 __kmp_free(thread->th.th_task_state_memo_stack); 6135 thread->th.th_task_state_memo_stack = NULL; 6136 } 6137 6138 #if KMP_USE_BGET 6139 if (thread->th.th_local.bget_data != NULL) { 6140 __kmp_finalize_bget(thread); 6141 } 6142 #endif 6143 6144 #if KMP_AFFINITY_SUPPORTED 6145 if (thread->th.th_affin_mask != NULL) { 6146 KMP_CPU_FREE(thread->th.th_affin_mask); 6147 thread->th.th_affin_mask = NULL; 6148 } 6149 #endif /* KMP_AFFINITY_SUPPORTED */ 6150 6151 #if KMP_USE_HIER_SCHED 6152 if (thread->th.th_hier_bar_data != NULL) { 6153 __kmp_free(thread->th.th_hier_bar_data); 6154 thread->th.th_hier_bar_data = NULL; 6155 } 6156 #endif 6157 6158 __kmp_reap_team(thread->th.th_serial_team); 6159 thread->th.th_serial_team = NULL; 6160 __kmp_free(thread); 6161 6162 KMP_MB(); 6163 6164 } // __kmp_reap_thread 6165 6166 static void __kmp_itthash_clean(kmp_info_t *th) { 6167 #if USE_ITT_NOTIFY 6168 if (__kmp_itt_region_domains.count > 0) { 6169 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6170 kmp_itthash_entry_t *bucket = __kmp_itt_region_domains.buckets[i]; 6171 while (bucket) { 6172 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6173 __kmp_thread_free(th, bucket); 6174 bucket = next; 6175 } 6176 } 6177 } 6178 if (__kmp_itt_barrier_domains.count > 0) { 6179 for (int i = 0; i < KMP_MAX_FRAME_DOMAINS; ++i) { 6180 kmp_itthash_entry_t *bucket = __kmp_itt_barrier_domains.buckets[i]; 6181 while (bucket) { 6182 kmp_itthash_entry_t *next = bucket->next_in_bucket; 6183 __kmp_thread_free(th, bucket); 6184 bucket = next; 6185 } 6186 } 6187 } 6188 #endif 6189 } 6190 6191 static void __kmp_internal_end(void) { 6192 int i; 6193 6194 /* First, unregister the library 
*/ 6195 __kmp_unregister_library(); 6196 6197 #if KMP_OS_WINDOWS 6198 /* In Win static library, we can't tell when a root actually dies, so we 6199 reclaim the data structures for any root threads that have died but not 6200 unregistered themselves, in order to shut down cleanly. 6201 In Win dynamic library we also can't tell when a thread dies. */ 6202 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6203 // dead roots 6204 #endif 6205 6206 for (i = 0; i < __kmp_threads_capacity; i++) 6207 if (__kmp_root[i]) 6208 if (__kmp_root[i]->r.r_active) 6209 break; 6210 KMP_MB(); /* Flush all pending memory write invalidates. */ 6211 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6212 6213 if (i < __kmp_threads_capacity) { 6214 #if KMP_USE_MONITOR 6215 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6216 KMP_MB(); /* Flush all pending memory write invalidates. */ 6217 6218 // Need to check that monitor was initialized before reaping it. If we are 6219 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6220 // __kmp_monitor will appear to contain valid data, but it is only valid in 6221 // the parent process, not the child. 6222 // New behavior (201008): instead of keying off of the flag 6223 // __kmp_init_parallel, the monitor thread creation is keyed off 6224 // of the new flag __kmp_init_monitor. 6225 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6226 if (TCR_4(__kmp_init_monitor)) { 6227 __kmp_reap_monitor(&__kmp_monitor); 6228 TCW_4(__kmp_init_monitor, 0); 6229 } 6230 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6231 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6232 #endif // KMP_USE_MONITOR 6233 } else { 6234 /* TODO move this to cleanup code */ 6235 #ifdef KMP_DEBUG 6236 /* make sure that everything has properly ended */ 6237 for (i = 0; i < __kmp_threads_capacity; i++) { 6238 if (__kmp_root[i]) { 6239 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6240 // there can be uber threads alive here 6241 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6242 } 6243 } 6244 #endif 6245 6246 KMP_MB(); 6247 6248 // Reap the worker threads. 6249 // This is valid for now, but be careful if threads are reaped sooner. 6250 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6251 // Get the next thread from the pool. 6252 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6253 __kmp_thread_pool = thread->th.th_next_pool; 6254 // Reap it. 6255 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6256 thread->th.th_next_pool = NULL; 6257 thread->th.th_in_pool = FALSE; 6258 __kmp_reap_thread(thread, 0); 6259 } 6260 __kmp_thread_pool_insert_pt = NULL; 6261 6262 // Reap teams. 6263 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6264 // Get the next team from the pool. 6265 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6266 __kmp_team_pool = team->t.t_next_pool; 6267 // Reap it. 6268 team->t.t_next_pool = NULL; 6269 __kmp_reap_team(team); 6270 } 6271 6272 __kmp_reap_task_teams(); 6273 6274 #if KMP_OS_UNIX 6275 // Threads that are not reaped should not access any resources since they 6276 // are going to be deallocated soon, so the shutdown sequence should wait 6277 // until all threads either exit the final spin-waiting loop or begin 6278 // sleeping after the given blocktime. 
6279 for (i = 0; i < __kmp_threads_capacity; i++) { 6280 kmp_info_t *thr = __kmp_threads[i]; 6281 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6282 KMP_CPU_PAUSE(); 6283 } 6284 #endif 6285 6286 for (i = 0; i < __kmp_threads_capacity; ++i) { 6287 // TBD: Add some checking... 6288 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6289 } 6290 6291 /* Make sure all threadprivate destructors get run by joining with all 6292 worker threads before resetting this flag */ 6293 TCW_SYNC_4(__kmp_init_common, FALSE); 6294 6295 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6296 KMP_MB(); 6297 6298 #if KMP_USE_MONITOR 6299 // See note above: One of the possible fixes for CQ138434 / CQ140126 6300 // 6301 // FIXME: push both code fragments down and CSE them? 6302 // push them into __kmp_cleanup() ? 6303 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6304 if (TCR_4(__kmp_init_monitor)) { 6305 __kmp_reap_monitor(&__kmp_monitor); 6306 TCW_4(__kmp_init_monitor, 0); 6307 } 6308 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6309 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6310 #endif 6311 } /* else !__kmp_global.t_active */ 6312 TCW_4(__kmp_init_gtid, FALSE); 6313 KMP_MB(); /* Flush all pending memory write invalidates. */ 6314 6315 __kmp_cleanup(); 6316 #if OMPT_SUPPORT 6317 ompt_fini(); 6318 #endif 6319 } 6320 6321 void __kmp_internal_end_library(int gtid_req) { 6322 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6323 /* this shouldn't be a race condition because __kmp_internal_end() is the 6324 only place to clear __kmp_serial_init */ 6325 /* we'll check this later too, after we get the lock */ 6326 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6327 // redundant, because the next check will work in any case. 6328 if (__kmp_global.g.g_abort) { 6329 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6330 /* TODO abort? */ 6331 return; 6332 } 6333 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6334 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6335 return; 6336 } 6337 6338 // If hidden helper team has been initialized, we need to deinit it 6339 if (TCR_4(__kmp_init_hidden_helper) && 6340 !TCR_4(__kmp_hidden_helper_team_done)) { 6341 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6342 // First release the main thread to let it continue its work 6343 __kmp_hidden_helper_main_thread_release(); 6344 // Wait until the hidden helper team has been destroyed 6345 __kmp_hidden_helper_threads_deinitz_wait(); 6346 } 6347 6348 KMP_MB(); /* Flush all pending memory write invalidates. */ 6349 /* find out who we are and what we should do */ 6350 { 6351 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6352 KA_TRACE( 6353 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6354 if (gtid == KMP_GTID_SHUTDOWN) { 6355 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6356 "already shutdown\n")); 6357 return; 6358 } else if (gtid == KMP_GTID_MONITOR) { 6359 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6360 "registered, or system shutdown\n")); 6361 return; 6362 } else if (gtid == KMP_GTID_DNE) { 6363 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6364 "shutdown\n")); 6365 /* we don't know who we are, but we may still shutdown the library */ 6366 } else if (KMP_UBER_GTID(gtid)) { 6367 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6368 if (__kmp_root[gtid]->r.r_active) { 6369 __kmp_global.g.g_abort = -1; 6370 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6371 __kmp_unregister_library(); 6372 KA_TRACE(10, 6373 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6374 gtid)); 6375 return; 6376 } else { 6377 __kmp_itthash_clean(__kmp_threads[gtid]); 6378 KA_TRACE( 6379 10, 6380 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6381 __kmp_unregister_root_current_thread(gtid); 6382 } 6383 } else { 6384 /* worker threads may call this function through the atexit handler, if they 6385 * call exit() */ 6386 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6387 TODO: do a thorough shutdown instead */ 6388 #ifdef DUMP_DEBUG_ON_EXIT 6389 if (__kmp_debug_buf) 6390 __kmp_dump_debug_buffer(); 6391 #endif 6392 // added unregister library call here when we switch to shm linux 6393 // if we don't, it will leave lots of files in /dev/shm 6394 // cleanup shared memory file before exiting. 6395 __kmp_unregister_library(); 6396 return; 6397 } 6398 } 6399 /* synchronize the termination process */ 6400 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6401 6402 /* have we already finished */ 6403 if (__kmp_global.g.g_abort) { 6404 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6405 /* TODO abort? */ 6406 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6407 return; 6408 } 6409 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6410 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6411 return; 6412 } 6413 6414 /* We need this lock to enforce mutex between this reading of 6415 __kmp_threads_capacity and the writing by __kmp_register_root. 6416 Alternatively, we can use a counter of roots that is atomically updated by 6417 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6418 __kmp_internal_end_*. */ 6419 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6420 6421 /* now we can safely conduct the actual termination */ 6422 __kmp_internal_end(); 6423 6424 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6425 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6426 6427 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6428 6429 #ifdef DUMP_DEBUG_ON_EXIT 6430 if (__kmp_debug_buf) 6431 __kmp_dump_debug_buffer(); 6432 #endif 6433 6434 #if KMP_OS_WINDOWS 6435 __kmp_close_console(); 6436 #endif 6437 6438 __kmp_fini_allocator(); 6439 6440 } // __kmp_internal_end_library 6441 6442 void __kmp_internal_end_thread(int gtid_req) { 6443 int i; 6444 6445 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6446 /* this shouldn't be a race condition because __kmp_internal_end() is the 6447 * only place to clear __kmp_serial_init */ 6448 /* we'll check this later too, after we get the lock */ 6449 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6450 // redundant, because the next check will work in any case. 6451 if (__kmp_global.g.g_abort) { 6452 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6453 /* TODO abort? 
*/ 6454 return; 6455 } 6456 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6457 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6458 return; 6459 } 6460 6461 // If hidden helper team has been initialized, we need to deinit it 6462 if (TCR_4(__kmp_init_hidden_helper) && 6463 !TCR_4(__kmp_hidden_helper_team_done)) { 6464 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6465 // First release the main thread to let it continue its work 6466 __kmp_hidden_helper_main_thread_release(); 6467 // Wait until the hidden helper team has been destroyed 6468 __kmp_hidden_helper_threads_deinitz_wait(); 6469 } 6470 6471 KMP_MB(); /* Flush all pending memory write invalidates. */ 6472 6473 /* find out who we are and what we should do */ 6474 { 6475 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6476 KA_TRACE(10, 6477 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6478 if (gtid == KMP_GTID_SHUTDOWN) { 6479 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6480 "already shutdown\n")); 6481 return; 6482 } else if (gtid == KMP_GTID_MONITOR) { 6483 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6484 "registered, or system shutdown\n")); 6485 return; 6486 } else if (gtid == KMP_GTID_DNE) { 6487 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6488 "shutdown\n")); 6489 return; 6490 /* we don't know who we are */ 6491 } else if (KMP_UBER_GTID(gtid)) { 6492 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6493 if (__kmp_root[gtid]->r.r_active) { 6494 __kmp_global.g.g_abort = -1; 6495 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6496 KA_TRACE(10, 6497 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6498 gtid)); 6499 return; 6500 } else { 6501 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6502 gtid)); 6503 __kmp_unregister_root_current_thread(gtid); 6504 } 6505 } else { 6506 /* just a worker thread, let's leave */ 6507 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6508 6509 if (gtid >= 0) { 6510 __kmp_threads[gtid]->th.th_task_team = NULL; 6511 } 6512 6513 KA_TRACE(10, 6514 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6515 gtid)); 6516 return; 6517 } 6518 } 6519 #if KMP_DYNAMIC_LIB 6520 if (__kmp_pause_status != kmp_hard_paused) 6521 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6522 // because we will better shutdown later in the library destructor. 6523 { 6524 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6525 return; 6526 } 6527 #endif 6528 /* synchronize the termination process */ 6529 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6530 6531 /* have we already finished */ 6532 if (__kmp_global.g.g_abort) { 6533 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6534 /* TODO abort? */ 6535 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6536 return; 6537 } 6538 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6539 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6540 return; 6541 } 6542 6543 /* We need this lock to enforce mutex between this reading of 6544 __kmp_threads_capacity and the writing by __kmp_register_root. 6545 Alternatively, we can use a counter of roots that is atomically updated by 6546 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6547 __kmp_internal_end_*. */ 6548 6549 /* should we finish the run-time? are all siblings done? 
*/ 6550 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6551 6552 for (i = 0; i < __kmp_threads_capacity; ++i) { 6553 if (KMP_UBER_GTID(i)) { 6554 KA_TRACE( 6555 10, 6556 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6557 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6558 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6559 return; 6560 } 6561 } 6562 6563 /* now we can safely conduct the actual termination */ 6564 6565 __kmp_internal_end(); 6566 6567 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6568 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6569 6570 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6571 6572 #ifdef DUMP_DEBUG_ON_EXIT 6573 if (__kmp_debug_buf) 6574 __kmp_dump_debug_buffer(); 6575 #endif 6576 } // __kmp_internal_end_thread 6577 6578 // ----------------------------------------------------------------------------- 6579 // Library registration stuff. 6580 6581 static long __kmp_registration_flag = 0; 6582 // Random value used to indicate library initialization. 6583 static char *__kmp_registration_str = NULL; 6584 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6585 6586 static inline char *__kmp_reg_status_name() { 6587 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6588 each thread. If registration and unregistration go in different threads 6589 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6590 env var can not be found, because the name will contain different pid. */ 6591 // macOS* complains about name being too long with additional getuid() 6592 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6593 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6594 (int)getuid()); 6595 #else 6596 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6597 #endif 6598 } // __kmp_reg_status_get 6599 6600 void __kmp_register_library_startup(void) { 6601 6602 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6603 int done = 0; 6604 union { 6605 double dtime; 6606 long ltime; 6607 } time; 6608 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6609 __kmp_initialize_system_tick(); 6610 #endif 6611 __kmp_read_system_time(&time.dtime); 6612 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6613 __kmp_registration_str = 6614 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6615 __kmp_registration_flag, KMP_LIBRARY_FILE); 6616 6617 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6618 __kmp_registration_str)); 6619 6620 while (!done) { 6621 6622 char *value = NULL; // Actual value of the environment variable. 6623 6624 #if defined(KMP_USE_SHM) 6625 char *shm_name = __kmp_str_format("/%s", name); 6626 int shm_preexist = 0; 6627 char *data1; 6628 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6629 if ((fd1 == -1) && (errno == EEXIST)) { 6630 // file didn't open because it already exists. 6631 // try opening existing file 6632 fd1 = shm_open(shm_name, O_RDWR, 0666); 6633 if (fd1 == -1) { // file didn't open 6634 // error out here 6635 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6636 __kmp_msg_null); 6637 } else { 6638 // able to open existing file 6639 shm_preexist = 1; 6640 } 6641 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6642 // already exists. 6643 // error out here. 
      __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno),
                  __kmp_msg_null);
    }
    if (shm_preexist == 0) {
      // we created the SHM; now set its size
      if (ftruncate(fd1, SHM_SIZE) == -1) {
        // error occurred setting size
        __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"),
                    KMP_ERR(errno), __kmp_msg_null);
      }
    }
    data1 =
        (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0);
    if (data1 == MAP_FAILED) {
      // failed to map shared memory
      __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno),
                  __kmp_msg_null);
    }
    if (shm_preexist == 0) { // set data to SHM, set value
      KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str);
    }
    // Read value from either what we just wrote or the existing file.
    value = __kmp_str_format("%s", data1); // read value from SHM
    munmap(data1, SHM_SIZE);
    close(fd1);
#else // Windows and unix with static library
    // Set the environment variable, but do not overwrite it if it already
    // exists.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // read the value back to see if it got set
    value = __kmp_env_get(name);
#endif

    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
      done = 1; // Ok, environment variable set successfully, exit the loop.
    } else {
      // Oops. The write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
      char *tail = value;
      char *flag_addr_str = NULL;
      char *flag_val_str = NULL;
      char const *file_name = NULL;
      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
      __kmp_str_split(tail, '-', &flag_val_str, &tail);
      file_name = tail;
      if (tail != NULL) {
        unsigned long *flag_addr = 0;
        unsigned long flag_val = 0;
        KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
          // First, check whether the environment-encoded address is mapped
          // into the address space.
          // If so, dereference it to see if it still has the right value.
          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
            neighbor = 1;
          } else {
            // If not, then we know the other copy of the library is no longer
            // running.
            neighbor = 2;
          }
        }
      }
      switch (neighbor) {
      case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library. Assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
        KMP_FALLTHROUGH();
      // Attention! Falling through to the next case. That's intentional.
      case 1: { // Neighbor is alive.
        // Check whether that is allowed.
        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
        if (!__kmp_str_match_true(duplicate_ok)) {
          // That's not allowed. Issue a fatal error.
          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
        }
        KMP_INTERNAL_FREE(duplicate_ok);
        __kmp_duplicate_library_ok = 1;
        done = 1; // Exit the loop.
      } break;
      case 2: { // Neighbor is dead.

#if defined(KMP_USE_SHM)
        // close shared memory.
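        // The registered value has the form
        // "<flag-address>-<flag-value>-<library-file>" (it was written with
        // "%p-%lx-%s" above), e.g., hypothetically,
        // "0x7f1234567890-cafe1234-libomp.so". Reaching this case means the
        // flag address is no longer mapped with the expected value, so the
        // previous owner exited (or was killed) without unregistering; remove
        // the stale registration below and retry on the next loop iteration.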
6731 shm_unlink(shm_name); // this removes file in /dev/shm 6732 #else 6733 // Clear the variable and try to register library again. 6734 __kmp_env_unset(name); 6735 #endif 6736 } break; 6737 default: { 6738 KMP_DEBUG_ASSERT(0); 6739 } break; 6740 } 6741 } 6742 KMP_INTERNAL_FREE((void *)value); 6743 #if defined(KMP_USE_SHM) 6744 KMP_INTERNAL_FREE((void *)shm_name); 6745 #endif 6746 } // while 6747 KMP_INTERNAL_FREE((void *)name); 6748 6749 } // func __kmp_register_library_startup 6750 6751 void __kmp_unregister_library(void) { 6752 6753 char *name = __kmp_reg_status_name(); 6754 char *value = NULL; 6755 6756 #if defined(KMP_USE_SHM) 6757 char *shm_name = __kmp_str_format("/%s", name); 6758 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6759 if (fd1 == -1) { 6760 // file did not open. return. 6761 return; 6762 } 6763 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6764 if (data1 != MAP_FAILED) { 6765 value = __kmp_str_format("%s", data1); // read value from SHM 6766 munmap(data1, SHM_SIZE); 6767 } 6768 close(fd1); 6769 #else 6770 value = __kmp_env_get(name); 6771 #endif 6772 6773 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6774 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6775 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6776 // Ok, this is our variable. Delete it. 6777 #if defined(KMP_USE_SHM) 6778 shm_unlink(shm_name); // this removes file in /dev/shm 6779 #else 6780 __kmp_env_unset(name); 6781 #endif 6782 } 6783 6784 #if defined(KMP_USE_SHM) 6785 KMP_INTERNAL_FREE(shm_name); 6786 #endif 6787 6788 KMP_INTERNAL_FREE(__kmp_registration_str); 6789 KMP_INTERNAL_FREE(value); 6790 KMP_INTERNAL_FREE(name); 6791 6792 __kmp_registration_flag = 0; 6793 __kmp_registration_str = NULL; 6794 6795 } // __kmp_unregister_library 6796 6797 // End of Library registration stuff. 6798 // ----------------------------------------------------------------------------- 6799 6800 #if KMP_MIC_SUPPORTED 6801 6802 static void __kmp_check_mic_type() { 6803 kmp_cpuid_t cpuid_state = {0}; 6804 kmp_cpuid_t *cs_p = &cpuid_state; 6805 __kmp_x86_cpuid(1, 0, cs_p); 6806 // We don't support mic1 at the moment 6807 if ((cs_p->eax & 0xff0) == 0xB10) { 6808 __kmp_mic_type = mic2; 6809 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6810 __kmp_mic_type = mic3; 6811 } else { 6812 __kmp_mic_type = non_mic; 6813 } 6814 } 6815 6816 #endif /* KMP_MIC_SUPPORTED */ 6817 6818 #if KMP_HAVE_UMWAIT 6819 static void __kmp_user_level_mwait_init() { 6820 struct kmp_cpuid buf; 6821 __kmp_x86_cpuid(7, 0, &buf); 6822 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6823 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6824 __kmp_umwait_enabled)); 6825 } 6826 #elif KMP_HAVE_MWAIT 6827 #ifndef AT_INTELPHIUSERMWAIT 6828 // Spurious, non-existent value that should always fail to return anything. 6829 // Will be replaced with the correct value when we know that. 6830 #define AT_INTELPHIUSERMWAIT 10000 6831 #endif 6832 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6833 // earlier OS is used to build the RTL, we'll use the following internal 6834 // function when the entry is not found. 6835 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6836 unsigned long getauxval(unsigned long) { return 0; } 6837 6838 static void __kmp_user_level_mwait_init() { 6839 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6840 // use them to find if the user-level mwait is enabled. 
Otherwise, forcibly 6841 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6842 // KMP_USER_LEVEL_MWAIT was set to TRUE. 6843 if (__kmp_mic_type == mic3) { 6844 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6845 if ((res & 0x1) || __kmp_user_level_mwait) { 6846 __kmp_mwait_enabled = TRUE; 6847 if (__kmp_user_level_mwait) { 6848 KMP_INFORM(EnvMwaitWarn); 6849 } 6850 } else { 6851 __kmp_mwait_enabled = FALSE; 6852 } 6853 } 6854 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6855 "__kmp_mwait_enabled = %d\n", 6856 __kmp_mic_type, __kmp_mwait_enabled)); 6857 } 6858 #endif /* KMP_HAVE_UMWAIT */ 6859 6860 static void __kmp_do_serial_initialize(void) { 6861 int i, gtid; 6862 size_t size; 6863 6864 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6865 6866 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6867 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6868 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6869 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6870 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6871 6872 #if OMPT_SUPPORT 6873 ompt_pre_init(); 6874 #endif 6875 #if OMPD_SUPPORT 6876 __kmp_env_dump(); 6877 ompd_init(); 6878 #endif 6879 6880 __kmp_validate_locks(); 6881 6882 /* Initialize internal memory allocator */ 6883 __kmp_init_allocator(); 6884 6885 /* Register the library startup via an environment variable and check to see 6886 whether another copy of the library is already registered. */ 6887 6888 __kmp_register_library_startup(); 6889 6890 /* TODO reinitialization of library */ 6891 if (TCR_4(__kmp_global.g.g_done)) { 6892 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6893 } 6894 6895 __kmp_global.g.g_abort = 0; 6896 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6897 6898 /* initialize the locks */ 6899 #if KMP_USE_ADAPTIVE_LOCKS 6900 #if KMP_DEBUG_ADAPTIVE_LOCKS 6901 __kmp_init_speculative_stats(); 6902 #endif 6903 #endif 6904 #if KMP_STATS_ENABLED 6905 __kmp_stats_init(); 6906 #endif 6907 __kmp_init_lock(&__kmp_global_lock); 6908 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6909 __kmp_init_lock(&__kmp_debug_lock); 6910 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6911 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6912 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6913 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6914 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6915 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6916 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6917 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6918 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6919 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6920 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6921 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6922 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6923 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6924 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6925 #if KMP_USE_MONITOR 6926 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6927 #endif 6928 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6929 6930 /* conduct initialization and initial setup of configuration */ 6931 6932 __kmp_runtime_initialize(); 6933 6934 #if KMP_MIC_SUPPORTED 6935 __kmp_check_mic_type(); 6936 #endif 6937 6938 // Some global variable initialization moved here from kmp_env_initialize() 6939 #ifdef KMP_DEBUG 6940 kmp_diag = 0; 6941 #endif 6942 __kmp_abort_delay = 0; 6943 6944 // From __kmp_init_dflt_team_nth() 6945 /* assume the entire machine will be used */ 6946 __kmp_dflt_team_nth_ub = 
__kmp_xproc; 6947 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6948 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6949 } 6950 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6951 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6952 } 6953 __kmp_max_nth = __kmp_sys_max_nth; 6954 __kmp_cg_max_nth = __kmp_sys_max_nth; 6955 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6956 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6957 __kmp_teams_max_nth = __kmp_sys_max_nth; 6958 } 6959 6960 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6961 // part 6962 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6963 #if KMP_USE_MONITOR 6964 __kmp_monitor_wakeups = 6965 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6966 __kmp_bt_intervals = 6967 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6968 #endif 6969 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6970 __kmp_library = library_throughput; 6971 // From KMP_SCHEDULE initialization 6972 __kmp_static = kmp_sch_static_balanced; 6973 // AC: do not use analytical here, because it is non-monotonous 6974 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6975 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6976 // need to repeat assignment 6977 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6978 // bit control and barrier method control parts 6979 #if KMP_FAST_REDUCTION_BARRIER 6980 #define kmp_reduction_barrier_gather_bb ((int)1) 6981 #define kmp_reduction_barrier_release_bb ((int)1) 6982 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt 6983 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt 6984 #endif // KMP_FAST_REDUCTION_BARRIER 6985 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6986 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6987 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6988 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6989 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6990 #if KMP_FAST_REDUCTION_BARRIER 6991 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6992 // lin_64 ): hyper,1 6993 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6994 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6995 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6996 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6997 } 6998 #endif // KMP_FAST_REDUCTION_BARRIER 6999 } 7000 #if KMP_FAST_REDUCTION_BARRIER 7001 #undef kmp_reduction_barrier_release_pat 7002 #undef kmp_reduction_barrier_gather_pat 7003 #undef kmp_reduction_barrier_release_bb 7004 #undef kmp_reduction_barrier_gather_bb 7005 #endif // KMP_FAST_REDUCTION_BARRIER 7006 #if KMP_MIC_SUPPORTED 7007 if (__kmp_mic_type == mic2) { // KNC 7008 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 7009 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 7010 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 7011 1; // forkjoin release 7012 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7013 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 7014 } 7015 #if KMP_FAST_REDUCTION_BARRIER 7016 if (__kmp_mic_type == mic2) { // KNC 7017 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7018 
__kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 7019 } 7020 #endif // KMP_FAST_REDUCTION_BARRIER 7021 #endif // KMP_MIC_SUPPORTED 7022 7023 // From KMP_CHECKS initialization 7024 #ifdef KMP_DEBUG 7025 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 7026 #else 7027 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 7028 #endif 7029 7030 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 7031 __kmp_foreign_tp = TRUE; 7032 7033 __kmp_global.g.g_dynamic = FALSE; 7034 __kmp_global.g.g_dynamic_mode = dynamic_default; 7035 7036 __kmp_init_nesting_mode(); 7037 7038 __kmp_env_initialize(NULL); 7039 7040 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 7041 __kmp_user_level_mwait_init(); 7042 #endif 7043 // Print all messages in message catalog for testing purposes. 7044 #ifdef KMP_DEBUG 7045 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 7046 if (__kmp_str_match_true(val)) { 7047 kmp_str_buf_t buffer; 7048 __kmp_str_buf_init(&buffer); 7049 __kmp_i18n_dump_catalog(&buffer); 7050 __kmp_printf("%s", buffer.str); 7051 __kmp_str_buf_free(&buffer); 7052 } 7053 __kmp_env_free(&val); 7054 #endif 7055 7056 __kmp_threads_capacity = 7057 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 7058 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 7059 __kmp_tp_capacity = __kmp_default_tp_capacity( 7060 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 7061 7062 // If the library is shut down properly, both pools must be NULL. Just in 7063 // case, set them to NULL -- some memory may leak, but subsequent code will 7064 // work even if pools are not freed. 7065 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 7066 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 7067 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 7068 __kmp_thread_pool = NULL; 7069 __kmp_thread_pool_insert_pt = NULL; 7070 __kmp_team_pool = NULL; 7071 7072 /* Allocate all of the variable sized records */ 7073 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 7074 * expandable */ 7075 /* Since allocation is cache-aligned, just add extra padding at the end */ 7076 size = 7077 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 7078 CACHE_LINE; 7079 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 7080 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 7081 sizeof(kmp_info_t *) * __kmp_threads_capacity); 7082 7083 /* init thread counts */ 7084 KMP_DEBUG_ASSERT(__kmp_all_nth == 7085 0); // Asserts fail if the library is reinitializing and 7086 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 7087 __kmp_all_nth = 0; 7088 __kmp_nth = 0; 7089 7090 /* setup the uber master thread and hierarchy */ 7091 gtid = __kmp_register_root(TRUE); 7092 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 7093 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7094 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 7095 7096 KMP_MB(); /* Flush all pending memory write invalidates. */ 7097 7098 __kmp_common_initialize(); 7099 7100 #if KMP_OS_UNIX 7101 /* invoke the child fork handler */ 7102 __kmp_register_atfork(); 7103 #endif 7104 7105 #if !KMP_DYNAMIC_LIB 7106 { 7107 /* Invoke the exit handler when the program finishes, only for static 7108 library. For dynamic library, we already have _fini and DllMain. 
*/ 7109 int rc = atexit(__kmp_internal_end_atexit); 7110 if (rc != 0) { 7111 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 7112 __kmp_msg_null); 7113 } 7114 } 7115 #endif 7116 7117 #if KMP_HANDLE_SIGNALS 7118 #if KMP_OS_UNIX 7119 /* NOTE: make sure that this is called before the user installs their own 7120 signal handlers so that the user handlers are called first. this way they 7121 can return false, not call our handler, avoid terminating the library, and 7122 continue execution where they left off. */ 7123 __kmp_install_signals(FALSE); 7124 #endif /* KMP_OS_UNIX */ 7125 #if KMP_OS_WINDOWS 7126 __kmp_install_signals(TRUE); 7127 #endif /* KMP_OS_WINDOWS */ 7128 #endif 7129 7130 /* we have finished the serial initialization */ 7131 __kmp_init_counter++; 7132 7133 __kmp_init_serial = TRUE; 7134 7135 if (__kmp_settings) { 7136 __kmp_env_print(); 7137 } 7138 7139 if (__kmp_display_env || __kmp_display_env_verbose) { 7140 __kmp_env_print_2(); 7141 } 7142 7143 #if OMPT_SUPPORT 7144 ompt_post_init(); 7145 #endif 7146 7147 KMP_MB(); 7148 7149 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7150 } 7151 7152 void __kmp_serial_initialize(void) { 7153 if (__kmp_init_serial) { 7154 return; 7155 } 7156 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7157 if (__kmp_init_serial) { 7158 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7159 return; 7160 } 7161 __kmp_do_serial_initialize(); 7162 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7163 } 7164 7165 static void __kmp_do_middle_initialize(void) { 7166 int i, j; 7167 int prev_dflt_team_nth; 7168 7169 if (!__kmp_init_serial) { 7170 __kmp_do_serial_initialize(); 7171 } 7172 7173 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7174 7175 // Save the previous value for the __kmp_dflt_team_nth so that 7176 // we can avoid some reinitialization if it hasn't changed. 7177 prev_dflt_team_nth = __kmp_dflt_team_nth; 7178 7179 #if KMP_AFFINITY_SUPPORTED 7180 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7181 // number of cores on the machine. 7182 __kmp_affinity_initialize(); 7183 7184 #endif /* KMP_AFFINITY_SUPPORTED */ 7185 7186 KMP_ASSERT(__kmp_xproc > 0); 7187 if (__kmp_avail_proc == 0) { 7188 __kmp_avail_proc = __kmp_xproc; 7189 } 7190 7191 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7192 // correct them now 7193 j = 0; 7194 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7195 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7196 __kmp_avail_proc; 7197 j++; 7198 } 7199 7200 if (__kmp_dflt_team_nth == 0) { 7201 #ifdef KMP_DFLT_NTH_CORES 7202 // Default #threads = #cores 7203 __kmp_dflt_team_nth = __kmp_ncores; 7204 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7205 "__kmp_ncores (%d)\n", 7206 __kmp_dflt_team_nth)); 7207 #else 7208 // Default #threads = #available OS procs 7209 __kmp_dflt_team_nth = __kmp_avail_proc; 7210 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7211 "__kmp_avail_proc(%d)\n", 7212 __kmp_dflt_team_nth)); 7213 #endif /* KMP_DFLT_NTH_CORES */ 7214 } 7215 7216 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7217 __kmp_dflt_team_nth = KMP_MIN_NTH; 7218 } 7219 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7220 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7221 } 7222 7223 if (__kmp_nesting_mode > 0) 7224 __kmp_set_nesting_mode_threads(); 7225 7226 // There's no harm in continuing if the following check fails, 7227 // but it indicates an error in the previous logic. 
7228 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7229 7230 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7231 // Run through the __kmp_threads array and set the num threads icv for each 7232 // root thread that is currently registered with the RTL (which has not 7233 // already explicitly set its nthreads-var with a call to 7234 // omp_set_num_threads()). 7235 for (i = 0; i < __kmp_threads_capacity; i++) { 7236 kmp_info_t *thread = __kmp_threads[i]; 7237 if (thread == NULL) 7238 continue; 7239 if (thread->th.th_current_task->td_icvs.nproc != 0) 7240 continue; 7241 7242 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7243 } 7244 } 7245 KA_TRACE( 7246 20, 7247 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7248 __kmp_dflt_team_nth)); 7249 7250 #ifdef KMP_ADJUST_BLOCKTIME 7251 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7252 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7253 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7254 if (__kmp_nth > __kmp_avail_proc) { 7255 __kmp_zero_bt = TRUE; 7256 } 7257 } 7258 #endif /* KMP_ADJUST_BLOCKTIME */ 7259 7260 /* we have finished middle initialization */ 7261 TCW_SYNC_4(__kmp_init_middle, TRUE); 7262 7263 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7264 } 7265 7266 void __kmp_middle_initialize(void) { 7267 if (__kmp_init_middle) { 7268 return; 7269 } 7270 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7271 if (__kmp_init_middle) { 7272 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7273 return; 7274 } 7275 __kmp_do_middle_initialize(); 7276 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7277 } 7278 7279 void __kmp_parallel_initialize(void) { 7280 int gtid = __kmp_entry_gtid(); // this might be a new root 7281 7282 /* synchronize parallel initialization (for sibling) */ 7283 if (TCR_4(__kmp_init_parallel)) 7284 return; 7285 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7286 if (TCR_4(__kmp_init_parallel)) { 7287 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7288 return; 7289 } 7290 7291 /* TODO reinitialization after we have already shut down */ 7292 if (TCR_4(__kmp_global.g.g_done)) { 7293 KA_TRACE( 7294 10, 7295 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7296 __kmp_infinite_loop(); 7297 } 7298 7299 /* jc: The lock __kmp_initz_lock is already held, so calling 7300 __kmp_serial_initialize would cause a deadlock. So we call 7301 __kmp_do_serial_initialize directly. */ 7302 if (!__kmp_init_middle) { 7303 __kmp_do_middle_initialize(); 7304 } 7305 __kmp_assign_root_init_mask(); 7306 __kmp_resume_if_hard_paused(); 7307 7308 /* begin initialization */ 7309 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7310 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7311 7312 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7313 // Save the FP control regs. 7314 // Worker threads will set theirs to these values at thread startup. 
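  // Because the initial thread's state is captured here exactly once, FP
  // control settings that the program establishes before its first parallel
  // region are what the workers will start with. Minimal user-level sketch
  // (illustrative only; assumes x86 and that the call runs before the first
  // parallel region -- it is not part of the runtime itself):
  //
  //   #include <xmmintrin.h> // _MM_SET_FLUSH_ZERO_MODE
  //   int main() {
  //     _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // enable FTZ up front
  //   #pragma omp parallel
  //     { /* workers start with the same MXCSR control bits */ }
  //     return 0;
  //   }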
7315 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7316 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7317 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7318 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7319 7320 #if KMP_OS_UNIX 7321 #if KMP_HANDLE_SIGNALS 7322 /* must be after __kmp_serial_initialize */ 7323 __kmp_install_signals(TRUE); 7324 #endif 7325 #endif 7326 7327 __kmp_suspend_initialize(); 7328 7329 #if defined(USE_LOAD_BALANCE) 7330 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7331 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7332 } 7333 #else 7334 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7335 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7336 } 7337 #endif 7338 7339 if (__kmp_version) { 7340 __kmp_print_version_2(); 7341 } 7342 7343 /* we have finished parallel initialization */ 7344 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7345 7346 KMP_MB(); 7347 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7348 7349 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7350 } 7351 7352 void __kmp_hidden_helper_initialize() { 7353 if (TCR_4(__kmp_init_hidden_helper)) 7354 return; 7355 7356 // __kmp_parallel_initialize is required before we initialize hidden helper 7357 if (!TCR_4(__kmp_init_parallel)) 7358 __kmp_parallel_initialize(); 7359 7360 // Double check. Note that this double check should not be placed before 7361 // __kmp_parallel_initialize as it will cause dead lock. 7362 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7363 if (TCR_4(__kmp_init_hidden_helper)) { 7364 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7365 return; 7366 } 7367 7368 // Set the count of hidden helper tasks to be executed to zero 7369 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7370 7371 // Set the global variable indicating that we're initializing hidden helper 7372 // team/threads 7373 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7374 7375 // Platform independent initialization 7376 __kmp_do_initialize_hidden_helper_threads(); 7377 7378 // Wait here for the finish of initialization of hidden helper teams 7379 __kmp_hidden_helper_threads_initz_wait(); 7380 7381 // We have finished hidden helper initialization 7382 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7383 7384 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7385 } 7386 7387 /* ------------------------------------------------------------------------ */ 7388 7389 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7390 kmp_team_t *team) { 7391 kmp_disp_t *dispatch; 7392 7393 KMP_MB(); 7394 7395 /* none of the threads have encountered any constructs, yet. */ 7396 this_thr->th.th_local.this_construct = 0; 7397 #if KMP_CACHE_MANAGE 7398 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7399 #endif /* KMP_CACHE_MANAGE */ 7400 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7401 KMP_DEBUG_ASSERT(dispatch); 7402 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7403 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7404 // this_thr->th.th_info.ds.ds_tid ] ); 7405 7406 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7407 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7408 if (__kmp_env_consistency_check) 7409 __kmp_push_parallel(gtid, team->t.t_ident); 7410 7411 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7412 } 7413 7414 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7415 kmp_team_t *team) { 7416 if (__kmp_env_consistency_check) 7417 __kmp_pop_parallel(gtid, team->t.t_ident); 7418 7419 __kmp_finish_implicit_task(this_thr); 7420 } 7421 7422 int __kmp_invoke_task_func(int gtid) { 7423 int rc; 7424 int tid = __kmp_tid_from_gtid(gtid); 7425 kmp_info_t *this_thr = __kmp_threads[gtid]; 7426 kmp_team_t *team = this_thr->th.th_team; 7427 7428 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7429 #if USE_ITT_BUILD 7430 if (__itt_stack_caller_create_ptr) { 7431 // inform ittnotify about entering user's code 7432 if (team->t.t_stack_id != NULL) { 7433 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7434 } else { 7435 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7436 __kmp_itt_stack_callee_enter( 7437 (__itt_caller)team->t.t_parent->t.t_stack_id); 7438 } 7439 } 7440 #endif /* USE_ITT_BUILD */ 7441 #if INCLUDE_SSC_MARKS 7442 SSC_MARK_INVOKING(); 7443 #endif 7444 7445 #if OMPT_SUPPORT 7446 void *dummy; 7447 void **exit_frame_p; 7448 ompt_data_t *my_task_data; 7449 ompt_data_t *my_parallel_data; 7450 int ompt_team_size; 7451 7452 if (ompt_enabled.enabled) { 7453 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7454 .ompt_task_info.frame.exit_frame.ptr); 7455 } else { 7456 exit_frame_p = &dummy; 7457 } 7458 7459 my_task_data = 7460 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7461 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7462 if (ompt_enabled.ompt_callback_implicit_task) { 7463 ompt_team_size = team->t.t_nproc; 7464 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7465 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7466 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7467 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7468 } 7469 #endif 7470 7471 #if KMP_STATS_ENABLED 7472 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7473 if (previous_state == stats_state_e::TEAMS_REGION) { 7474 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7475 } else { 7476 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7477 } 7478 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7479 #endif 7480 7481 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7482 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7483 #if OMPT_SUPPORT 7484 , 7485 exit_frame_p 7486 #endif 7487 ); 7488 #if OMPT_SUPPORT 7489 *exit_frame_p = NULL; 7490 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7491 #endif 7492 7493 #if KMP_STATS_ENABLED 7494 if (previous_state == stats_state_e::TEAMS_REGION) { 7495 KMP_SET_THREAD_STATE(previous_state); 7496 } 7497 KMP_POP_PARTITIONED_TIMER(); 7498 #endif 7499 7500 #if USE_ITT_BUILD 7501 if (__itt_stack_caller_create_ptr) { 7502 // inform ittnotify about leaving user's code 7503 if (team->t.t_stack_id != NULL) { 7504 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7505 } else { 7506 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7507 __kmp_itt_stack_callee_leave( 7508 (__itt_caller)team->t.t_parent->t.t_stack_id); 7509 } 7510 } 7511 #endif /* USE_ITT_BUILD */ 7512 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7513 7514 return rc; 7515 } 7516 7517 void __kmp_teams_master(int gtid) { 7518 // This routine is called by all primary threads in teams construct 7519 kmp_info_t *thr = __kmp_threads[gtid]; 7520 kmp_team_t *team = thr->th.th_team; 7521 ident_t *loc = team->t.t_ident; 7522 
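  // th_teams_size was recorded by __kmp_push_num_teams() /
  // __kmp_push_thread_limit() when the teams construct was encountered;
  // pushing its nth value here makes the fork below build this team with that
  // many threads. Illustrative example (hypothetical numbers; assumes >= 32
  // available procs and default ICVs):
  //   #pragma omp teams num_teams(4) thread_limit(8)
  // records th_teams_size = {nteams = 4, nth = 8}, and each of the 4 team
  // primary threads passes through here and forks a team of up to 8 threads.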
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7523 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7524 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7525 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7526 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7527 7528 // This thread is a new CG root. Set up the proper variables. 7529 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7530 tmp->cg_root = thr; // Make thr the CG root 7531 // Init to thread limit stored when league primary threads were forked 7532 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7533 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7534 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7535 " cg_nthreads to 1\n", 7536 thr, tmp)); 7537 tmp->up = thr->th.th_cg_roots; 7538 thr->th.th_cg_roots = tmp; 7539 7540 // Launch league of teams now, but not let workers execute 7541 // (they hang on fork barrier until next parallel) 7542 #if INCLUDE_SSC_MARKS 7543 SSC_MARK_FORKING(); 7544 #endif 7545 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7546 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7547 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7548 #if INCLUDE_SSC_MARKS 7549 SSC_MARK_JOINING(); 7550 #endif 7551 // If the team size was reduced from the limit, set it to the new size 7552 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7553 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7554 // AC: last parameter "1" eliminates join barrier which won't work because 7555 // worker threads are in a fork barrier waiting for more parallel regions 7556 __kmp_join_call(loc, gtid 7557 #if OMPT_SUPPORT 7558 , 7559 fork_context_intel 7560 #endif 7561 , 7562 1); 7563 } 7564 7565 int __kmp_invoke_teams_master(int gtid) { 7566 kmp_info_t *this_thr = __kmp_threads[gtid]; 7567 kmp_team_t *team = this_thr->th.th_team; 7568 #if KMP_DEBUG 7569 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7570 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7571 (void *)__kmp_teams_master); 7572 #endif 7573 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7574 #if OMPT_SUPPORT 7575 int tid = __kmp_tid_from_gtid(gtid); 7576 ompt_data_t *task_data = 7577 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7578 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7579 if (ompt_enabled.ompt_callback_implicit_task) { 7580 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7581 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7582 ompt_task_initial); 7583 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7584 } 7585 #endif 7586 __kmp_teams_master(gtid); 7587 #if OMPT_SUPPORT 7588 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7589 #endif 7590 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7591 return 1; 7592 } 7593 7594 /* this sets the requested number of threads for the next parallel region 7595 encountered by this team. 
since this should be enclosed in the forkjoin 7596 critical section it should avoid race conditions with asymmetrical nested 7597 parallelism */ 7598 7599 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7600 kmp_info_t *thr = __kmp_threads[gtid]; 7601 7602 if (num_threads > 0) 7603 thr->th.th_set_nproc = num_threads; 7604 } 7605 7606 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7607 int num_threads) { 7608 KMP_DEBUG_ASSERT(thr); 7609 // Remember the number of threads for inner parallel regions 7610 if (!TCR_4(__kmp_init_middle)) 7611 __kmp_middle_initialize(); // get internal globals calculated 7612 __kmp_assign_root_init_mask(); 7613 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7614 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7615 7616 if (num_threads == 0) { 7617 if (__kmp_teams_thread_limit > 0) { 7618 num_threads = __kmp_teams_thread_limit; 7619 } else { 7620 num_threads = __kmp_avail_proc / num_teams; 7621 } 7622 // adjust num_threads w/o warning as it is not user setting 7623 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7624 // no thread_limit clause specified - do not change thread-limit-var ICV 7625 if (num_threads > __kmp_dflt_team_nth) { 7626 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7627 } 7628 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7629 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7630 } // prevent team size to exceed thread-limit-var 7631 if (num_teams * num_threads > __kmp_teams_max_nth) { 7632 num_threads = __kmp_teams_max_nth / num_teams; 7633 } 7634 if (num_threads == 0) { 7635 num_threads = 1; 7636 } 7637 } else { 7638 // This thread will be the primary thread of the league primary threads 7639 // Store new thread limit; old limit is saved in th_cg_roots list 7640 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7641 // num_threads = min(num_threads, nthreads-var) 7642 if (num_threads > __kmp_dflt_team_nth) { 7643 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7644 } 7645 if (num_teams * num_threads > __kmp_teams_max_nth) { 7646 int new_threads = __kmp_teams_max_nth / num_teams; 7647 if (new_threads == 0) { 7648 new_threads = 1; 7649 } 7650 if (new_threads != num_threads) { 7651 if (!__kmp_reserve_warn) { // user asked for too many threads 7652 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7653 __kmp_msg(kmp_ms_warning, 7654 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7655 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7656 } 7657 } 7658 num_threads = new_threads; 7659 } 7660 } 7661 thr->th.th_teams_size.nth = num_threads; 7662 } 7663 7664 /* this sets the requested number of teams for the teams region and/or 7665 the number of threads for the next parallel region encountered */ 7666 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7667 int num_threads) { 7668 kmp_info_t *thr = __kmp_threads[gtid]; 7669 KMP_DEBUG_ASSERT(num_teams >= 0); 7670 KMP_DEBUG_ASSERT(num_threads >= 0); 7671 7672 if (num_teams == 0) { 7673 if (__kmp_nteams > 0) { 7674 num_teams = __kmp_nteams; 7675 } else { 7676 num_teams = 1; // default number of teams is 1. 7677 } 7678 } 7679 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
    if (!__kmp_reserve_warn) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    num_teams = __kmp_teams_max_nth;
  }
  // Set number of teams (number of threads in the outer "parallel" of the
  // teams)
  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;

  __kmp_push_thread_limit(thr, num_teams, num_threads);
}

/* This sets the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered */
void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb,
                             int num_teams_ub, int num_threads) {
  kmp_info_t *thr = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0);
  KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb);
  KMP_DEBUG_ASSERT(num_threads >= 0);

  if (num_teams_lb > num_teams_ub) {
    __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub),
                KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null);
  }

  int num_teams = 1; // default number of teams is 1.

  if (num_teams_lb == 0 && num_teams_ub > 0)
    num_teams_lb = num_teams_ub;

  if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause
    num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams;
    if (num_teams > __kmp_teams_max_nth) {
      if (!__kmp_reserve_warn) {
        __kmp_reserve_warn = 1;
        __kmp_msg(kmp_ms_warning,
                  KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
      }
      num_teams = __kmp_teams_max_nth;
    }
  } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams
    num_teams = num_teams_ub;
  } else { // num_teams_lb <= num_teams <= num_teams_ub
    if (num_threads == 0) {
      if (num_teams_ub > __kmp_teams_max_nth) {
        num_teams = num_teams_lb;
      } else {
        num_teams = num_teams_ub;
      }
    } else {
      num_teams = (num_threads > __kmp_teams_max_nth)
                      ? num_teams
                      : __kmp_teams_max_nth / num_threads;
      if (num_teams < num_teams_lb) {
        num_teams = num_teams_lb;
      } else if (num_teams > num_teams_ub) {
        num_teams = num_teams_ub;
      }
    }
  }
  // Set number of teams (number of threads in the outer "parallel" of the
  // teams)
  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;

  __kmp_push_thread_limit(thr, num_teams, num_threads);
}

// Set the proc_bind var to use in the following parallel region.
void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
  kmp_info_t *thr = __kmp_threads[gtid];
  thr->th.th_set_proc_bind = proc_bind;
}

/* Launch the worker threads into the microtask. */

void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) {
  kmp_info_t *this_thr = __kmp_threads[gtid];

#ifdef KMP_DEBUG
  int f;
#endif /* KMP_DEBUG */

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(this_thr->th.th_team == team);
  KMP_ASSERT(KMP_MASTER_GTID(gtid));
  KMP_MB(); /* Flush all pending memory write invalidates.
*/ 7771 7772 team->t.t_construct = 0; /* no single directives seen yet */ 7773 team->t.t_ordered.dt.t_value = 7774 0; /* thread 0 enters the ordered section first */ 7775 7776 /* Reset the identifiers on the dispatch buffer */ 7777 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7778 if (team->t.t_max_nproc > 1) { 7779 int i; 7780 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7781 team->t.t_disp_buffer[i].buffer_index = i; 7782 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7783 } 7784 } else { 7785 team->t.t_disp_buffer[0].buffer_index = 0; 7786 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7787 } 7788 7789 KMP_MB(); /* Flush all pending memory write invalidates. */ 7790 KMP_ASSERT(this_thr->th.th_team == team); 7791 7792 #ifdef KMP_DEBUG 7793 for (f = 0; f < team->t.t_nproc; f++) { 7794 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7795 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7796 } 7797 #endif /* KMP_DEBUG */ 7798 7799 /* release the worker threads so they may begin working */ 7800 __kmp_fork_barrier(gtid, 0); 7801 } 7802 7803 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7804 kmp_info_t *this_thr = __kmp_threads[gtid]; 7805 7806 KMP_DEBUG_ASSERT(team); 7807 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7808 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7809 KMP_MB(); /* Flush all pending memory write invalidates. */ 7810 7811 /* Join barrier after fork */ 7812 7813 #ifdef KMP_DEBUG 7814 if (__kmp_threads[gtid] && 7815 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7816 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7817 __kmp_threads[gtid]); 7818 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7819 "team->t.t_nproc=%d\n", 7820 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7821 team->t.t_nproc); 7822 __kmp_print_structure(); 7823 } 7824 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7825 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7826 #endif /* KMP_DEBUG */ 7827 7828 __kmp_join_barrier(gtid); /* wait for everyone */ 7829 #if OMPT_SUPPORT 7830 if (ompt_enabled.enabled && 7831 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7832 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7833 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7834 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7835 #if OMPT_OPTIONAL 7836 void *codeptr = NULL; 7837 if (KMP_MASTER_TID(ds_tid) && 7838 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7839 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7840 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7841 7842 if (ompt_enabled.ompt_callback_sync_region_wait) { 7843 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7844 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7845 codeptr); 7846 } 7847 if (ompt_enabled.ompt_callback_sync_region) { 7848 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7849 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7850 codeptr); 7851 } 7852 #endif 7853 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7854 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7855 ompt_scope_end, NULL, task_data, 0, ds_tid, 7856 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7857 } 7858 } 7859 #endif 7860 7861 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7862 KMP_ASSERT(this_thr->th.th_team == team); 7863 } 7864 7865 /* ------------------------------------------------------------------------ */ 7866 7867 #ifdef USE_LOAD_BALANCE 7868 7869 // Return the worker threads actively spinning in the hot team, if we 7870 // are at the outermost level of parallelism. Otherwise, return 0. 7871 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7872 int i; 7873 int retval; 7874 kmp_team_t *hot_team; 7875 7876 if (root->r.r_active) { 7877 return 0; 7878 } 7879 hot_team = root->r.r_hot_team; 7880 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7881 return hot_team->t.t_nproc - 1; // Don't count primary thread 7882 } 7883 7884 // Skip the primary thread - it is accounted for elsewhere. 7885 retval = 0; 7886 for (i = 1; i < hot_team->t.t_nproc; i++) { 7887 if (hot_team->t.t_threads[i]->th.th_active) { 7888 retval++; 7889 } 7890 } 7891 return retval; 7892 } 7893 7894 // Perform an automatic adjustment to the number of 7895 // threads used by the next parallel region. 7896 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7897 int retval; 7898 int pool_active; 7899 int hot_team_active; 7900 int team_curr_active; 7901 int system_active; 7902 7903 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7904 set_nproc)); 7905 KMP_DEBUG_ASSERT(root); 7906 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7907 ->th.th_current_task->td_icvs.dynamic == TRUE); 7908 KMP_DEBUG_ASSERT(set_nproc > 1); 7909 7910 if (set_nproc == 1) { 7911 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7912 return 1; 7913 } 7914 7915 // Threads that are active in the thread pool, active in the hot team for this 7916 // particular root (if we are at the outer par level), and the currently 7917 // executing thread (to become the primary thread) are available to add to the 7918 // new team, but are currently contributing to the system load, and must be 7919 // accounted for. 7920 pool_active = __kmp_thread_pool_active_nth; 7921 hot_team_active = __kmp_active_hot_team_nproc(root); 7922 team_curr_active = pool_active + hot_team_active + 1; 7923 7924 // Check the system load. 7925 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7926 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7927 "hot team active = %d\n", 7928 system_active, pool_active, hot_team_active)); 7929 7930 if (system_active < 0) { 7931 // There was an error reading the necessary info from /proc, so use the 7932 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7933 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7934 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7935 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7936 7937 // Make this call behave like the thread limit algorithm. 7938 retval = __kmp_avail_proc - __kmp_nth + 7939 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7940 if (retval > set_nproc) { 7941 retval = set_nproc; 7942 } 7943 if (retval < KMP_MIN_NTH) { 7944 retval = KMP_MIN_NTH; 7945 } 7946 7947 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7948 retval)); 7949 return retval; 7950 } 7951 7952 // There is a slight delay in the load balance algorithm in detecting new 7953 // running procs. The real system load at this instant should be at least as 7954 // large as the #active omp thread that are available to add to the team. 
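  // Illustrative arithmetic (hypothetical numbers): with __kmp_avail_proc =
  // 16, pool_active = 2 and hot_team_active = 1 (so team_curr_active = 4), a
  // reported system_active of 10 gives, just below,
  //   retval = 16 - 10 + 4 = 10,
  // i.e. the procs not occupied by unrelated work plus the threads this root
  // already contributes, before clamping to [KMP_MIN_NTH, set_nproc].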
7955 if (system_active < team_curr_active) { 7956 system_active = team_curr_active; 7957 } 7958 retval = __kmp_avail_proc - system_active + team_curr_active; 7959 if (retval > set_nproc) { 7960 retval = set_nproc; 7961 } 7962 if (retval < KMP_MIN_NTH) { 7963 retval = KMP_MIN_NTH; 7964 } 7965 7966 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7967 return retval; 7968 } // __kmp_load_balance_nproc() 7969 7970 #endif /* USE_LOAD_BALANCE */ 7971 7972 /* ------------------------------------------------------------------------ */ 7973 7974 /* NOTE: this is called with the __kmp_init_lock held */ 7975 void __kmp_cleanup(void) { 7976 int f; 7977 7978 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7979 7980 if (TCR_4(__kmp_init_parallel)) { 7981 #if KMP_HANDLE_SIGNALS 7982 __kmp_remove_signals(); 7983 #endif 7984 TCW_4(__kmp_init_parallel, FALSE); 7985 } 7986 7987 if (TCR_4(__kmp_init_middle)) { 7988 #if KMP_AFFINITY_SUPPORTED 7989 __kmp_affinity_uninitialize(); 7990 #endif /* KMP_AFFINITY_SUPPORTED */ 7991 __kmp_cleanup_hierarchy(); 7992 TCW_4(__kmp_init_middle, FALSE); 7993 } 7994 7995 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7996 7997 if (__kmp_init_serial) { 7998 __kmp_runtime_destroy(); 7999 __kmp_init_serial = FALSE; 8000 } 8001 8002 __kmp_cleanup_threadprivate_caches(); 8003 8004 for (f = 0; f < __kmp_threads_capacity; f++) { 8005 if (__kmp_root[f] != NULL) { 8006 __kmp_free(__kmp_root[f]); 8007 __kmp_root[f] = NULL; 8008 } 8009 } 8010 __kmp_free(__kmp_threads); 8011 // __kmp_threads and __kmp_root were allocated at once, as single block, so 8012 // there is no need in freeing __kmp_root. 8013 __kmp_threads = NULL; 8014 __kmp_root = NULL; 8015 __kmp_threads_capacity = 0; 8016 8017 #if KMP_USE_DYNAMIC_LOCK 8018 __kmp_cleanup_indirect_user_locks(); 8019 #else 8020 __kmp_cleanup_user_locks(); 8021 #endif 8022 #if OMPD_SUPPORT 8023 if (ompd_state) { 8024 __kmp_free(ompd_env_block); 8025 ompd_env_block = NULL; 8026 ompd_env_block_size = 0; 8027 } 8028 #endif 8029 8030 #if KMP_AFFINITY_SUPPORTED 8031 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 8032 __kmp_cpuinfo_file = NULL; 8033 #endif /* KMP_AFFINITY_SUPPORTED */ 8034 8035 #if KMP_USE_ADAPTIVE_LOCKS 8036 #if KMP_DEBUG_ADAPTIVE_LOCKS 8037 __kmp_print_speculative_stats(); 8038 #endif 8039 #endif 8040 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 8041 __kmp_nested_nth.nth = NULL; 8042 __kmp_nested_nth.size = 0; 8043 __kmp_nested_nth.used = 0; 8044 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 8045 __kmp_nested_proc_bind.bind_types = NULL; 8046 __kmp_nested_proc_bind.size = 0; 8047 __kmp_nested_proc_bind.used = 0; 8048 if (__kmp_affinity_format) { 8049 KMP_INTERNAL_FREE(__kmp_affinity_format); 8050 __kmp_affinity_format = NULL; 8051 } 8052 8053 __kmp_i18n_catclose(); 8054 8055 #if KMP_USE_HIER_SCHED 8056 __kmp_hier_scheds.deallocate(); 8057 #endif 8058 8059 #if KMP_STATS_ENABLED 8060 __kmp_stats_fini(); 8061 #endif 8062 8063 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 8064 } 8065 8066 /* ------------------------------------------------------------------------ */ 8067 8068 int __kmp_ignore_mppbeg(void) { 8069 char *env; 8070 8071 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 8072 if (__kmp_str_match_false(env)) 8073 return FALSE; 8074 } 8075 // By default __kmpc_begin() is no-op. 
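  // (Conversely, setting KMP_IGNORE_MPPBEG to a false value makes this return
  // FALSE, so __kmpc_begin() is not ignored.)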
8076 return TRUE; 8077 } 8078 8079 int __kmp_ignore_mppend(void) { 8080 char *env; 8081 8082 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 8083 if (__kmp_str_match_false(env)) 8084 return FALSE; 8085 } 8086 // By default __kmpc_end() is no-op. 8087 return TRUE; 8088 } 8089 8090 void __kmp_internal_begin(void) { 8091 int gtid; 8092 kmp_root_t *root; 8093 8094 /* this is a very important step as it will register new sibling threads 8095 and assign these new uber threads a new gtid */ 8096 gtid = __kmp_entry_gtid(); 8097 root = __kmp_threads[gtid]->th.th_root; 8098 KMP_ASSERT(KMP_UBER_GTID(gtid)); 8099 8100 if (root->r.r_begin) 8101 return; 8102 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 8103 if (root->r.r_begin) { 8104 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8105 return; 8106 } 8107 8108 root->r.r_begin = TRUE; 8109 8110 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8111 } 8112 8113 /* ------------------------------------------------------------------------ */ 8114 8115 void __kmp_user_set_library(enum library_type arg) { 8116 int gtid; 8117 kmp_root_t *root; 8118 kmp_info_t *thread; 8119 8120 /* first, make sure we are initialized so we can get our gtid */ 8121 8122 gtid = __kmp_entry_gtid(); 8123 thread = __kmp_threads[gtid]; 8124 8125 root = thread->th.th_root; 8126 8127 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 8128 library_serial)); 8129 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 8130 thread */ 8131 KMP_WARNING(SetLibraryIncorrectCall); 8132 return; 8133 } 8134 8135 switch (arg) { 8136 case library_serial: 8137 thread->th.th_set_nproc = 0; 8138 set__nproc(thread, 1); 8139 break; 8140 case library_turnaround: 8141 thread->th.th_set_nproc = 0; 8142 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8143 : __kmp_dflt_team_nth_ub); 8144 break; 8145 case library_throughput: 8146 thread->th.th_set_nproc = 0; 8147 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8148 : __kmp_dflt_team_nth_ub); 8149 break; 8150 default: 8151 KMP_FATAL(UnknownLibraryType, arg); 8152 } 8153 8154 __kmp_aux_set_library(arg); 8155 } 8156 8157 void __kmp_aux_set_stacksize(size_t arg) { 8158 if (!__kmp_init_serial) 8159 __kmp_serial_initialize(); 8160 8161 #if KMP_OS_DARWIN 8162 if (arg & (0x1000 - 1)) { 8163 arg &= ~(0x1000 - 1); 8164 if (arg + 0x1000) /* check for overflow if we round up */ 8165 arg += 0x1000; 8166 } 8167 #endif 8168 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8169 8170 /* only change the default stacksize before the first parallel region */ 8171 if (!TCR_4(__kmp_init_parallel)) { 8172 size_t value = arg; /* argument is in bytes */ 8173 8174 if (value < __kmp_sys_min_stksize) 8175 value = __kmp_sys_min_stksize; 8176 else if (value > KMP_MAX_STKSIZE) 8177 value = KMP_MAX_STKSIZE; 8178 8179 __kmp_stksize = value; 8180 8181 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8182 } 8183 8184 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8185 } 8186 8187 /* set the behaviour of the runtime library */ 8188 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8189 void __kmp_aux_set_library(enum library_type arg) { 8190 __kmp_library = arg; 8191 8192 switch (__kmp_library) { 8193 case library_serial: { 8194 KMP_INFORM(LibraryIsSerial); 8195 } break; 8196 case library_turnaround: 8197 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8198 __kmp_use_yield = 2; // only yield when oversubscribed 8199 break; 8200 case library_throughput: 8201 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8202 __kmp_dflt_blocktime = 200; 8203 break; 8204 default: 8205 KMP_FATAL(UnknownLibraryType, arg); 8206 } 8207 } 8208 8209 /* Getting team information common for all team API */ 8210 // Returns NULL if not in teams construct 8211 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8212 kmp_info_t *thr = __kmp_entry_thread(); 8213 teams_serialized = 0; 8214 if (thr->th.th_teams_microtask) { 8215 kmp_team_t *team = thr->th.th_team; 8216 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8217 int ii = team->t.t_level; 8218 teams_serialized = team->t.t_serialized; 8219 int level = tlevel + 1; 8220 KMP_DEBUG_ASSERT(ii >= tlevel); 8221 while (ii > level) { 8222 for (teams_serialized = team->t.t_serialized; 8223 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8224 } 8225 if (team->t.t_serialized && (!teams_serialized)) { 8226 team = team->t.t_parent; 8227 continue; 8228 } 8229 if (ii > level) { 8230 team = team->t.t_parent; 8231 ii--; 8232 } 8233 } 8234 return team; 8235 } 8236 return NULL; 8237 } 8238 8239 int __kmp_aux_get_team_num() { 8240 int serialized; 8241 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8242 if (team) { 8243 if (serialized > 1) { 8244 return 0; // teams region is serialized ( 1 team of 1 thread ). 8245 } else { 8246 return team->t.t_master_tid; 8247 } 8248 } 8249 return 0; 8250 } 8251 8252 int __kmp_aux_get_num_teams() { 8253 int serialized; 8254 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8255 if (team) { 8256 if (serialized > 1) { 8257 return 1; 8258 } else { 8259 return team->t.t_parent->t.t_nproc; 8260 } 8261 } 8262 return 1; 8263 } 8264 8265 /* ------------------------------------------------------------------------ */ 8266 8267 /* 8268 * Affinity Format Parser 8269 * 8270 * Field is in form of: %[[[0].]size]type 8271 * % and type are required (%% means print a literal '%') 8272 * type is either single char or long name surrounded by {}, 8273 * e.g., N or {num_threads} 8274 * 0 => leading zeros 8275 * . => right justified when size is specified 8276 * by default output is left justified 8277 * size is the *minimum* field length 8278 * All other characters are printed as is 8279 * 8280 * Available field types: 8281 * L {thread_level} - omp_get_level() 8282 * n {thread_num} - omp_get_thread_num() 8283 * h {host} - name of host machine 8284 * P {process_id} - process id (integer) 8285 * T {thread_identifier} - native thread identifier (integer) 8286 * N {num_threads} - omp_get_num_threads() 8287 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8288 * a {thread_affinity} - comma separated list of integers or integer ranges 8289 * (values of affinity mask) 8290 * 8291 * Implementation-specific field types can be added 8292 * If a type is unknown, print "undefined" 8293 */ 8294 8295 // Structure holding the short name, long name, and corresponding data type 8296 // for snprintf. A table of these will represent the entire valid keyword 8297 // field types. 
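// For example (purely illustrative values), with the table below a format
// string such as "host=%H tid=%0.4{thread_num}" would expand to something
// like "host=node0 tid=0002": '%H' resolves through the 'H'/"host" entry and
// '%0.4{thread_num}' through the 'n'/"thread_num" entry, zero-padded to a
// minimum width of 4.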
8298 typedef struct kmp_affinity_format_field_t { 8299 char short_name; // from spec e.g., L -> thread level 8300 const char *long_name; // from spec thread_level -> thread level 8301 char field_format; // data type for snprintf (typically 'd' or 's' 8302 // for integer or string) 8303 } kmp_affinity_format_field_t; 8304 8305 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8306 #if KMP_AFFINITY_SUPPORTED 8307 {'A', "thread_affinity", 's'}, 8308 #endif 8309 {'t', "team_num", 'd'}, 8310 {'T', "num_teams", 'd'}, 8311 {'L', "nesting_level", 'd'}, 8312 {'n', "thread_num", 'd'}, 8313 {'N', "num_threads", 'd'}, 8314 {'a', "ancestor_tnum", 'd'}, 8315 {'H', "host", 's'}, 8316 {'P', "process_id", 'd'}, 8317 {'i', "native_thread_id", 'd'}}; 8318 8319 // Return the number of characters it takes to hold field 8320 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8321 const char **ptr, 8322 kmp_str_buf_t *field_buffer) { 8323 int rc, format_index, field_value; 8324 const char *width_left, *width_right; 8325 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8326 static const int FORMAT_SIZE = 20; 8327 char format[FORMAT_SIZE] = {0}; 8328 char absolute_short_name = 0; 8329 8330 KMP_DEBUG_ASSERT(gtid >= 0); 8331 KMP_DEBUG_ASSERT(th); 8332 KMP_DEBUG_ASSERT(**ptr == '%'); 8333 KMP_DEBUG_ASSERT(field_buffer); 8334 8335 __kmp_str_buf_clear(field_buffer); 8336 8337 // Skip the initial % 8338 (*ptr)++; 8339 8340 // Check for %% first 8341 if (**ptr == '%') { 8342 __kmp_str_buf_cat(field_buffer, "%", 1); 8343 (*ptr)++; // skip over the second % 8344 return 1; 8345 } 8346 8347 // Parse field modifiers if they are present 8348 pad_zeros = false; 8349 if (**ptr == '0') { 8350 pad_zeros = true; 8351 (*ptr)++; // skip over 0 8352 } 8353 right_justify = false; 8354 if (**ptr == '.') { 8355 right_justify = true; 8356 (*ptr)++; // skip over . 8357 } 8358 // Parse width of field: [width_left, width_right) 8359 width_left = width_right = NULL; 8360 if (**ptr >= '0' && **ptr <= '9') { 8361 width_left = *ptr; 8362 SKIP_DIGITS(*ptr); 8363 width_right = *ptr; 8364 } 8365 8366 // Create the format for KMP_SNPRINTF based on flags parsed above 8367 format_index = 0; 8368 format[format_index++] = '%'; 8369 if (!right_justify) 8370 format[format_index++] = '-'; 8371 if (pad_zeros) 8372 format[format_index++] = '0'; 8373 if (width_left && width_right) { 8374 int i = 0; 8375 // Only allow 8 digit number widths. 
8376 // This also prevents overflowing format variable 8377 while (i < 8 && width_left < width_right) { 8378 format[format_index++] = *width_left; 8379 width_left++; 8380 i++; 8381 } 8382 } 8383 8384 // Parse a name (long or short) 8385 // Canonicalize the name into absolute_short_name 8386 found_valid_name = false; 8387 parse_long_name = (**ptr == '{'); 8388 if (parse_long_name) 8389 (*ptr)++; // skip initial left brace 8390 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8391 sizeof(__kmp_affinity_format_table[0]); 8392 ++i) { 8393 char short_name = __kmp_affinity_format_table[i].short_name; 8394 const char *long_name = __kmp_affinity_format_table[i].long_name; 8395 char field_format = __kmp_affinity_format_table[i].field_format; 8396 if (parse_long_name) { 8397 size_t length = KMP_STRLEN(long_name); 8398 if (strncmp(*ptr, long_name, length) == 0) { 8399 found_valid_name = true; 8400 (*ptr) += length; // skip the long name 8401 } 8402 } else if (**ptr == short_name) { 8403 found_valid_name = true; 8404 (*ptr)++; // skip the short name 8405 } 8406 if (found_valid_name) { 8407 format[format_index++] = field_format; 8408 format[format_index++] = '\0'; 8409 absolute_short_name = short_name; 8410 break; 8411 } 8412 } 8413 if (parse_long_name) { 8414 if (**ptr != '}') { 8415 absolute_short_name = 0; 8416 } else { 8417 (*ptr)++; // skip over the right brace 8418 } 8419 } 8420 8421 // Attempt to fill the buffer with the requested 8422 // value using snprintf within __kmp_str_buf_print() 8423 switch (absolute_short_name) { 8424 case 't': 8425 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8426 break; 8427 case 'T': 8428 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8429 break; 8430 case 'L': 8431 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8432 break; 8433 case 'n': 8434 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8435 break; 8436 case 'H': { 8437 static const int BUFFER_SIZE = 256; 8438 char buf[BUFFER_SIZE]; 8439 __kmp_expand_host_name(buf, BUFFER_SIZE); 8440 rc = __kmp_str_buf_print(field_buffer, format, buf); 8441 } break; 8442 case 'P': 8443 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8444 break; 8445 case 'i': 8446 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8447 break; 8448 case 'N': 8449 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8450 break; 8451 case 'a': 8452 field_value = 8453 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8454 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8455 break; 8456 #if KMP_AFFINITY_SUPPORTED 8457 case 'A': { 8458 kmp_str_buf_t buf; 8459 __kmp_str_buf_init(&buf); 8460 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8461 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8462 __kmp_str_buf_free(&buf); 8463 } break; 8464 #endif 8465 default: 8466 // According to spec, If an implementation does not have info for field 8467 // type, then "undefined" is printed 8468 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8469 // Skip the field 8470 if (parse_long_name) { 8471 SKIP_TOKEN(*ptr); 8472 if (**ptr == '}') 8473 (*ptr)++; 8474 } else { 8475 (*ptr)++; 8476 } 8477 } 8478 8479 KMP_ASSERT(format_index <= FORMAT_SIZE); 8480 return rc; 8481 } 8482 8483 /* 8484 * Return number of characters needed to hold the affinity string 8485 * (not including null byte character) 8486 * The resultant string is printed to buffer, 
which the caller can then 8487 * handle afterwards 8488 */ 8489 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8490 kmp_str_buf_t *buffer) { 8491 const char *parse_ptr; 8492 size_t retval; 8493 const kmp_info_t *th; 8494 kmp_str_buf_t field; 8495 8496 KMP_DEBUG_ASSERT(buffer); 8497 KMP_DEBUG_ASSERT(gtid >= 0); 8498 8499 __kmp_str_buf_init(&field); 8500 __kmp_str_buf_clear(buffer); 8501 8502 th = __kmp_threads[gtid]; 8503 retval = 0; 8504 8505 // If format is NULL or zero-length string, then we use 8506 // affinity-format-var ICV 8507 parse_ptr = format; 8508 if (parse_ptr == NULL || *parse_ptr == '\0') { 8509 parse_ptr = __kmp_affinity_format; 8510 } 8511 KMP_DEBUG_ASSERT(parse_ptr); 8512 8513 while (*parse_ptr != '\0') { 8514 // Parse a field 8515 if (*parse_ptr == '%') { 8516 // Put field in the buffer 8517 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8518 __kmp_str_buf_catbuf(buffer, &field); 8519 retval += rc; 8520 } else { 8521 // Put literal character in buffer 8522 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8523 retval++; 8524 parse_ptr++; 8525 } 8526 } 8527 __kmp_str_buf_free(&field); 8528 return retval; 8529 } 8530 8531 // Displays the affinity string to stdout 8532 void __kmp_aux_display_affinity(int gtid, const char *format) { 8533 kmp_str_buf_t buf; 8534 __kmp_str_buf_init(&buf); 8535 __kmp_aux_capture_affinity(gtid, format, &buf); 8536 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8537 __kmp_str_buf_free(&buf); 8538 } 8539 8540 /* ------------------------------------------------------------------------ */ 8541 8542 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8543 int blocktime = arg; /* argument is in milliseconds */ 8544 #if KMP_USE_MONITOR 8545 int bt_intervals; 8546 #endif 8547 kmp_int8 bt_set; 8548 8549 __kmp_save_internal_controls(thread); 8550 8551 /* Normalize and set blocktime for the teams */ 8552 if (blocktime < KMP_MIN_BLOCKTIME) 8553 blocktime = KMP_MIN_BLOCKTIME; 8554 else if (blocktime > KMP_MAX_BLOCKTIME) 8555 blocktime = KMP_MAX_BLOCKTIME; 8556 8557 set__blocktime_team(thread->th.th_team, tid, blocktime); 8558 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8559 8560 #if KMP_USE_MONITOR 8561 /* Calculate and set blocktime intervals for the teams */ 8562 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8563 8564 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8565 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8566 #endif 8567 8568 /* Set whether blocktime has been set to "TRUE" */ 8569 bt_set = TRUE; 8570 8571 set__bt_set_team(thread->th.th_team, tid, bt_set); 8572 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8573 #if KMP_USE_MONITOR 8574 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8575 "bt_intervals=%d, monitor_updates=%d\n", 8576 __kmp_gtid_from_tid(tid, thread->th.th_team), 8577 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8578 __kmp_monitor_wakeups)); 8579 #else 8580 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8581 __kmp_gtid_from_tid(tid, thread->th.th_team), 8582 thread->th.th_team->t.t_id, tid, blocktime)); 8583 #endif 8584 } 8585 8586 void __kmp_aux_set_defaults(char const *str, size_t len) { 8587 if (!__kmp_init_serial) { 8588 __kmp_serial_initialize(); 8589 } 8590 __kmp_env_initialize(str); 8591 8592 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8593 __kmp_env_print(); 8594 } 8595 } // __kmp_aux_set_defaults 
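// Illustrative use of the routines above (hypothetical values): a call such as
//   __kmp_aux_display_affinity(gtid, "OMP: pid %P tid %i thread %n");
// captures the expansion into a temporary buffer and prints, e.g.,
//   "OMP: pid 12345 tid 67890 thread 3"
// to kmp_out. Passing NULL or "" as the format falls back to the
// affinity-format-var ICV stored in __kmp_affinity_format.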
8596 8597 /* ------------------------------------------------------------------------ */ 8598 /* internal fast reduction routines */ 8599 8600 PACKED_REDUCTION_METHOD_T 8601 __kmp_determine_reduction_method( 8602 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8603 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8604 kmp_critical_name *lck) { 8605 8606 // Default reduction method: critical construct ( lck != NULL, like in current 8607 // PAROPT ) 8608 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8609 // can be selected by RTL 8610 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8611 // can be selected by RTL 8612 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8613 // among generated by PAROPT. 8614 8615 PACKED_REDUCTION_METHOD_T retval; 8616 8617 int team_size; 8618 8619 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8620 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8621 8622 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8623 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8624 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8625 8626 retval = critical_reduce_block; 8627 8628 // another choice of getting a team size (with 1 dynamic deference) is slower 8629 team_size = __kmp_get_team_num_threads(global_tid); 8630 if (team_size == 1) { 8631 8632 retval = empty_reduce_block; 8633 8634 } else { 8635 8636 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8637 8638 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8639 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8640 8641 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8642 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8643 8644 int teamsize_cutoff = 4; 8645 8646 #if KMP_MIC_SUPPORTED 8647 if (__kmp_mic_type != non_mic) { 8648 teamsize_cutoff = 8; 8649 } 8650 #endif 8651 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8652 if (tree_available) { 8653 if (team_size <= teamsize_cutoff) { 8654 if (atomic_available) { 8655 retval = atomic_reduce_block; 8656 } 8657 } else { 8658 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8659 } 8660 } else if (atomic_available) { 8661 retval = atomic_reduce_block; 8662 } 8663 #else 8664 #error "Unknown or unsupported OS" 8665 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8666 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8667 8668 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8669 8670 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8671 8672 // basic tuning 8673 8674 if (atomic_available) { 8675 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
8676 retval = atomic_reduce_block; 8677 } 8678 } // otherwise: use critical section 8679 8680 #elif KMP_OS_DARWIN 8681 8682 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8683 if (atomic_available && (num_vars <= 3)) { 8684 retval = atomic_reduce_block; 8685 } else if (tree_available) { 8686 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8687 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8688 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8689 } 8690 } // otherwise: use critical section 8691 8692 #else 8693 #error "Unknown or unsupported OS" 8694 #endif 8695 8696 #else 8697 #error "Unknown or unsupported architecture" 8698 #endif 8699 } 8700 8701 // KMP_FORCE_REDUCTION 8702 8703 // If the team is serialized (team_size == 1), ignore the forced reduction 8704 // method and stay with the unsynchronized method (empty_reduce_block) 8705 if (__kmp_force_reduction_method != reduction_method_not_defined && 8706 team_size != 1) { 8707 8708 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8709 8710 int atomic_available, tree_available; 8711 8712 switch ((forced_retval = __kmp_force_reduction_method)) { 8713 case critical_reduce_block: 8714 KMP_ASSERT(lck); // lck should be != 0 8715 break; 8716 8717 case atomic_reduce_block: 8718 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8719 if (!atomic_available) { 8720 KMP_WARNING(RedMethodNotSupported, "atomic"); 8721 forced_retval = critical_reduce_block; 8722 } 8723 break; 8724 8725 case tree_reduce_block: 8726 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8727 if (!tree_available) { 8728 KMP_WARNING(RedMethodNotSupported, "tree"); 8729 forced_retval = critical_reduce_block; 8730 } else { 8731 #if KMP_FAST_REDUCTION_BARRIER 8732 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8733 #endif 8734 } 8735 break; 8736 8737 default: 8738 KMP_ASSERT(0); // "unsupported method specified" 8739 } 8740 8741 retval = forced_retval; 8742 } 8743 8744 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8745 8746 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8747 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8748 8749 return (retval); 8750 } 8751 // this function is for testing set/get/determine reduce method 8752 kmp_int32 __kmp_get_reduce_method(void) { 8753 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8754 } 8755 8756 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8757 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8758 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8759 8760 // Hard pause shuts down the runtime completely. Resume happens naturally when 8761 // OpenMP is used subsequently. 8762 void __kmp_hard_pause() { 8763 __kmp_pause_status = kmp_hard_paused; 8764 __kmp_internal_end_thread(-1); 8765 } 8766 8767 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
8768 void __kmp_resume_if_soft_paused() { 8769 if (__kmp_pause_status == kmp_soft_paused) { 8770 __kmp_pause_status = kmp_not_paused; 8771 8772 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8773 kmp_info_t *thread = __kmp_threads[gtid]; 8774 if (thread) { // Wake it if sleeping 8775 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8776 thread); 8777 if (fl.is_sleeping()) 8778 fl.resume(gtid); 8779 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8780 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8781 } else { // thread holds the lock and may sleep soon 8782 do { // until either the thread sleeps, or we can get the lock 8783 if (fl.is_sleeping()) { 8784 fl.resume(gtid); 8785 break; 8786 } else if (__kmp_try_suspend_mx(thread)) { 8787 __kmp_unlock_suspend_mx(thread); 8788 break; 8789 } 8790 } while (1); 8791 } 8792 } 8793 } 8794 } 8795 } 8796 8797 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8798 // TODO: add warning messages 8799 int __kmp_pause_resource(kmp_pause_status_t level) { 8800 if (level == kmp_not_paused) { // requesting resume 8801 if (__kmp_pause_status == kmp_not_paused) { 8802 // error message about runtime not being paused, so can't resume 8803 return 1; 8804 } else { 8805 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8806 __kmp_pause_status == kmp_hard_paused); 8807 __kmp_pause_status = kmp_not_paused; 8808 return 0; 8809 } 8810 } else if (level == kmp_soft_paused) { // requesting soft pause 8811 if (__kmp_pause_status != kmp_not_paused) { 8812 // error message about already being paused 8813 return 1; 8814 } else { 8815 __kmp_soft_pause(); 8816 return 0; 8817 } 8818 } else if (level == kmp_hard_paused) { // requesting hard pause 8819 if (__kmp_pause_status != kmp_not_paused) { 8820 // error message about already being paused 8821 return 1; 8822 } else { 8823 __kmp_hard_pause(); 8824 return 0; 8825 } 8826 } else { 8827 // error message about invalid level 8828 return 1; 8829 } 8830 } 8831 8832 void __kmp_omp_display_env(int verbose) { 8833 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8834 if (__kmp_init_serial == 0) 8835 __kmp_do_serial_initialize(); 8836 __kmp_display_env_impl(!verbose, verbose); 8837 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8838 } 8839 8840 // The team size is changing, so distributed barrier must be modified 8841 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 8842 int new_nthreads) { 8843 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 8844 bp_dist_bar); 8845 kmp_info_t **other_threads = team->t.t_threads; 8846 8847 // We want all the workers to stop waiting on the barrier while we adjust the 8848 // size of the team. 
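  // th_used_in_team works as a small state machine here (values as used by
  // this routine and by __kmp_add_threads_to_team below):
  //   0 = not part of the team, 1 = in use by the team,
  //   2 = being released from the team, 3 = transitioning back into the team.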
8849 for (int f = 1; f < old_nthreads; ++f) { 8850 KMP_DEBUG_ASSERT(other_threads[f] != NULL); 8851 // Ignore threads that are already inactive or not present in the team 8852 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { 8853 // teams construct causes thread_limit to get passed in, and some of 8854 // those could be inactive; just ignore them 8855 continue; 8856 } 8857 // If thread is transitioning still to in_use state, wait for it 8858 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { 8859 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) 8860 KMP_CPU_PAUSE(); 8861 } 8862 // The thread should be in_use now 8863 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); 8864 // Transition to unused state 8865 team->t.t_threads[f]->th.th_used_in_team.store(2); 8866 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); 8867 } 8868 // Release all the workers 8869 kmp_uint64 new_value; // new value for go 8870 new_value = team->t.b->go_release(); 8871 8872 KMP_MFENCE(); 8873 8874 // Workers should see transition status 2 and move to 0; but may need to be 8875 // woken up first 8876 size_t my_go_index; 8877 int count = old_nthreads - 1; 8878 while (count > 0) { 8879 count = old_nthreads - 1; 8880 for (int f = 1; f < old_nthreads; ++f) { 8881 my_go_index = f / team->t.b->threads_per_go; 8882 if (other_threads[f]->th.th_used_in_team.load() != 0) { 8883 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers 8884 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( 8885 void *, other_threads[f]->th.th_sleep_loc); 8886 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); 8887 } 8888 } else { 8889 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); 8890 count--; 8891 } 8892 } 8893 } 8894 // Now update the barrier size 8895 team->t.b->update_num_threads(new_nthreads); 8896 team->t.b->go_reset(); 8897 } 8898 8899 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { 8900 // Add the threads back to the team 8901 KMP_DEBUG_ASSERT(team); 8902 // Threads were paused and pointed at th_used_in_team temporarily during a 8903 // resize of the team. We're going to set th_used_in_team to 3 to indicate to 8904 // the thread that it should transition itself back into the team. Then, if 8905 // blocktime isn't infinite, the thread could be sleeping, so we send a resume 8906 // to wake it up. 8907 for (int f = 1; f < new_nthreads; ++f) { 8908 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 8909 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, 8910 3); 8911 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads 8912 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, 8913 (kmp_flag_32<false, false> *)NULL); 8914 } 8915 } 8916 // The threads should be transitioning to the team; when they are done, they 8917 // should have set th_used_in_team to 1. This loop forces master to wait until 8918 // all threads have moved into the team and are waiting in the barrier. 
8919 int count = new_nthreads - 1; 8920 while (count > 0) { 8921 count = new_nthreads - 1; 8922 for (int f = 1; f < new_nthreads; ++f) { 8923 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { 8924 count--; 8925 } 8926 } 8927 } 8928 } 8929 8930 // Globals and functions for hidden helper task 8931 kmp_info_t **__kmp_hidden_helper_threads; 8932 kmp_info_t *__kmp_hidden_helper_main_thread; 8933 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 8934 #if KMP_OS_LINUX 8935 kmp_int32 __kmp_hidden_helper_threads_num = 8; 8936 kmp_int32 __kmp_enable_hidden_helper = TRUE; 8937 #else 8938 kmp_int32 __kmp_hidden_helper_threads_num = 0; 8939 kmp_int32 __kmp_enable_hidden_helper = FALSE; 8940 #endif 8941 8942 namespace { 8943 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 8944 8945 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 8946 // This is an explicit synchronization on all hidden helper threads in case 8947 // that when a regular thread pushes a hidden helper task to one hidden 8948 // helper thread, the thread has not been awaken once since they're released 8949 // by the main thread after creating the team. 8950 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 8951 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 8952 __kmp_hidden_helper_threads_num) 8953 ; 8954 8955 // If main thread, then wait for signal 8956 if (__kmpc_master(nullptr, *gtid)) { 8957 // First, unset the initial state and release the initial thread 8958 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 8959 __kmp_hidden_helper_initz_release(); 8960 __kmp_hidden_helper_main_thread_wait(); 8961 // Now wake up all worker threads 8962 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 8963 __kmp_hidden_helper_worker_thread_signal(); 8964 } 8965 } 8966 } 8967 } // namespace 8968 8969 void __kmp_hidden_helper_threads_initz_routine() { 8970 // Create a new root for hidden helper team/threads 8971 const int gtid = __kmp_register_root(TRUE); 8972 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 8973 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 8974 __kmp_hidden_helper_main_thread->th.th_set_nproc = 8975 __kmp_hidden_helper_threads_num; 8976 8977 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 8978 8979 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 8980 8981 // Set the initialization flag to FALSE 8982 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 8983 8984 __kmp_hidden_helper_threads_deinitz_release(); 8985 } 8986 8987 /* Nesting Mode: 8988 Set via KMP_NESTING_MODE, which takes an integer. 8989 Note: we skip duplicate topology levels, and skip levels with only 8990 one entity. 8991 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 8992 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 8993 in the topology, and initializes the number of threads at each of those 8994 levels to the number of entities at each level, respectively, below the 8995 entity at the parent level. 8996 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 8997 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 8998 the user to turn nesting on explicitly. This is an even more experimental 8999 option to this experimental feature, and may change or go away in the 9000 future. 
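   Illustration (hypothetical machine with 2 sockets x 8 cores x 2 HW threads):
   KMP_NESTING_MODE=1 sets up three nesting levels of 2, 8, and 2 threads
   respectively, so a triply nested parallel region fans out first over
   sockets, then over cores, then over hardware threads.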
9001 */ 9002 9003 // Allocate space to store nesting levels 9004 void __kmp_init_nesting_mode() { 9005 int levels = KMP_HW_LAST; 9006 __kmp_nesting_mode_nlevels = levels; 9007 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 9008 for (int i = 0; i < levels; ++i) 9009 __kmp_nesting_nth_level[i] = 0; 9010 if (__kmp_nested_nth.size < levels) { 9011 __kmp_nested_nth.nth = 9012 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 9013 __kmp_nested_nth.size = levels; 9014 } 9015 } 9016 9017 // Set # threads for top levels of nesting; must be called after topology set 9018 void __kmp_set_nesting_mode_threads() { 9019 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 9020 9021 if (__kmp_nesting_mode == 1) 9022 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 9023 else if (__kmp_nesting_mode > 1) 9024 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9025 9026 if (__kmp_topology) { // use topology info 9027 int loc, hw_level; 9028 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 9029 loc < __kmp_nesting_mode_nlevels; 9030 loc++, hw_level++) { 9031 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 9032 if (__kmp_nesting_nth_level[loc] == 1) 9033 loc--; 9034 } 9035 // Make sure all cores are used 9036 if (__kmp_nesting_mode > 1 && loc > 1) { 9037 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 9038 int num_cores = __kmp_topology->get_count(core_level); 9039 int upper_levels = 1; 9040 for (int level = 0; level < loc - 1; ++level) 9041 upper_levels *= __kmp_nesting_nth_level[level]; 9042 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 9043 __kmp_nesting_nth_level[loc - 1] = 9044 num_cores / __kmp_nesting_nth_level[loc - 2]; 9045 } 9046 __kmp_nesting_mode_nlevels = loc; 9047 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9048 } else { // no topology info available; provide a reasonable guesstimation 9049 if (__kmp_avail_proc >= 4) { 9050 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 9051 __kmp_nesting_nth_level[1] = 2; 9052 __kmp_nesting_mode_nlevels = 2; 9053 } else { 9054 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 9055 __kmp_nesting_mode_nlevels = 1; 9056 } 9057 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9058 } 9059 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 9060 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 9061 } 9062 set__nproc(thread, __kmp_nesting_nth_level[0]); 9063 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 9064 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9065 if (get__max_active_levels(thread) > 1) { 9066 // if max levels was set, set nesting mode levels to same 9067 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 9068 } 9069 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 9070 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 9071 } 9072
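// Illustrative fallback (hypothetical values): when no topology information is
// available, __kmp_set_nesting_mode_threads() above guesses two levels of
// __kmp_avail_proc / 2 and 2 threads (e.g. 6 and 2 for 12 available
// processors), or a single level of __kmp_avail_proc threads when fewer than
// 4 processors are available.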