/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads,
                               int new_nthreads);
void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads);

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid.
*/ 117 int __kmp_get_global_thread_id() { 118 int i; 119 kmp_info_t **other_threads; 120 size_t stack_data; 121 char *stack_addr; 122 size_t stack_size; 123 char *stack_base; 124 125 KA_TRACE( 126 1000, 127 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 128 __kmp_nth, __kmp_all_nth)); 129 130 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 131 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 132 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 133 __kmp_init_gtid for this to work. */ 134 135 if (!TCR_4(__kmp_init_gtid)) 136 return KMP_GTID_DNE; 137 138 #ifdef KMP_TDATA_GTID 139 if (TCR_4(__kmp_gtid_mode) >= 3) { 140 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 141 return __kmp_gtid; 142 } 143 #endif 144 if (TCR_4(__kmp_gtid_mode) >= 2) { 145 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 146 return __kmp_gtid_get_specific(); 147 } 148 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 149 150 stack_addr = (char *)&stack_data; 151 other_threads = __kmp_threads; 152 153 /* ATT: The code below is a source of potential bugs due to unsynchronized 154 access to __kmp_threads array. For example: 155 1. Current thread loads other_threads[i] to thr and checks it, it is 156 non-NULL. 157 2. Current thread is suspended by OS. 158 3. Another thread unregisters and finishes (debug versions of free() 159 may fill memory with something like 0xEF). 160 4. Current thread is resumed. 161 5. Current thread reads junk from *thr. 162 TODO: Fix it. --ln */ 163 164 for (i = 0; i < __kmp_threads_capacity; i++) { 165 166 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 167 if (!thr) 168 continue; 169 170 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 171 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 172 173 /* stack grows down -- search through all of the active threads */ 174 175 if (stack_addr <= stack_base) { 176 size_t stack_diff = stack_base - stack_addr; 177 178 if (stack_diff <= stack_size) { 179 /* The only way we can be closer than the allocated */ 180 /* stack size is if we are running on this thread. */ 181 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 182 return i; 183 } 184 } 185 } 186 187 /* get specific to try and determine our gtid */ 188 KA_TRACE(1000, 189 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 190 "thread, using TLS\n")); 191 i = __kmp_gtid_get_specific(); 192 193 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 194 195 /* if we havn't been assigned a gtid, then return code */ 196 if (i < 0) 197 return i; 198 199 /* dynamically updated stack window for uber threads to avoid get_specific 200 call */ 201 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 202 KMP_FATAL(StackOverflow, i); 203 } 204 205 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 206 if (stack_addr > stack_base) { 207 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 208 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 209 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 210 stack_base); 211 } else { 212 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 213 stack_base - stack_addr); 214 } 215 216 /* Reprint stack bounds for ubermaster since they have been refined */ 217 if (__kmp_storage_map) { 218 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 219 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 220 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 221 other_threads[i]->th.th_info.ds.ds_stacksize, 222 "th_%d stack (refinement)", i); 223 } 224 return i; 225 } 226 227 int __kmp_get_global_thread_id_reg() { 228 int gtid; 229 230 if (!__kmp_init_serial) { 231 gtid = KMP_GTID_DNE; 232 } else 233 #ifdef KMP_TDATA_GTID 234 if (TCR_4(__kmp_gtid_mode) >= 3) { 235 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 236 gtid = __kmp_gtid; 237 } else 238 #endif 239 if (TCR_4(__kmp_gtid_mode) >= 2) { 240 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 241 gtid = __kmp_gtid_get_specific(); 242 } else { 243 KA_TRACE(1000, 244 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 245 gtid = __kmp_get_global_thread_id(); 246 } 247 248 /* we must be a new uber master sibling thread */ 249 if (gtid == KMP_GTID_DNE) { 250 KA_TRACE(10, 251 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 252 "Registering a new gtid.\n")); 253 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 254 if (!__kmp_init_serial) { 255 __kmp_do_serial_initialize(); 256 gtid = __kmp_gtid_get_specific(); 257 } else { 258 gtid = __kmp_register_root(FALSE); 259 } 260 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 261 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 262 } 263 264 KMP_DEBUG_ASSERT(gtid >= 0); 265 266 return gtid; 267 } 268 269 /* caller must hold forkjoin_lock */ 270 void __kmp_check_stack_overlap(kmp_info_t *th) { 271 int f; 272 char *stack_beg = NULL; 273 char *stack_end = NULL; 274 int gtid; 275 276 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 277 if (__kmp_storage_map) { 278 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 279 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 280 281 gtid = __kmp_gtid_from_thread(th); 282 283 if (gtid == KMP_GTID_MONITOR) { 284 __kmp_print_storage_map_gtid( 285 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 286 "th_%s stack (%s)", "mon", 287 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 288 } else { 289 __kmp_print_storage_map_gtid( 290 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 291 "th_%d stack (%s)", gtid, 292 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 293 } 294 } 295 296 /* No point in checking ubermaster threads since they use refinement and 297 * cannot overlap */ 298 gtid = __kmp_gtid_from_thread(th); 299 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 300 KA_TRACE(10, 301 ("__kmp_check_stack_overlap: performing extensive checking\n")); 302 if (stack_beg == NULL) { 303 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 304 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 305 } 306 307 for (f = 0; f < __kmp_threads_capacity; f++) { 308 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 309 310 if (f_th && f_th != th) { 311 char *other_stack_end = 312 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 313 char *other_stack_beg = 314 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 315 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 316 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 317 318 /* Print the other stack values before the abort */ 319 if (__kmp_storage_map) 320 __kmp_print_storage_map_gtid( 321 -1, other_stack_beg, other_stack_end, 322 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 323 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 324 325 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 326 __kmp_msg_null); 327 } 328 } 329 } 330 } 331 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 332 } 333 334 /* ------------------------------------------------------------------------ */ 335 336 void __kmp_infinite_loop(void) { 337 static int done = FALSE; 338 339 while (!done) { 340 KMP_YIELD(TRUE); 341 } 342 } 343 344 #define MAX_MESSAGE 512 345 346 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 347 char const *format, ...) { 348 char buffer[MAX_MESSAGE]; 349 va_list ap; 350 351 va_start(ap, format); 352 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 353 p2, (unsigned long)size, format); 354 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 355 __kmp_vprintf(kmp_err, buffer, ap); 356 #if KMP_PRINT_DATA_PLACEMENT 357 int node; 358 if (gtid >= 0) { 359 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 360 if (__kmp_storage_map_verbose) { 361 node = __kmp_get_host_node(p1); 362 if (node < 0) /* doesn't work, so don't try this next time */ 363 __kmp_storage_map_verbose = FALSE; 364 else { 365 char *last; 366 int lastNode; 367 int localProc = __kmp_get_cpu_from_gtid(gtid); 368 369 const int page_size = KMP_GET_PAGE_SIZE(); 370 371 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 372 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 373 if (localProc >= 0) 374 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 375 localProc >> 1); 376 else 377 __kmp_printf_no_lock(" GTID %d\n", gtid); 378 #if KMP_USE_PRCTL 379 /* The more elaborate format is disabled for now because of the prctl 380 * hanging bug. */ 381 do { 382 last = p1; 383 lastNode = node; 384 /* This loop collates adjacent pages with the same host node. 
*/ 385 do { 386 (char *)p1 += page_size; 387 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 388 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 389 lastNode); 390 } while (p1 <= p2); 391 #else 392 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 393 (char *)p1 + (page_size - 1), 394 __kmp_get_host_node(p1)); 395 if (p1 < p2) { 396 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 397 (char *)p2 + (page_size - 1), 398 __kmp_get_host_node(p2)); 399 } 400 #endif 401 } 402 } 403 } else 404 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 405 } 406 #endif /* KMP_PRINT_DATA_PLACEMENT */ 407 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 408 } 409 410 void __kmp_warn(char const *format, ...) { 411 char buffer[MAX_MESSAGE]; 412 va_list ap; 413 414 if (__kmp_generate_warnings == kmp_warnings_off) { 415 return; 416 } 417 418 va_start(ap, format); 419 420 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 421 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 422 __kmp_vprintf(kmp_err, buffer, ap); 423 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 424 425 va_end(ap); 426 } 427 428 void __kmp_abort_process() { 429 // Later threads may stall here, but that's ok because abort() will kill them. 430 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 431 432 if (__kmp_debug_buf) { 433 __kmp_dump_debug_buffer(); 434 } 435 436 if (KMP_OS_WINDOWS) { 437 // Let other threads know of abnormal termination and prevent deadlock 438 // if abort happened during library initialization or shutdown 439 __kmp_global.g.g_abort = SIGABRT; 440 441 /* On Windows* OS by default abort() causes pop-up error box, which stalls 442 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 443 boxes. _set_abort_behavior() works well, but this function is not 444 available in VS7 (this is not problem for DLL, but it is a problem for 445 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 446 help, at least in some versions of MS C RTL. 447 448 It seems following sequence is the only way to simulate abort() and 449 avoid pop-up error box. */ 450 raise(SIGABRT); 451 _exit(3); // Just in case, if signal ignored, exit anyway. 452 } else { 453 __kmp_unregister_library(); 454 abort(); 455 } 456 457 __kmp_infinite_loop(); 458 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 459 460 } // __kmp_abort_process 461 462 void __kmp_abort_thread(void) { 463 // TODO: Eliminate g_abort global variable and this function. 464 // In case of abort just call abort(), it will kill all the threads. 465 __kmp_infinite_loop(); 466 } // __kmp_abort_thread 467 468 /* Print out the storage map for the major kmp_info_t thread data structures 469 that are allocated together. 
*/ 470 471 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 472 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 473 gtid); 474 475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 476 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 477 478 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 479 sizeof(kmp_local_t), "th_%d.th_local", gtid); 480 481 __kmp_print_storage_map_gtid( 482 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 483 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 484 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 486 &thr->th.th_bar[bs_plain_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 488 gtid); 489 490 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 491 &thr->th.th_bar[bs_forkjoin_barrier + 1], 492 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 493 gtid); 494 495 #if KMP_FAST_REDUCTION_BARRIER 496 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 497 &thr->th.th_bar[bs_reduction_barrier + 1], 498 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 499 gtid); 500 #endif // KMP_FAST_REDUCTION_BARRIER 501 } 502 503 /* Print out the storage map for the major kmp_team_t team data structures 504 that are allocated together. */ 505 506 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 507 int team_id, int num_thr) { 508 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 509 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 510 header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 513 &team->t.t_bar[bs_last_barrier], 514 sizeof(kmp_balign_team_t) * bs_last_barrier, 515 "%s_%d.t_bar", header, team_id); 516 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 518 &team->t.t_bar[bs_plain_barrier + 1], 519 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 520 header, team_id); 521 522 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 523 &team->t.t_bar[bs_forkjoin_barrier + 1], 524 sizeof(kmp_balign_team_t), 525 "%s_%d.t_bar[forkjoin]", header, team_id); 526 527 #if KMP_FAST_REDUCTION_BARRIER 528 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 529 &team->t.t_bar[bs_reduction_barrier + 1], 530 sizeof(kmp_balign_team_t), 531 "%s_%d.t_bar[reduction]", header, team_id); 532 #endif // KMP_FAST_REDUCTION_BARRIER 533 534 __kmp_print_storage_map_gtid( 535 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 536 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 537 538 __kmp_print_storage_map_gtid( 539 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 540 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 541 542 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 543 &team->t.t_disp_buffer[num_disp_buff], 544 sizeof(dispatch_shared_info_t) * num_disp_buff, 545 "%s_%d.t_disp_buffer", header, team_id); 546 } 547 548 static void __kmp_init_allocator() { 549 __kmp_init_memkind(); 550 __kmp_init_target_mem(); 551 } 552 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 553 554 /* ------------------------------------------------------------------------ */ 555 556 #if KMP_DYNAMIC_LIB 557 #if KMP_OS_WINDOWS 558 559 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 560 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 561 562 
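/* The gtid lookup in __kmp_get_global_thread_id() above falls back to an
   address-based search: the address of a local variable must lie inside the
   stack of the thread that is executing, so scanning each registered thread's
   [stackbase - stacksize, stackbase] range identifies the caller without any
   TLS access. A minimal, self-contained sketch of that idea follows; it is
   illustrative only (not part of the runtime), and the registry, the fixed
   guess of the stack extent, and names such as register_self()/find_self()
   are inventions for this example. */
#if 0 /* illustrative sketch, not compiled as part of the runtime */
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct stack_range {
  std::uintptr_t base; // highest stack address observed for this thread
  std::size_t size;    // assumed extent below 'base'
};

static std::vector<stack_range> registry; // index plays the role of a gtid
static std::mutex registry_lock;

static int register_self(std::size_t assumed_size) {
  char marker; // lives on this thread's stack
  std::lock_guard<std::mutex> g(registry_lock);
  registry.push_back({reinterpret_cast<std::uintptr_t>(&marker), assumed_size});
  return static_cast<int>(registry.size()) - 1;
}

static int find_self() {
  char marker;
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(&marker);
  std::lock_guard<std::mutex> g(registry_lock);
  for (std::size_t i = 0; i < registry.size(); ++i) {
    // Stacks grow down, so the current frame sits at or below the base.
    if (addr <= registry[i].base && registry[i].base - addr <= registry[i].size)
      return static_cast<int>(i);
  }
  return -1; // analogous to KMP_GTID_DNE
}

int main() {
  std::thread t([] {
    int id = register_self(1 << 20);
    std::printf("worker registered %d, lookup %d\n", id, find_self());
  });
  t.join();
  int id = register_self(1 << 20);
  std::printf("main registered %d, lookup %d\n", id, find_self());
  return 0;
}
#endif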
switch (fdwReason) { 563 564 case DLL_PROCESS_ATTACH: 565 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 566 567 return TRUE; 568 569 case DLL_PROCESS_DETACH: 570 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 571 572 // According to Windows* documentation for DllMain entry point: 573 // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference: 574 // lpReserved == NULL when FreeLibrary() is called, 575 // lpReserved != NULL when the process is terminated. 576 // When FreeLibrary() is called, worker threads remain alive. So the 577 // runtime's state is consistent and executing proper shutdown is OK. 578 // When the process is terminated, worker threads have exited or been 579 // forcefully terminated by the OS and only the shutdown thread remains. 580 // This can leave the runtime in an inconsistent state. 581 // Hence, only attempt proper cleanup when FreeLibrary() is called. 582 // Otherwise, rely on OS to reclaim resources. 583 if (lpReserved == NULL) 584 __kmp_internal_end_library(__kmp_gtid_get_specific()); 585 586 return TRUE; 587 588 case DLL_THREAD_ATTACH: 589 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 590 591 /* if we want to register new siblings all the time here call 592 * __kmp_get_gtid(); */ 593 return TRUE; 594 595 case DLL_THREAD_DETACH: 596 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 597 598 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 599 return TRUE; 600 } 601 602 return TRUE; 603 } 604 605 #endif /* KMP_OS_WINDOWS */ 606 #endif /* KMP_DYNAMIC_LIB */ 607 608 /* __kmp_parallel_deo -- Wait until it's our turn. */ 609 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 610 int gtid = *gtid_ref; 611 #ifdef BUILD_PARALLEL_ORDERED 612 kmp_team_t *team = __kmp_team_from_gtid(gtid); 613 #endif /* BUILD_PARALLEL_ORDERED */ 614 615 if (__kmp_env_consistency_check) { 616 if (__kmp_threads[gtid]->th.th_root->r.r_active) 617 #if KMP_USE_DYNAMIC_LOCK 618 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 619 #else 620 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 621 #endif 622 } 623 #ifdef BUILD_PARALLEL_ORDERED 624 if (!team->t.t_serialized) { 625 KMP_MB(); 626 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 627 NULL); 628 KMP_MB(); 629 } 630 #endif /* BUILD_PARALLEL_ORDERED */ 631 } 632 633 /* __kmp_parallel_dxo -- Signal the next task. */ 634 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 635 int gtid = *gtid_ref; 636 #ifdef BUILD_PARALLEL_ORDERED 637 int tid = __kmp_tid_from_gtid(gtid); 638 kmp_team_t *team = __kmp_team_from_gtid(gtid); 639 #endif /* BUILD_PARALLEL_ORDERED */ 640 641 if (__kmp_env_consistency_check) { 642 if (__kmp_threads[gtid]->th.th_root->r.r_active) 643 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 644 } 645 #ifdef BUILD_PARALLEL_ORDERED 646 if (!team->t.t_serialized) { 647 KMP_MB(); /* Flush all pending memory write invalidates. */ 648 649 /* use the tid of the next thread in this team */ 650 /* TODO replace with general release procedure */ 651 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 652 653 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 654 } 655 #endif /* BUILD_PARALLEL_ORDERED */ 656 } 657 658 /* ------------------------------------------------------------------------ */ 659 /* The BARRIER for a SINGLE process section is always explicit */ 660 661 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 662 int status; 663 kmp_info_t *th; 664 kmp_team_t *team; 665 666 if (!TCR_4(__kmp_init_parallel)) 667 __kmp_parallel_initialize(); 668 __kmp_resume_if_soft_paused(); 669 670 th = __kmp_threads[gtid]; 671 team = th->th.th_team; 672 status = 0; 673 674 th->th.th_ident = id_ref; 675 676 if (team->t.t_serialized) { 677 status = 1; 678 } else { 679 kmp_int32 old_this = th->th.th_local.this_construct; 680 681 ++th->th.th_local.this_construct; 682 /* try to set team count to thread count--success means thread got the 683 single block */ 684 /* TODO: Should this be acquire or release? */ 685 if (team->t.t_construct == old_this) { 686 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 687 th->th.th_local.this_construct); 688 } 689 #if USE_ITT_BUILD 690 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 691 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 692 team->t.t_active_level == 1) { 693 // Only report metadata by primary thread of active team at level 1 694 __kmp_itt_metadata_single(id_ref); 695 } 696 #endif /* USE_ITT_BUILD */ 697 } 698 699 if (__kmp_env_consistency_check) { 700 if (status && push_ws) { 701 __kmp_push_workshare(gtid, ct_psingle, id_ref); 702 } else { 703 __kmp_check_workshare(gtid, ct_psingle, id_ref); 704 } 705 } 706 #if USE_ITT_BUILD 707 if (status) { 708 __kmp_itt_single_start(gtid); 709 } 710 #endif /* USE_ITT_BUILD */ 711 return status; 712 } 713 714 void __kmp_exit_single(int gtid) { 715 #if USE_ITT_BUILD 716 __kmp_itt_single_end(gtid); 717 #endif /* USE_ITT_BUILD */ 718 if (__kmp_env_consistency_check) 719 __kmp_pop_workshare(gtid, ct_psingle, NULL); 720 } 721 722 /* determine if we can go parallel or must use a serialized parallel region and 723 * how many threads we can use 724 * set_nproc is the number of threads requested for the team 725 * returns 0 if we should serialize or only use one thread, 726 * otherwise the number of threads to use 727 * The forkjoin lock is held by the caller. */ 728 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 729 int master_tid, int set_nthreads, 730 int enter_teams) { 731 int capacity; 732 int new_nthreads; 733 KMP_DEBUG_ASSERT(__kmp_init_serial); 734 KMP_DEBUG_ASSERT(root && parent_team); 735 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 736 737 // If dyn-var is set, dynamically adjust the number of desired threads, 738 // according to the method specified by dynamic_mode. 739 new_nthreads = set_nthreads; 740 if (!get__dynamic_2(parent_team, master_tid)) { 741 ; 742 } 743 #ifdef USE_LOAD_BALANCE 744 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 745 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 746 if (new_nthreads == 1) { 747 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 748 "reservation to 1 thread\n", 749 master_tid)); 750 return 1; 751 } 752 if (new_nthreads < set_nthreads) { 753 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 754 "reservation to %d threads\n", 755 master_tid, new_nthreads)); 756 } 757 } 758 #endif /* USE_LOAD_BALANCE */ 759 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 760 new_nthreads = __kmp_avail_proc - __kmp_nth + 761 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 762 if (new_nthreads <= 1) { 763 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 764 "reservation to 1 thread\n", 765 master_tid)); 766 return 1; 767 } 768 if (new_nthreads < set_nthreads) { 769 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 770 "reservation to %d threads\n", 771 master_tid, new_nthreads)); 772 } else { 773 new_nthreads = set_nthreads; 774 } 775 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 776 if (set_nthreads > 2) { 777 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 778 new_nthreads = (new_nthreads % set_nthreads) + 1; 779 if (new_nthreads == 1) { 780 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 781 "reservation to 1 thread\n", 782 master_tid)); 783 return 1; 784 } 785 if (new_nthreads < set_nthreads) { 786 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 787 "reservation to %d threads\n", 788 master_tid, new_nthreads)); 789 } 790 } 791 } else { 792 KMP_ASSERT(0); 793 } 794 795 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 796 if (__kmp_nth + new_nthreads - 797 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 798 __kmp_max_nth) { 799 int tl_nthreads = __kmp_max_nth - __kmp_nth + 800 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 801 if (tl_nthreads <= 0) { 802 tl_nthreads = 1; 803 } 804 805 // If dyn-var is false, emit a 1-time warning. 806 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 807 __kmp_reserve_warn = 1; 808 __kmp_msg(kmp_ms_warning, 809 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 810 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 811 } 812 if (tl_nthreads == 1) { 813 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 814 "reduced reservation to 1 thread\n", 815 master_tid)); 816 return 1; 817 } 818 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 819 "reservation to %d threads\n", 820 master_tid, tl_nthreads)); 821 new_nthreads = tl_nthreads; 822 } 823 824 // Respect OMP_THREAD_LIMIT 825 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 826 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 827 if (cg_nthreads + new_nthreads - 828 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 829 max_cg_threads) { 830 int tl_nthreads = max_cg_threads - cg_nthreads + 831 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 832 if (tl_nthreads <= 0) { 833 tl_nthreads = 1; 834 } 835 836 // If dyn-var is false, emit a 1-time warning. 837 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 838 __kmp_reserve_warn = 1; 839 __kmp_msg(kmp_ms_warning, 840 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 841 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 842 } 843 if (tl_nthreads == 1) { 844 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 845 "reduced reservation to 1 thread\n", 846 master_tid)); 847 return 1; 848 } 849 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 850 "reservation to %d threads\n", 851 master_tid, tl_nthreads)); 852 new_nthreads = tl_nthreads; 853 } 854 855 // Check if the threads array is large enough, or needs expanding. 856 // See comment in __kmp_register_root() about the adjustment if 857 // __kmp_threads[0] == NULL. 
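/* __kmp_enter_single() above elects exactly one thread per 'single' construct
   by doing a compare-and-swap on a team-wide construct counter: each thread
   bumps a private count of singles it has encountered, and the thread whose
   CAS advances the shared counter wins. Below is a hedged, self-contained
   sketch of that election using plain std::atomic; the names and the shared
   counter are inventions for the example, not the real kmp_team_t fields. */
#if 0 /* illustrative sketch, not compiled as part of the runtime */
#include <atomic>
#include <cstdio>
#include <thread>
#include <vector>

static std::atomic<int> team_construct{0}; // how many singles have been claimed

// Returns true for exactly one of the threads that call it with the same
// old_count (their private count of singles seen so far).
static bool enter_single(int old_count) {
  int expected = old_count;
  return team_construct.compare_exchange_strong(expected, old_count + 1,
                                                std::memory_order_acquire);
}

int main() {
  std::atomic<int> winners{0};
  std::vector<std::thread> team;
  for (int t = 0; t < 4; ++t)
    team.emplace_back([&] {
      int my_count = 0; // per-thread count of encountered singles
      if (enter_single(my_count))
        winners.fetch_add(1);
      ++my_count;
    });
  for (auto &th : team)
    th.join();
  std::printf("threads that executed the single: %d\n", winners.load()); // 1
  return 0;
}
#endif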
858 capacity = __kmp_threads_capacity; 859 if (TCR_PTR(__kmp_threads[0]) == NULL) { 860 --capacity; 861 } 862 // If it is not for initializing the hidden helper team, we need to take 863 // __kmp_hidden_helper_threads_num out of the capacity because it is included 864 // in __kmp_threads_capacity. 865 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 866 capacity -= __kmp_hidden_helper_threads_num; 867 } 868 if (__kmp_nth + new_nthreads - 869 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 870 capacity) { 871 // Expand the threads array. 872 int slotsRequired = __kmp_nth + new_nthreads - 873 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 874 capacity; 875 int slotsAdded = __kmp_expand_threads(slotsRequired); 876 if (slotsAdded < slotsRequired) { 877 // The threads array was not expanded enough. 878 new_nthreads -= (slotsRequired - slotsAdded); 879 KMP_ASSERT(new_nthreads >= 1); 880 881 // If dyn-var is false, emit a 1-time warning. 882 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 883 __kmp_reserve_warn = 1; 884 if (__kmp_tp_cached) { 885 __kmp_msg(kmp_ms_warning, 886 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 887 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 888 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 889 } else { 890 __kmp_msg(kmp_ms_warning, 891 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 892 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 893 } 894 } 895 } 896 } 897 898 #ifdef KMP_DEBUG 899 if (new_nthreads == 1) { 900 KC_TRACE(10, 901 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 902 "dead roots and rechecking; requested %d threads\n", 903 __kmp_get_gtid(), set_nthreads)); 904 } else { 905 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 906 " %d threads\n", 907 __kmp_get_gtid(), new_nthreads, set_nthreads)); 908 } 909 #endif // KMP_DEBUG 910 return new_nthreads; 911 } 912 913 /* Allocate threads from the thread pool and assign them to the new team. We are 914 assured that there are enough threads available, because we checked on that 915 earlier within critical section forkjoin */ 916 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 917 kmp_info_t *master_th, int master_gtid) { 918 int i; 919 int use_hot_team; 920 921 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 922 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 923 KMP_MB(); 924 925 /* first, let's setup the primary thread */ 926 master_th->th.th_info.ds.ds_tid = 0; 927 master_th->th.th_team = team; 928 master_th->th.th_team_nproc = team->t.t_nproc; 929 master_th->th.th_team_master = master_th; 930 master_th->th.th_team_serialized = FALSE; 931 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 932 933 /* make sure we are not the optimized hot team */ 934 #if KMP_NESTED_HOT_TEAMS 935 use_hot_team = 0; 936 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 937 if (hot_teams) { // hot teams array is not allocated if 938 // KMP_HOT_TEAMS_MAX_LEVEL=0 939 int level = team->t.t_active_level - 1; // index in array of hot teams 940 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
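/* __kmp_reserve_threads() above applies the same pattern to each cap
   (KMP_DEVICE_THREAD_LIMIT, OMP_THREAD_LIMIT, the __kmp_threads capacity):
   compute how many slots the cap still allows, warn once if that is fewer
   than requested, and never drop below one thread. The helper below is a
   hedged distillation of that clamping step with invented names; the real
   function also accounts for the hot team and the dynamic adjustment modes. */
#if 0 /* illustrative sketch, not compiled as part of the runtime */
#include <algorithm>
#include <cstdio>

// 'in_use' counts threads already charged against the cap; 'reusable' counts
// threads of the current (hot) team that the new team can take over.
static int clamp_to_limit(int requested, int in_use, int reusable, int cap) {
  int allowed = cap - in_use + reusable;
  return std::max(1, std::min(requested, allowed));
}

int main() {
  int n = 16;                     // what the num_threads clause asked for
  n = clamp_to_limit(n, 6, 4, 8); // device-wide limit of 8 -> 6 threads
  n = clamp_to_limit(n, 2, 1, 4); // contention-group limit of 4 -> 3 threads
  std::printf("reserved %d threads\n", n);
  return 0;
}
#endif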
941 if (master_th->th.th_teams_size.nteams > 1) { 942 ++level; // level was not increased in teams construct for 943 // team_of_masters 944 } 945 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 946 master_th->th.th_teams_level == team->t.t_level) { 947 ++level; // level was not increased in teams construct for 948 // team_of_workers before the parallel 949 } // team->t.t_level will be increased inside parallel 950 } 951 if (level < __kmp_hot_teams_max_level) { 952 if (hot_teams[level].hot_team) { 953 // hot team has already been allocated for given level 954 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 955 use_hot_team = 1; // the team is ready to use 956 } else { 957 use_hot_team = 0; // AC: threads are not allocated yet 958 hot_teams[level].hot_team = team; // remember new hot team 959 hot_teams[level].hot_team_nth = team->t.t_nproc; 960 } 961 } else { 962 use_hot_team = 0; 963 } 964 } 965 #else 966 use_hot_team = team == root->r.r_hot_team; 967 #endif 968 if (!use_hot_team) { 969 970 /* install the primary thread */ 971 team->t.t_threads[0] = master_th; 972 __kmp_initialize_info(master_th, team, 0, master_gtid); 973 974 /* now, install the worker threads */ 975 for (i = 1; i < team->t.t_nproc; i++) { 976 977 /* fork or reallocate a new thread and install it in team */ 978 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 979 team->t.t_threads[i] = thr; 980 KMP_DEBUG_ASSERT(thr); 981 KMP_DEBUG_ASSERT(thr->th.th_team == team); 982 /* align team and thread arrived states */ 983 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 984 "T#%d(%d:%d) join =%llu, plain=%llu\n", 985 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 986 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 987 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 988 team->t.t_bar[bs_plain_barrier].b_arrived)); 989 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 990 thr->th.th_teams_level = master_th->th.th_teams_level; 991 thr->th.th_teams_size = master_th->th.th_teams_size; 992 { // Initialize threads' barrier data. 993 int b; 994 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 995 for (b = 0; b < bs_last_barrier; ++b) { 996 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 997 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 998 #if USE_DEBUGGER 999 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1000 #endif 1001 } 1002 } 1003 } 1004 1005 #if KMP_AFFINITY_SUPPORTED 1006 __kmp_partition_places(team); 1007 #endif 1008 } 1009 1010 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1011 for (i = 0; i < team->t.t_nproc; i++) { 1012 kmp_info_t *thr = team->t.t_threads[i]; 1013 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1014 thr->th.th_prev_level != team->t.t_level) { 1015 team->t.t_display_affinity = 1; 1016 break; 1017 } 1018 } 1019 } 1020 1021 KMP_MB(); 1022 } 1023 1024 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1025 // Propagate any changes to the floating point control registers out to the team 1026 // We try to avoid unnecessary writes to the relevant cache line in the team 1027 // structure, so we don't make changes unless they are needed. 
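/* A hedged sketch of the check-before-write idea described in the comment
   above, using only the SSE control register via _mm_getcsr()/_mm_setcsr()
   from <xmmintrin.h> (x86 only); the team_fp_state type and the function
   names are invented for the example. The actual runtime code follows below
   and additionally handles the x87 control word. The point of the
   conditional store is to keep the team cache line clean when nothing has
   changed. */
#if 0 /* illustrative sketch, not compiled as part of the runtime */
#include <cstdio>
#include <xmmintrin.h>

struct team_fp_state {
  unsigned mxcsr = 0;
  bool saved = false;
};

// Primary thread: publish its MXCSR into the team, writing only on change.
static void propagate_fp_control(team_fp_state &team) {
  unsigned mxcsr = _mm_getcsr();
  if (!team.saved || team.mxcsr != mxcsr) { // check before write
    team.mxcsr = mxcsr;
    team.saved = true;
  }
}

// Worker thread: adopt the published value if it differs from the hardware.
static void update_hw_fp_control(const team_fp_state &team) {
  if (team.saved && _mm_getcsr() != team.mxcsr)
    _mm_setcsr(team.mxcsr);
}

int main() {
  team_fp_state team;
  propagate_fp_control(team);
  update_hw_fp_control(team);
  std::printf("team mxcsr = 0x%x\n", team.mxcsr);
  return 0;
}
#endif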
1028 inline static void propagateFPControl(kmp_team_t *team) { 1029 if (__kmp_inherit_fp_control) { 1030 kmp_int16 x87_fpu_control_word; 1031 kmp_uint32 mxcsr; 1032 1033 // Get primary thread's values of FPU control flags (both X87 and vector) 1034 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1035 __kmp_store_mxcsr(&mxcsr); 1036 mxcsr &= KMP_X86_MXCSR_MASK; 1037 1038 // There is no point looking at t_fp_control_saved here. 1039 // If it is TRUE, we still have to update the values if they are different 1040 // from those we now have. If it is FALSE we didn't save anything yet, but 1041 // our objective is the same. We have to ensure that the values in the team 1042 // are the same as those we have. 1043 // So, this code achieves what we need whether or not t_fp_control_saved is 1044 // true. By checking whether the value needs updating we avoid unnecessary 1045 // writes that would put the cache-line into a written state, causing all 1046 // threads in the team to have to read it again. 1047 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1048 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1049 // Although we don't use this value, other code in the runtime wants to know 1050 // whether it should restore them. So we must ensure it is correct. 1051 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1052 } else { 1053 // Similarly here. Don't write to this cache-line in the team structure 1054 // unless we have to. 1055 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1056 } 1057 } 1058 1059 // Do the opposite, setting the hardware registers to the updated values from 1060 // the team. 1061 inline static void updateHWFPControl(kmp_team_t *team) { 1062 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1063 // Only reset the fp control regs if they have been changed in the team. 1064 // the parallel region that we are exiting. 1065 kmp_int16 x87_fpu_control_word; 1066 kmp_uint32 mxcsr; 1067 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1068 __kmp_store_mxcsr(&mxcsr); 1069 mxcsr &= KMP_X86_MXCSR_MASK; 1070 1071 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1072 __kmp_clear_x87_fpu_status_word(); 1073 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1074 } 1075 1076 if (team->t.t_mxcsr != mxcsr) { 1077 __kmp_load_mxcsr(&team->t.t_mxcsr); 1078 } 1079 } 1080 } 1081 #else 1082 #define propagateFPControl(x) ((void)0) 1083 #define updateHWFPControl(x) ((void)0) 1084 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1085 1086 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1087 int realloc); // forward declaration 1088 1089 /* Run a parallel region that has been serialized, so runs only in a team of the 1090 single primary thread. 
*/ 1091 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1092 kmp_info_t *this_thr; 1093 kmp_team_t *serial_team; 1094 1095 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1096 1097 /* Skip all this code for autopar serialized loops since it results in 1098 unacceptable overhead */ 1099 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1100 return; 1101 1102 if (!TCR_4(__kmp_init_parallel)) 1103 __kmp_parallel_initialize(); 1104 __kmp_resume_if_soft_paused(); 1105 1106 this_thr = __kmp_threads[global_tid]; 1107 serial_team = this_thr->th.th_serial_team; 1108 1109 /* utilize the serialized team held by this thread */ 1110 KMP_DEBUG_ASSERT(serial_team); 1111 KMP_MB(); 1112 1113 if (__kmp_tasking_mode != tskm_immediate_exec) { 1114 KMP_DEBUG_ASSERT( 1115 this_thr->th.th_task_team == 1116 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1117 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1118 NULL); 1119 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1120 "team %p, new task_team = NULL\n", 1121 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1122 this_thr->th.th_task_team = NULL; 1123 } 1124 1125 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1126 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1127 proc_bind = proc_bind_false; 1128 } else if (proc_bind == proc_bind_default) { 1129 // No proc_bind clause was specified, so use the current value 1130 // of proc-bind-var for this parallel region. 1131 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1132 } 1133 // Reset for next parallel region 1134 this_thr->th.th_set_proc_bind = proc_bind_default; 1135 1136 #if OMPT_SUPPORT 1137 ompt_data_t ompt_parallel_data = ompt_data_none; 1138 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1139 if (ompt_enabled.enabled && 1140 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1141 1142 ompt_task_info_t *parent_task_info; 1143 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1144 1145 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1146 if (ompt_enabled.ompt_callback_parallel_begin) { 1147 int team_size = 1; 1148 1149 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1150 &(parent_task_info->task_data), &(parent_task_info->frame), 1151 &ompt_parallel_data, team_size, 1152 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1153 } 1154 } 1155 #endif // OMPT_SUPPORT 1156 1157 if (this_thr->th.th_team != serial_team) { 1158 // Nested level will be an index in the nested nthreads array 1159 int level = this_thr->th.th_team->t.t_level; 1160 1161 if (serial_team->t.t_serialized) { 1162 /* this serial team was already used 1163 TODO increase performance by making this locks more specific */ 1164 kmp_team_t *new_team; 1165 1166 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1167 1168 new_team = 1169 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1170 #if OMPT_SUPPORT 1171 ompt_parallel_data, 1172 #endif 1173 proc_bind, &this_thr->th.th_current_task->td_icvs, 1174 0 USE_NESTED_HOT_ARG(NULL)); 1175 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1176 KMP_ASSERT(new_team); 1177 1178 /* setup new serialized team and install it */ 1179 new_team->t.t_threads[0] = this_thr; 1180 new_team->t.t_parent = this_thr->th.th_team; 1181 serial_team = new_team; 1182 this_thr->th.th_serial_team = serial_team; 1183 1184 KF_TRACE( 1185 10, 1186 ("__kmpc_serialized_parallel: T#%d 
allocated new serial team %p\n", 1187 global_tid, serial_team)); 1188 1189 /* TODO the above breaks the requirement that if we run out of resources, 1190 then we can still guarantee that serialized teams are ok, since we may 1191 need to allocate a new one */ 1192 } else { 1193 KF_TRACE( 1194 10, 1195 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1196 global_tid, serial_team)); 1197 } 1198 1199 /* we have to initialize this serial team */ 1200 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1201 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1202 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1203 serial_team->t.t_ident = loc; 1204 serial_team->t.t_serialized = 1; 1205 serial_team->t.t_nproc = 1; 1206 serial_team->t.t_parent = this_thr->th.th_team; 1207 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1208 this_thr->th.th_team = serial_team; 1209 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1210 1211 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid, 1212 this_thr->th.th_current_task)); 1213 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1214 this_thr->th.th_current_task->td_flags.executing = 0; 1215 1216 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1217 1218 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1219 implicit task for each serialized task represented by 1220 team->t.t_serialized? */ 1221 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1222 &this_thr->th.th_current_task->td_parent->td_icvs); 1223 1224 // Thread value exists in the nested nthreads array for the next nested 1225 // level 1226 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1227 this_thr->th.th_current_task->td_icvs.nproc = 1228 __kmp_nested_nth.nth[level + 1]; 1229 } 1230 1231 if (__kmp_nested_proc_bind.used && 1232 (level + 1 < __kmp_nested_proc_bind.used)) { 1233 this_thr->th.th_current_task->td_icvs.proc_bind = 1234 __kmp_nested_proc_bind.bind_types[level + 1]; 1235 } 1236 1237 #if USE_DEBUGGER 1238 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
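/* __kmp_serialized_parallel() treats a serialized region mostly as
   bookkeeping: t_serialized acts as a nesting depth, each entry pushes a
   fresh dispatch buffer and a copy of the controlling ICVs, and the join
   pops them again. Below is a hedged, much-simplified model of that
   push/pop discipline (invented icv_t/serial_state types, no task or OMPT
   handling) meant only to make the nesting-counter idea concrete. */
#if 0 /* illustrative sketch, not compiled as part of the runtime */
#include <cstdio>
#include <vector>

struct icv_t {
  int nproc;     // nthreads-var for the next nested level
  int proc_bind; // proc-bind-var, kept as a plain int here
};

struct serial_state {
  int depth = 0;            // plays the role of t_serialized
  std::vector<icv_t> saved; // one snapshot per nesting level
};

static void enter_serialized(serial_state &s, const icv_t &current) {
  s.saved.push_back(current); // keep the parent's ICVs so join can restore them
  ++s.depth;
}

static icv_t exit_serialized(serial_state &s) {
  icv_t parent = s.saved.back();
  s.saved.pop_back();
  --s.depth;
  return parent;
}

int main() {
  serial_state s;
  icv_t icvs{4, 0};
  enter_serialized(s, icvs); // outer serialized parallel
  enter_serialized(s, icvs); // nested one: only the depth grows
  std::printf("nesting depth = %d\n", s.depth); // 2
  icvs = exit_serialized(s);
  icvs = exit_serialized(s);
  std::printf("back to depth %d, nproc = %d\n", s.depth, icvs.nproc);
  return 0;
}
#endif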
1239 #endif 1240 this_thr->th.th_info.ds.ds_tid = 0; 1241 1242 /* set thread cache values */ 1243 this_thr->th.th_team_nproc = 1; 1244 this_thr->th.th_team_master = this_thr; 1245 this_thr->th.th_team_serialized = 1; 1246 1247 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1248 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1249 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1250 1251 propagateFPControl(serial_team); 1252 1253 /* check if we need to allocate dispatch buffers stack */ 1254 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1255 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1256 serial_team->t.t_dispatch->th_disp_buffer = 1257 (dispatch_private_info_t *)__kmp_allocate( 1258 sizeof(dispatch_private_info_t)); 1259 } 1260 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1261 1262 KMP_MB(); 1263 1264 } else { 1265 /* this serialized team is already being used, 1266 * that's fine, just add another nested level */ 1267 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1268 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1269 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1270 ++serial_team->t.t_serialized; 1271 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1272 1273 // Nested level will be an index in the nested nthreads array 1274 int level = this_thr->th.th_team->t.t_level; 1275 // Thread value exists in the nested nthreads array for the next nested 1276 // level 1277 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1278 this_thr->th.th_current_task->td_icvs.nproc = 1279 __kmp_nested_nth.nth[level + 1]; 1280 } 1281 serial_team->t.t_level++; 1282 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1283 "of serial team %p to %d\n", 1284 global_tid, serial_team, serial_team->t.t_level)); 1285 1286 /* allocate/push dispatch buffers stack */ 1287 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1288 { 1289 dispatch_private_info_t *disp_buffer = 1290 (dispatch_private_info_t *)__kmp_allocate( 1291 sizeof(dispatch_private_info_t)); 1292 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1293 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1294 } 1295 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1296 1297 KMP_MB(); 1298 } 1299 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1300 1301 // Perform the display affinity functionality for 1302 // serialized parallel regions 1303 if (__kmp_display_affinity) { 1304 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1305 this_thr->th.th_prev_num_threads != 1) { 1306 // NULL means use the affinity-format-var ICV 1307 __kmp_aux_display_affinity(global_tid, NULL); 1308 this_thr->th.th_prev_level = serial_team->t.t_level; 1309 this_thr->th.th_prev_num_threads = 1; 1310 } 1311 } 1312 1313 if (__kmp_env_consistency_check) 1314 __kmp_push_parallel(global_tid, NULL); 1315 #if OMPT_SUPPORT 1316 serial_team->t.ompt_team_info.master_return_address = codeptr; 1317 if (ompt_enabled.enabled && 1318 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1319 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1320 OMPT_GET_FRAME_ADDRESS(0); 1321 1322 ompt_lw_taskteam_t lw_taskteam; 1323 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1324 &ompt_parallel_data, codeptr); 1325 1326 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1327 // don't use lw_taskteam after linking. 
content was swaped 1328 1329 /* OMPT implicit task begin */ 1330 if (ompt_enabled.ompt_callback_implicit_task) { 1331 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1332 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1333 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), 1334 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1335 OMPT_CUR_TASK_INFO(this_thr)->thread_num = 1336 __kmp_tid_from_gtid(global_tid); 1337 } 1338 1339 /* OMPT state */ 1340 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1341 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = 1342 OMPT_GET_FRAME_ADDRESS(0); 1343 } 1344 #endif 1345 } 1346 1347 /* most of the work for a fork */ 1348 /* return true if we really went parallel, false if serialized */ 1349 int __kmp_fork_call(ident_t *loc, int gtid, 1350 enum fork_context_e call_context, // Intel, GNU, ... 1351 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1352 kmp_va_list ap) { 1353 void **argv; 1354 int i; 1355 int master_tid; 1356 int master_this_cons; 1357 kmp_team_t *team; 1358 kmp_team_t *parent_team; 1359 kmp_info_t *master_th; 1360 kmp_root_t *root; 1361 int nthreads; 1362 int master_active; 1363 int master_set_numthreads; 1364 int level; 1365 int active_level; 1366 int teams_level; 1367 #if KMP_NESTED_HOT_TEAMS 1368 kmp_hot_team_ptr_t **p_hot_teams; 1369 #endif 1370 { // KMP_TIME_BLOCK 1371 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1372 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1373 1374 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1375 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1376 /* Some systems prefer the stack for the root thread(s) to start with */ 1377 /* some gap from the parent stack to prevent false sharing. */ 1378 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1379 /* These 2 lines below are so this does not get optimized out */ 1380 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1381 __kmp_stkpadding += (short)((kmp_int64)dummy); 1382 } 1383 1384 /* initialize if needed */ 1385 KMP_DEBUG_ASSERT( 1386 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1387 if (!TCR_4(__kmp_init_parallel)) 1388 __kmp_parallel_initialize(); 1389 __kmp_resume_if_soft_paused(); 1390 1391 /* setup current data */ 1392 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1393 // shutdown 1394 parent_team = master_th->th.th_team; 1395 master_tid = master_th->th.th_info.ds.ds_tid; 1396 master_this_cons = master_th->th.th_local.this_construct; 1397 root = master_th->th.th_root; 1398 master_active = root->r.r_active; 1399 master_set_numthreads = master_th->th.th_set_nproc; 1400 1401 #if OMPT_SUPPORT 1402 ompt_data_t ompt_parallel_data = ompt_data_none; 1403 ompt_data_t *parent_task_data; 1404 ompt_frame_t *ompt_frame; 1405 ompt_data_t *implicit_task_data; 1406 void *return_address = NULL; 1407 1408 if (ompt_enabled.enabled) { 1409 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1410 NULL, NULL); 1411 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1412 } 1413 #endif 1414 1415 // Assign affinity to root thread if it hasn't happened yet 1416 __kmp_assign_root_init_mask(); 1417 1418 // Nested level will be an index in the nested nthreads array 1419 level = parent_team->t.t_level; 1420 // used to launch non-serial teams even if nested is not allowed 1421 active_level = parent_team->t.t_active_level; 1422 // needed to check nesting inside the teams 1423 teams_level = master_th->th.th_teams_level; 1424 #if 
KMP_NESTED_HOT_TEAMS 1425 p_hot_teams = &master_th->th.th_hot_teams; 1426 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1427 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1428 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1429 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1430 // it is either actual or not needed (when active_level > 0) 1431 (*p_hot_teams)[0].hot_team_nth = 1; 1432 } 1433 #endif 1434 1435 #if OMPT_SUPPORT 1436 if (ompt_enabled.enabled) { 1437 if (ompt_enabled.ompt_callback_parallel_begin) { 1438 int team_size = master_set_numthreads 1439 ? master_set_numthreads 1440 : get__nproc_2(parent_team, master_tid); 1441 int flags = OMPT_INVOKER(call_context) | 1442 ((microtask == (microtask_t)__kmp_teams_master) 1443 ? ompt_parallel_league 1444 : ompt_parallel_team); 1445 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1446 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1447 return_address); 1448 } 1449 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1450 } 1451 #endif 1452 1453 master_th->th.th_ident = loc; 1454 1455 if (master_th->th.th_teams_microtask && ap && 1456 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1457 // AC: This is start of parallel that is nested inside teams construct. 1458 // The team is actual (hot), all workers are ready at the fork barrier. 1459 // No lock needed to initialize the team a bit, then free workers. 1460 parent_team->t.t_ident = loc; 1461 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1462 parent_team->t.t_argc = argc; 1463 argv = (void **)parent_team->t.t_argv; 1464 for (i = argc - 1; i >= 0; --i) 1465 *argv++ = va_arg(kmp_va_deref(ap), void *); 1466 // Increment our nested depth levels, but not increase the serialization 1467 if (parent_team == master_th->th.th_serial_team) { 1468 // AC: we are in serialized parallel 1469 __kmpc_serialized_parallel(loc, gtid); 1470 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1471 1472 if (call_context == fork_context_gnu) { 1473 // AC: need to decrement t_serialized for enquiry functions to work 1474 // correctly, will restore at join time 1475 parent_team->t.t_serialized--; 1476 return TRUE; 1477 } 1478 1479 #if OMPD_SUPPORT 1480 parent_team->t.t_pkfn = microtask; 1481 #endif 1482 1483 #if OMPT_SUPPORT 1484 void *dummy; 1485 void **exit_frame_p; 1486 1487 ompt_lw_taskteam_t lw_taskteam; 1488 1489 if (ompt_enabled.enabled) { 1490 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1491 &ompt_parallel_data, return_address); 1492 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1493 1494 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1495 // don't use lw_taskteam after linking. 
content was swaped 1496 1497 /* OMPT implicit task begin */ 1498 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1499 if (ompt_enabled.ompt_callback_implicit_task) { 1500 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1501 __kmp_tid_from_gtid(gtid); 1502 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1503 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1504 implicit_task_data, 1, 1505 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1506 } 1507 1508 /* OMPT state */ 1509 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1510 } else { 1511 exit_frame_p = &dummy; 1512 } 1513 #endif 1514 // AC: need to decrement t_serialized for enquiry functions to work 1515 // correctly, will restore at join time 1516 parent_team->t.t_serialized--; 1517 1518 { 1519 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1520 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1521 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1522 #if OMPT_SUPPORT 1523 , 1524 exit_frame_p 1525 #endif 1526 ); 1527 } 1528 1529 #if OMPT_SUPPORT 1530 if (ompt_enabled.enabled) { 1531 *exit_frame_p = NULL; 1532 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1533 if (ompt_enabled.ompt_callback_implicit_task) { 1534 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1535 ompt_scope_end, NULL, implicit_task_data, 1, 1536 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1537 } 1538 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1539 __ompt_lw_taskteam_unlink(master_th); 1540 if (ompt_enabled.ompt_callback_parallel_end) { 1541 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1542 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1543 OMPT_INVOKER(call_context) | ompt_parallel_team, 1544 return_address); 1545 } 1546 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1547 } 1548 #endif 1549 return TRUE; 1550 } 1551 1552 parent_team->t.t_pkfn = microtask; 1553 parent_team->t.t_invoke = invoker; 1554 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1555 parent_team->t.t_active_level++; 1556 parent_team->t.t_level++; 1557 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1558 1559 #if OMPT_SUPPORT 1560 if (ompt_enabled.enabled) { 1561 ompt_lw_taskteam_t lw_taskteam; 1562 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1563 &ompt_parallel_data, return_address); 1564 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1565 } 1566 #endif 1567 1568 /* Change number of threads in the team if requested */ 1569 if (master_set_numthreads) { // The parallel has num_threads clause 1570 if (master_set_numthreads <= master_th->th.th_teams_size.nth) { 1571 // AC: only can reduce number of threads dynamically, can't increase 1572 kmp_info_t **other_threads = parent_team->t.t_threads; 1573 // NOTE: if using distributed barrier, we need to run this code block 1574 // even when the team size appears not to have changed from the max. 
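/* The fork path above copies the parallel region's arguments out of the
   va_list into the team's argv array and later hands that array to
   __kmp_invoke_microtask(), which calls the outlined function with
   (&gtid, &tid, argv[0], ...). The sketch below shows those two steps for a
   fixed two-argument outlined function; the microtask signature used here and
   the names fork_two_args()/my_microtask are simplifications invented for the
   example. */
#if 0 /* illustrative sketch, not compiled as part of the runtime */
#include <cstdarg>
#include <cstdio>

typedef void (*microtask2_t)(int *gtid, int *tid, void *arg0, void *arg1);

// Collect 'argc' pointer arguments from the va_list, then invoke the microtask.
static void fork_two_args(microtask2_t task, int gtid, int argc, ...) {
  void *argv[2] = {nullptr, nullptr};
  va_list ap;
  va_start(ap, argc);
  for (int i = 0; i < argc && i < 2; ++i)
    argv[i] = va_arg(ap, void *);
  va_end(ap);
  int tid = 0; // the "primary thread" of this toy team
  task(&gtid, &tid, argv[0], argv[1]);
}

static void my_microtask(int *gtid, int *tid, void *arg0, void *arg1) {
  std::printf("gtid=%d tid=%d sum=%d\n", *gtid, *tid,
              *static_cast<int *>(arg0) + *static_cast<int *>(arg1));
}

int main() {
  int a = 2, b = 3;
  fork_two_args(my_microtask, 7, 2, static_cast<void *>(&a),
                static_cast<void *>(&b));
  return 0;
}
#endif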
1575 int old_proc = master_th->th.th_teams_size.nth; 1576 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 1577 bp_dist_bar) { 1578 __kmp_resize_dist_barrier(parent_team, old_proc, 1579 master_set_numthreads); 1580 __kmp_add_threads_to_team(parent_team, master_set_numthreads); 1581 } 1582 parent_team->t.t_nproc = master_set_numthreads; 1583 for (i = 0; i < master_set_numthreads; ++i) { 1584 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1585 } 1586 } 1587 // Keep extra threads hot in the team for possible next parallels 1588 master_th->th.th_set_nproc = 0; 1589 } 1590 1591 #if USE_DEBUGGER 1592 if (__kmp_debugging) { // Let debugger override number of threads. 1593 int nth = __kmp_omp_num_threads(loc); 1594 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1595 master_set_numthreads = nth; 1596 } 1597 } 1598 #endif 1599 1600 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1601 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1602 KMP_ITT_DEBUG) && 1603 __kmp_forkjoin_frames_mode == 3 && 1604 parent_team->t.t_active_level == 1 // only report frames at level 1 1605 && master_th->th.th_teams_size.nteams == 1) { 1606 kmp_uint64 tmp_time = __itt_get_timestamp(); 1607 master_th->th.th_frame_time = tmp_time; 1608 parent_team->t.t_region_time = tmp_time; 1609 } 1610 if (__itt_stack_caller_create_ptr) { 1611 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 1612 // create new stack stitching id before entering fork barrier 1613 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1614 } 1615 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1616 1617 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1618 "master_th=%p, gtid=%d\n", 1619 root, parent_team, master_th, gtid)); 1620 __kmp_internal_fork(loc, gtid, parent_team); 1621 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1622 "master_th=%p, gtid=%d\n", 1623 root, parent_team, master_th, gtid)); 1624 1625 if (call_context == fork_context_gnu) 1626 return TRUE; 1627 1628 /* Invoke microtask for PRIMARY thread */ 1629 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1630 parent_team->t.t_id, parent_team->t.t_pkfn)); 1631 1632 if (!parent_team->t.t_invoke(gtid)) { 1633 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 1634 } 1635 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1636 parent_team->t.t_id, parent_team->t.t_pkfn)); 1637 KMP_MB(); /* Flush all pending memory write invalidates. */ 1638 1639 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1640 1641 return TRUE; 1642 } // Parallel closely nested in teams construct 1643 1644 #if KMP_DEBUG 1645 if (__kmp_tasking_mode != tskm_immediate_exec) { 1646 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1647 parent_team->t.t_task_team[master_th->th.th_task_state]); 1648 } 1649 #endif 1650 1651 // Need this to happen before we determine the number of threads, not while 1652 // we are allocating the team 1653 //__kmp_push_current_task_to_thread(master_th, parent_team, 0); 1654 int enter_teams = 0; 1655 if (parent_team->t.t_active_level >= 1656 master_th->th.th_current_task->td_icvs.max_active_levels) { 1657 nthreads = 1; 1658 } else { 1659 enter_teams = ((ap == NULL && active_level == 0) || 1660 (ap && teams_level > 0 && teams_level == level)); 1661 nthreads = master_set_numthreads 1662 ? 
master_set_numthreads 1663 // TODO: get nproc directly from current task 1664 : get__nproc_2(parent_team, master_tid); 1665 // Check if we need to take forkjoin lock? (no need for serialized 1666 // parallel out of teams construct). This code moved here from 1667 // __kmp_reserve_threads() to speedup nested serialized parallels. 1668 if (nthreads > 1) { 1669 if ((get__max_active_levels(master_th) == 1 && 1670 (root->r.r_in_parallel && !enter_teams)) || 1671 (__kmp_library == library_serial)) { 1672 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1673 " threads\n", 1674 gtid, nthreads)); 1675 nthreads = 1; 1676 } 1677 } 1678 if (nthreads > 1) { 1679 /* determine how many new threads we can use */ 1680 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1681 /* AC: If we execute teams from parallel region (on host), then teams 1682 should be created but each can only have 1 thread if nesting is 1683 disabled. If teams called from serial region, then teams and their 1684 threads should be created regardless of the nesting setting. */ 1685 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1686 nthreads, enter_teams); 1687 if (nthreads == 1) { 1688 // Free lock for single thread execution here; for multi-thread 1689 // execution it will be freed later after team of threads created 1690 // and initialized 1691 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1692 } 1693 } 1694 } 1695 KMP_DEBUG_ASSERT(nthreads > 0); 1696 1697 // If we temporarily changed the set number of threads then restore it now 1698 master_th->th.th_set_nproc = 0; 1699 1700 /* create a serialized parallel region? */ 1701 if (nthreads == 1) { 1702 /* josh todo: hypothetical question: what do we do for OS X*? */ 1703 #if KMP_OS_LINUX && \ 1704 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1705 void *args[argc]; 1706 #else 1707 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1708 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1709 KMP_ARCH_AARCH64) */ 1710 1711 KA_TRACE(20, 1712 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1713 1714 __kmpc_serialized_parallel(loc, gtid); 1715 1716 #if OMPD_SUPPORT 1717 master_th->th.th_serial_team->t.t_pkfn = microtask; 1718 #endif 1719 1720 if (call_context == fork_context_intel) { 1721 /* TODO this sucks, use the compiler itself to pass args! :) */ 1722 master_th->th.th_serial_team->t.t_ident = loc; 1723 if (!ap) { 1724 // revert change made in __kmpc_serialized_parallel() 1725 master_th->th.th_serial_team->t.t_level--; 1726 // Get args from parent team for teams construct 1727 1728 #if OMPT_SUPPORT 1729 void *dummy; 1730 void **exit_frame_p; 1731 ompt_task_info_t *task_info; 1732 1733 ompt_lw_taskteam_t lw_taskteam; 1734 1735 if (ompt_enabled.enabled) { 1736 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1737 &ompt_parallel_data, return_address); 1738 1739 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1740 // don't use lw_taskteam after linking. 
content was swaped 1741 1742 task_info = OMPT_CUR_TASK_INFO(master_th); 1743 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1744 if (ompt_enabled.ompt_callback_implicit_task) { 1745 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1746 __kmp_tid_from_gtid(gtid); 1747 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1748 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1749 &(task_info->task_data), 1, 1750 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1751 ompt_task_implicit); 1752 } 1753 1754 /* OMPT state */ 1755 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1756 } else { 1757 exit_frame_p = &dummy; 1758 } 1759 #endif 1760 1761 { 1762 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1763 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1764 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1765 parent_team->t.t_argv 1766 #if OMPT_SUPPORT 1767 , 1768 exit_frame_p 1769 #endif 1770 ); 1771 } 1772 1773 #if OMPT_SUPPORT 1774 if (ompt_enabled.enabled) { 1775 *exit_frame_p = NULL; 1776 if (ompt_enabled.ompt_callback_implicit_task) { 1777 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1778 ompt_scope_end, NULL, &(task_info->task_data), 1, 1779 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1780 ompt_task_implicit); 1781 } 1782 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1783 __ompt_lw_taskteam_unlink(master_th); 1784 if (ompt_enabled.ompt_callback_parallel_end) { 1785 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1786 &ompt_parallel_data, parent_task_data, 1787 OMPT_INVOKER(call_context) | ompt_parallel_team, 1788 return_address); 1789 } 1790 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1791 } 1792 #endif 1793 } else if (microtask == (microtask_t)__kmp_teams_master) { 1794 KMP_DEBUG_ASSERT(master_th->th.th_team == 1795 master_th->th.th_serial_team); 1796 team = master_th->th.th_team; 1797 // team->t.t_pkfn = microtask; 1798 team->t.t_invoke = invoker; 1799 __kmp_alloc_argv_entries(argc, team, TRUE); 1800 team->t.t_argc = argc; 1801 argv = (void **)team->t.t_argv; 1802 if (ap) { 1803 for (i = argc - 1; i >= 0; --i) 1804 *argv++ = va_arg(kmp_va_deref(ap), void *); 1805 } else { 1806 for (i = 0; i < argc; ++i) 1807 // Get args from parent team for teams construct 1808 argv[i] = parent_team->t.t_argv[i]; 1809 } 1810 // AC: revert change made in __kmpc_serialized_parallel() 1811 // because initial code in teams should have level=0 1812 team->t.t_level--; 1813 // AC: call special invoker for outer "parallel" of teams construct 1814 invoker(gtid); 1815 #if OMPT_SUPPORT 1816 if (ompt_enabled.enabled) { 1817 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1818 if (ompt_enabled.ompt_callback_implicit_task) { 1819 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1820 ompt_scope_end, NULL, &(task_info->task_data), 0, 1821 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1822 } 1823 if (ompt_enabled.ompt_callback_parallel_end) { 1824 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1825 &ompt_parallel_data, parent_task_data, 1826 OMPT_INVOKER(call_context) | ompt_parallel_league, 1827 return_address); 1828 } 1829 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1830 } 1831 #endif 1832 } else { 1833 argv = args; 1834 for (i = argc - 1; i >= 0; --i) 1835 *argv++ = va_arg(kmp_va_deref(ap), void *); 1836 KMP_MB(); 1837 1838 #if OMPT_SUPPORT 1839 void *dummy; 1840 void **exit_frame_p; 1841 ompt_task_info_t *task_info; 1842 1843 ompt_lw_taskteam_t lw_taskteam; 1844 1845 if (ompt_enabled.enabled) { 1846 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1847 &ompt_parallel_data, return_address); 1848 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1849 // don't use lw_taskteam after linking. content was swaped 1850 task_info = OMPT_CUR_TASK_INFO(master_th); 1851 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1852 1853 /* OMPT implicit task begin */ 1854 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1855 if (ompt_enabled.ompt_callback_implicit_task) { 1856 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1857 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1858 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1859 ompt_task_implicit); 1860 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1861 __kmp_tid_from_gtid(gtid); 1862 } 1863 1864 /* OMPT state */ 1865 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1866 } else { 1867 exit_frame_p = &dummy; 1868 } 1869 #endif 1870 1871 { 1872 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1873 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1874 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1875 #if OMPT_SUPPORT 1876 , 1877 exit_frame_p 1878 #endif 1879 ); 1880 } 1881 1882 #if OMPT_SUPPORT 1883 if (ompt_enabled.enabled) { 1884 *exit_frame_p = NULL; 1885 if (ompt_enabled.ompt_callback_implicit_task) { 1886 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1887 ompt_scope_end, NULL, &(task_info->task_data), 1, 1888 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1889 ompt_task_implicit); 1890 } 1891 1892 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1893 __ompt_lw_taskteam_unlink(master_th); 1894 if (ompt_enabled.ompt_callback_parallel_end) { 1895 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1896 &ompt_parallel_data, parent_task_data, 1897 OMPT_INVOKER(call_context) | ompt_parallel_team, 1898 return_address); 1899 } 1900 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1901 } 1902 #endif 1903 } 1904 } else if (call_context == fork_context_gnu) { 1905 #if OMPT_SUPPORT 1906 ompt_lw_taskteam_t lwt; 1907 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1908 return_address); 1909 1910 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1911 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1912 // don't use lw_taskteam after linking. 
content was swaped 1913 #endif 1914 1915 // we were called from GNU native code 1916 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1917 return FALSE; 1918 } else { 1919 KMP_ASSERT2(call_context < fork_context_last, 1920 "__kmp_fork_call: unknown fork_context parameter"); 1921 } 1922 1923 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1924 KMP_MB(); 1925 return FALSE; 1926 } // if (nthreads == 1) 1927 1928 // GEH: only modify the executing flag in the case when not serialized 1929 // serialized case is handled in kmpc_serialized_parallel 1930 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1931 "curtask=%p, curtask_max_aclevel=%d\n", 1932 parent_team->t.t_active_level, master_th, 1933 master_th->th.th_current_task, 1934 master_th->th.th_current_task->td_icvs.max_active_levels)); 1935 // TODO: GEH - cannot do this assertion because root thread not set up as 1936 // executing 1937 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1938 master_th->th.th_current_task->td_flags.executing = 0; 1939 1940 if (!master_th->th.th_teams_microtask || level > teams_level) { 1941 /* Increment our nested depth level */ 1942 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1943 } 1944 1945 // See if we need to make a copy of the ICVs. 1946 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1947 if ((level + 1 < __kmp_nested_nth.used) && 1948 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1949 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1950 } else { 1951 nthreads_icv = 0; // don't update 1952 } 1953 1954 // Figure out the proc_bind_policy for the new team. 1955 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1956 kmp_proc_bind_t proc_bind_icv = 1957 proc_bind_default; // proc_bind_default means don't update 1958 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1959 proc_bind = proc_bind_false; 1960 } else { 1961 if (proc_bind == proc_bind_default) { 1962 // No proc_bind clause specified; use current proc-bind-var for this 1963 // parallel region 1964 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1965 } 1966 /* else: The proc_bind policy was specified explicitly on parallel clause. 1967 This overrides proc-bind-var for this parallel region, but does not 1968 change proc-bind-var. */ 1969 // Figure the value of proc-bind-var for the child threads. 
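// Illustrative example (assuming the usual parsing of the OMP_PROC_BIND list):
// with OMP_PROC_BIND="spread,close", __kmp_nested_proc_bind.bind_types holds
// {proc_bind_spread, proc_bind_close}, so a level-0 parallel binds spread while
// proc_bind_icv below carries proc_bind_close into the child threads' ICVs.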
1970 if ((level + 1 < __kmp_nested_proc_bind.used) && 1971 (__kmp_nested_proc_bind.bind_types[level + 1] != 1972 master_th->th.th_current_task->td_icvs.proc_bind)) { 1973 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1974 } 1975 } 1976 1977 // Reset for next parallel region 1978 master_th->th.th_set_proc_bind = proc_bind_default; 1979 1980 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1981 kmp_internal_control_t new_icvs; 1982 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1983 new_icvs.next = NULL; 1984 if (nthreads_icv > 0) { 1985 new_icvs.nproc = nthreads_icv; 1986 } 1987 if (proc_bind_icv != proc_bind_default) { 1988 new_icvs.proc_bind = proc_bind_icv; 1989 } 1990 1991 /* allocate a new parallel team */ 1992 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1993 team = __kmp_allocate_team(root, nthreads, nthreads, 1994 #if OMPT_SUPPORT 1995 ompt_parallel_data, 1996 #endif 1997 proc_bind, &new_icvs, 1998 argc USE_NESTED_HOT_ARG(master_th)); 1999 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2000 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, &new_icvs); 2001 } else { 2002 /* allocate a new parallel team */ 2003 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2004 team = __kmp_allocate_team(root, nthreads, nthreads, 2005 #if OMPT_SUPPORT 2006 ompt_parallel_data, 2007 #endif 2008 proc_bind, 2009 &master_th->th.th_current_task->td_icvs, 2010 argc USE_NESTED_HOT_ARG(master_th)); 2011 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) 2012 copy_icvs((kmp_internal_control_t *)team->t.b->team_icvs, 2013 &master_th->th.th_current_task->td_icvs); 2014 } 2015 KF_TRACE( 2016 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2017 2018 /* setup the new team */ 2019 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2020 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2021 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2022 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2023 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2024 #if OMPT_SUPPORT 2025 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2026 return_address); 2027 #endif 2028 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2029 // TODO: parent_team->t.t_level == INT_MAX ??? 2030 if (!master_th->th.th_teams_microtask || level > teams_level) { 2031 int new_level = parent_team->t.t_level + 1; 2032 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2033 new_level = parent_team->t.t_active_level + 1; 2034 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2035 } else { 2036 // AC: Do not increase parallel level at start of the teams construct 2037 int new_level = parent_team->t.t_level; 2038 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2039 new_level = parent_team->t.t_active_level; 2040 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2041 } 2042 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2043 // set primary thread's schedule as new run-time schedule 2044 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2045 2046 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2047 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2048 2049 // Update the floating point rounding in the team if required. 
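// (Roughly, on x86: when FP-control inheritance is enabled (KMP_INHERIT_FP_CONTROL),
// propagateFPControl records the primary thread's x87 control word and MXCSR in the
// team so that worker threads can adopt the same rounding and exception settings
// when they start executing the region.)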
2050 propagateFPControl(team); 2051 #if OMPD_SUPPORT 2052 if (ompd_state & OMPD_ENABLE_BP) 2053 ompd_bp_parallel_begin(); 2054 #endif 2055 2056 if (__kmp_tasking_mode != tskm_immediate_exec) { 2057 // Set primary thread's task team to team's task team. Unless this is hot 2058 // team, it should be NULL. 2059 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2060 parent_team->t.t_task_team[master_th->th.th_task_state]); 2061 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2062 "%p, new task_team %p / team %p\n", 2063 __kmp_gtid_from_thread(master_th), 2064 master_th->th.th_task_team, parent_team, 2065 team->t.t_task_team[master_th->th.th_task_state], team)); 2066 2067 if (active_level || master_th->th.th_task_team) { 2068 // Take a memo of primary thread's task_state 2069 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2070 if (master_th->th.th_task_state_top >= 2071 master_th->th.th_task_state_stack_sz) { // increase size 2072 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2073 kmp_uint8 *old_stack, *new_stack; 2074 kmp_uint32 i; 2075 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2076 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2077 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2078 } 2079 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2080 ++i) { // zero-init rest of stack 2081 new_stack[i] = 0; 2082 } 2083 old_stack = master_th->th.th_task_state_memo_stack; 2084 master_th->th.th_task_state_memo_stack = new_stack; 2085 master_th->th.th_task_state_stack_sz = new_size; 2086 __kmp_free(old_stack); 2087 } 2088 // Store primary thread's task_state on stack 2089 master_th->th 2090 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2091 master_th->th.th_task_state; 2092 master_th->th.th_task_state_top++; 2093 #if KMP_NESTED_HOT_TEAMS 2094 if (master_th->th.th_hot_teams && 2095 active_level < __kmp_hot_teams_max_level && 2096 team == master_th->th.th_hot_teams[active_level].hot_team) { 2097 // Restore primary thread's nested state if nested hot team 2098 master_th->th.th_task_state = 2099 master_th->th 2100 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2101 } else { 2102 #endif 2103 master_th->th.th_task_state = 0; 2104 #if KMP_NESTED_HOT_TEAMS 2105 } 2106 #endif 2107 } 2108 #if !KMP_NESTED_HOT_TEAMS 2109 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2110 (team == root->r.r_hot_team)); 2111 #endif 2112 } 2113 2114 KA_TRACE( 2115 20, 2116 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2117 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2118 team->t.t_nproc)); 2119 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2120 (team->t.t_master_tid == 0 && 2121 (team->t.t_parent == root->r.r_root_team || 2122 team->t.t_parent->t.t_serialized))); 2123 KMP_MB(); 2124 2125 /* now, setup the arguments */ 2126 argv = (void **)team->t.t_argv; 2127 if (ap) { 2128 for (i = argc - 1; i >= 0; --i) { 2129 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2130 KMP_CHECK_UPDATE(*argv, new_argv); 2131 argv++; 2132 } 2133 } else { 2134 for (i = 0; i < argc; ++i) { 2135 // Get args from parent team for teams construct 2136 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2137 } 2138 } 2139 2140 /* now actually fork the threads */ 2141 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2142 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2143 root->r.r_active = TRUE; 2144 2145 __kmp_fork_team_threads(root, 
team, master_th, gtid); 2146 __kmp_setup_icv_copy(team, nthreads, 2147 &master_th->th.th_current_task->td_icvs, loc); 2148 2149 #if OMPT_SUPPORT 2150 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2151 #endif 2152 2153 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2154 2155 #if USE_ITT_BUILD 2156 if (team->t.t_active_level == 1 // only report frames at level 1 2157 && !master_th->th.th_teams_microtask) { // not in teams construct 2158 #if USE_ITT_NOTIFY 2159 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2160 (__kmp_forkjoin_frames_mode == 3 || 2161 __kmp_forkjoin_frames_mode == 1)) { 2162 kmp_uint64 tmp_time = 0; 2163 if (__itt_get_timestamp_ptr) 2164 tmp_time = __itt_get_timestamp(); 2165 // Internal fork - report frame begin 2166 master_th->th.th_frame_time = tmp_time; 2167 if (__kmp_forkjoin_frames_mode == 3) 2168 team->t.t_region_time = tmp_time; 2169 } else 2170 // only one notification scheme (either "submit" or "forking/joined", not both) 2171 #endif /* USE_ITT_NOTIFY */ 2172 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2173 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2174 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2175 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2176 } 2177 } 2178 #endif /* USE_ITT_BUILD */ 2179 2180 /* now go on and do the work */ 2181 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2182 KMP_MB(); 2183 KF_TRACE(10, 2184 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2185 root, team, master_th, gtid)); 2186 2187 #if USE_ITT_BUILD 2188 if (__itt_stack_caller_create_ptr) { 2189 // create new stack stitching id before entering fork barrier 2190 if (!enter_teams) { 2191 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2192 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2193 } else if (parent_team->t.t_serialized) { 2194 // keep stack stitching id in the serialized parent_team; 2195 // current team will be used for parallel inside the teams; 2196 // if parent_team is active, then it already keeps stack stitching id 2197 // for the league of teams 2198 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2199 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2200 } 2201 } 2202 #endif /* USE_ITT_BUILD */ 2203 2204 // AC: skip __kmp_internal_fork at teams construct, let only primary 2205 // threads execute 2206 if (ap) { 2207 __kmp_internal_fork(loc, gtid, team); 2208 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2209 "master_th=%p, gtid=%d\n", 2210 root, team, master_th, gtid)); 2211 } 2212 2213 if (call_context == fork_context_gnu) { 2214 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2215 return TRUE; 2216 } 2217 2218 /* Invoke microtask for PRIMARY thread */ 2219 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2220 team->t.t_id, team->t.t_pkfn)); 2221 } // END of timer KMP_fork_call block 2222 2223 #if KMP_STATS_ENABLED 2224 // If beginning a teams construct, then change thread state 2225 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2226 if (!ap) { 2227 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2228 } 2229 #endif 2230 2231 if (!team->t.t_invoke(gtid)) { 2232 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2233 } 2234 2235 #if KMP_STATS_ENABLED 2236 // If was beginning of a teams construct, then reset thread state 2237 if (!ap) { 2238 KMP_SET_THREAD_STATE(previous_state); 2239 } 2240 #endif 2241 2242 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done 
microtask = %p\n", gtid, 2243 team->t.t_id, team->t.t_pkfn)); 2244 KMP_MB(); /* Flush all pending memory write invalidates. */ 2245 2246 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2247 #if OMPT_SUPPORT 2248 if (ompt_enabled.enabled) { 2249 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2250 } 2251 #endif 2252 2253 return TRUE; 2254 } 2255 2256 #if OMPT_SUPPORT 2257 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2258 kmp_team_t *team) { 2259 // restore state outside the region 2260 thread->th.ompt_thread_info.state = 2261 ((team->t.t_serialized) ? ompt_state_work_serial 2262 : ompt_state_work_parallel); 2263 } 2264 2265 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2266 kmp_team_t *team, ompt_data_t *parallel_data, 2267 int flags, void *codeptr) { 2268 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2269 if (ompt_enabled.ompt_callback_parallel_end) { 2270 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2271 parallel_data, &(task_info->task_data), flags, codeptr); 2272 } 2273 2274 task_info->frame.enter_frame = ompt_data_none; 2275 __kmp_join_restore_state(thread, team); 2276 } 2277 #endif 2278 2279 void __kmp_join_call(ident_t *loc, int gtid 2280 #if OMPT_SUPPORT 2281 , 2282 enum fork_context_e fork_context 2283 #endif 2284 , 2285 int exit_teams) { 2286 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2287 kmp_team_t *team; 2288 kmp_team_t *parent_team; 2289 kmp_info_t *master_th; 2290 kmp_root_t *root; 2291 int master_active; 2292 2293 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2294 2295 /* setup current data */ 2296 master_th = __kmp_threads[gtid]; 2297 root = master_th->th.th_root; 2298 team = master_th->th.th_team; 2299 parent_team = team->t.t_parent; 2300 2301 master_th->th.th_ident = loc; 2302 2303 #if OMPT_SUPPORT 2304 void *team_microtask = (void *)team->t.t_pkfn; 2305 // For GOMP interface with serialized parallel, need the 2306 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2307 // and end-parallel events. 
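// In other words, for a serialized region entered through the GNU entry points the
// thread state is left as-is here; presumably setting it to ompt_state_overhead
// this early would mis-report the state observed by those end-of-region callbacks.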
2308 if (ompt_enabled.enabled && 2309 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2310 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2311 } 2312 #endif 2313 2314 #if KMP_DEBUG 2315 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2316 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2317 "th_task_team = %p\n", 2318 __kmp_gtid_from_thread(master_th), team, 2319 team->t.t_task_team[master_th->th.th_task_state], 2320 master_th->th.th_task_team)); 2321 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2322 team->t.t_task_team[master_th->th.th_task_state]); 2323 } 2324 #endif 2325 2326 if (team->t.t_serialized) { 2327 if (master_th->th.th_teams_microtask) { 2328 // We are in teams construct 2329 int level = team->t.t_level; 2330 int tlevel = master_th->th.th_teams_level; 2331 if (level == tlevel) { 2332 // AC: we haven't incremented it earlier at start of teams construct, 2333 // so do it here - at the end of teams construct 2334 team->t.t_level++; 2335 } else if (level == tlevel + 1) { 2336 // AC: we are exiting parallel inside teams, need to increment 2337 // serialization in order to restore it in the next call to 2338 // __kmpc_end_serialized_parallel 2339 team->t.t_serialized++; 2340 } 2341 } 2342 __kmpc_end_serialized_parallel(loc, gtid); 2343 2344 #if OMPT_SUPPORT 2345 if (ompt_enabled.enabled) { 2346 __kmp_join_restore_state(master_th, parent_team); 2347 } 2348 #endif 2349 2350 return; 2351 } 2352 2353 master_active = team->t.t_master_active; 2354 2355 if (!exit_teams) { 2356 // AC: No barrier for internal teams at exit from teams construct. 2357 // But there is barrier for external team (league). 2358 __kmp_internal_join(loc, gtid, team); 2359 #if USE_ITT_BUILD 2360 if (__itt_stack_caller_create_ptr) { 2361 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2362 // destroy the stack stitching id after join barrier 2363 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2364 team->t.t_stack_id = NULL; 2365 } 2366 #endif 2367 } else { 2368 master_th->th.th_task_state = 2369 0; // AC: no tasking in teams (out of any parallel) 2370 #if USE_ITT_BUILD 2371 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2372 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2373 // destroy the stack stitching id on exit from the teams construct 2374 // if parent_team is active, then the id will be destroyed later on 2375 // by master of the league of teams 2376 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2377 parent_team->t.t_stack_id = NULL; 2378 } 2379 #endif 2380 2381 if (team->t.t_nproc > 1 && 2382 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2383 team->t.b->update_num_threads(team->t.t_nproc); 2384 __kmp_add_threads_to_team(team, team->t.t_nproc); 2385 } 2386 } 2387 2388 KMP_MB(); 2389 2390 #if OMPT_SUPPORT 2391 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2392 void *codeptr = team->t.ompt_team_info.master_return_address; 2393 #endif 2394 2395 #if USE_ITT_BUILD 2396 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
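// Two reporting schemes are used here, mirroring the fork side above:
// __kmp_forkjoin_frames_mode == 3 submits a frame built from the region/frame
// timestamps recorded at fork time, while mode 0 (with frame reporting enabled,
// presumably via KMP_FORKJOIN_FRAMES) falls back to the simpler region-joined
// notification.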
2397 if (team->t.t_active_level == 1 && 2398 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2399 master_th->th.th_teams_size.nteams == 1)) { 2400 master_th->th.th_ident = loc; 2401 // only one notification scheme (either "submit" or "forking/joined", not 2402 // both) 2403 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2404 __kmp_forkjoin_frames_mode == 3) 2405 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2406 master_th->th.th_frame_time, 0, loc, 2407 master_th->th.th_team_nproc, 1); 2408 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2409 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2410 __kmp_itt_region_joined(gtid); 2411 } // active_level == 1 2412 #endif /* USE_ITT_BUILD */ 2413 2414 if (master_th->th.th_teams_microtask && !exit_teams && 2415 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2416 team->t.t_level == master_th->th.th_teams_level + 1) { 2417 // AC: We need to leave the team structure intact at the end of parallel 2418 // inside the teams construct, so that at the next parallel same (hot) team 2419 // works, only adjust nesting levels 2420 #if OMPT_SUPPORT 2421 ompt_data_t ompt_parallel_data = ompt_data_none; 2422 if (ompt_enabled.enabled) { 2423 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2424 if (ompt_enabled.ompt_callback_implicit_task) { 2425 int ompt_team_size = team->t.t_nproc; 2426 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2427 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2428 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2429 } 2430 task_info->frame.exit_frame = ompt_data_none; 2431 task_info->task_data = ompt_data_none; 2432 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2433 __ompt_lw_taskteam_unlink(master_th); 2434 } 2435 #endif 2436 /* Decrement our nested depth level */ 2437 team->t.t_level--; 2438 team->t.t_active_level--; 2439 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2440 2441 // Restore number of threads in the team if needed. This code relies on 2442 // the proper adjustment of th_teams_size.nth after the fork in 2443 // __kmp_teams_master on each teams primary thread in the case that 2444 // __kmp_reserve_threads reduced it. 2445 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2446 int old_num = master_th->th.th_team_nproc; 2447 int new_num = master_th->th.th_teams_size.nth; 2448 kmp_info_t **other_threads = team->t.t_threads; 2449 team->t.t_nproc = new_num; 2450 for (int i = 0; i < old_num; ++i) { 2451 other_threads[i]->th.th_team_nproc = new_num; 2452 } 2453 // Adjust states of non-used threads of the team 2454 for (int i = old_num; i < new_num; ++i) { 2455 // Re-initialize thread's barrier data. 
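// (Copying the team's current b_arrived counters into each re-activated thread
// keeps its barrier bookkeeping in step with the team; a stale value could leave
// the thread waiting on a barrier epoch that has already completed.)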
2456 KMP_DEBUG_ASSERT(other_threads[i]); 2457 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2458 for (int b = 0; b < bs_last_barrier; ++b) { 2459 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2460 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2461 #if USE_DEBUGGER 2462 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2463 #endif 2464 } 2465 if (__kmp_tasking_mode != tskm_immediate_exec) { 2466 // Synchronize thread's task state 2467 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2468 } 2469 } 2470 } 2471 2472 #if OMPT_SUPPORT 2473 if (ompt_enabled.enabled) { 2474 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2475 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2476 } 2477 #endif 2478 2479 return; 2480 } 2481 2482 /* do cleanup and restore the parent team */ 2483 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2484 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2485 2486 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2487 2488 /* jc: The following lock has instructions with REL and ACQ semantics, 2489 separating the parallel user code called in this parallel region 2490 from the serial user code called after this function returns. */ 2491 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2492 2493 if (!master_th->th.th_teams_microtask || 2494 team->t.t_level > master_th->th.th_teams_level) { 2495 /* Decrement our nested depth level */ 2496 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2497 } 2498 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2499 2500 #if OMPT_SUPPORT 2501 if (ompt_enabled.enabled) { 2502 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2503 if (ompt_enabled.ompt_callback_implicit_task) { 2504 int flags = (team_microtask == (void *)__kmp_teams_master) 2505 ? ompt_task_initial 2506 : ompt_task_implicit; 2507 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2508 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2509 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2510 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2511 } 2512 task_info->frame.exit_frame = ompt_data_none; 2513 task_info->task_data = ompt_data_none; 2514 } 2515 #endif 2516 2517 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2518 master_th, team)); 2519 __kmp_pop_current_task_from_thread(master_th); 2520 2521 #if KMP_AFFINITY_SUPPORTED 2522 // Restore master thread's partition. 2523 master_th->th.th_first_place = team->t.t_first_place; 2524 master_th->th.th_last_place = team->t.t_last_place; 2525 #endif // KMP_AFFINITY_SUPPORTED 2526 master_th->th.th_def_allocator = team->t.t_def_allocator; 2527 2528 #if OMPD_SUPPORT 2529 if (ompd_state & OMPD_ENABLE_BP) 2530 ompd_bp_parallel_end(); 2531 #endif 2532 updateHWFPControl(team); 2533 2534 if (root->r.r_active != master_active) 2535 root->r.r_active = master_active; 2536 2537 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2538 master_th)); // this will free worker threads 2539 2540 /* this race was fun to find. make sure the following is in the critical 2541 region otherwise assertions may fail occasionally since the old team may be 2542 reallocated and the hierarchy appears inconsistent. it is actually safe to 2543 run and won't cause any bugs, but will cause those assertion failures. 
it's 2544 only one deref&assign so might as well put this in the critical region */ 2545 master_th->th.th_team = parent_team; 2546 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2547 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2548 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2549 2550 /* restore serialized team, if need be */ 2551 if (parent_team->t.t_serialized && 2552 parent_team != master_th->th.th_serial_team && 2553 parent_team != root->r.r_root_team) { 2554 __kmp_free_team(root, 2555 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2556 master_th->th.th_serial_team = parent_team; 2557 } 2558 2559 if (__kmp_tasking_mode != tskm_immediate_exec) { 2560 if (master_th->th.th_task_state_top > 2561 0) { // Restore task state from memo stack 2562 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2563 // Remember primary thread's state if we re-use this nested hot team 2564 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2565 master_th->th.th_task_state; 2566 --master_th->th.th_task_state_top; // pop 2567 // Now restore state at this level 2568 master_th->th.th_task_state = 2569 master_th->th 2570 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2571 } 2572 // Copy the task team from the parent team to the primary thread 2573 master_th->th.th_task_team = 2574 parent_team->t.t_task_team[master_th->th.th_task_state]; 2575 KA_TRACE(20, 2576 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2577 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2578 parent_team)); 2579 } 2580 2581 // TODO: GEH - cannot do this assertion because root thread not set up as 2582 // executing 2583 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2584 master_th->th.th_current_task->td_flags.executing = 1; 2585 2586 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2587 2588 #if OMPT_SUPPORT 2589 int flags = 2590 OMPT_INVOKER(fork_context) | 2591 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2592 : ompt_parallel_team); 2593 if (ompt_enabled.enabled) { 2594 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2595 codeptr); 2596 } 2597 #endif 2598 2599 KMP_MB(); 2600 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2601 } 2602 2603 /* Check whether we should push an internal control record onto the 2604 serial team stack. If so, do it. 
*/ 2605 void __kmp_save_internal_controls(kmp_info_t *thread) { 2606 2607 if (thread->th.th_team != thread->th.th_serial_team) { 2608 return; 2609 } 2610 if (thread->th.th_team->t.t_serialized > 1) { 2611 int push = 0; 2612 2613 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2614 push = 1; 2615 } else { 2616 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2617 thread->th.th_team->t.t_serialized) { 2618 push = 1; 2619 } 2620 } 2621 if (push) { /* push a record on the serial team's stack */ 2622 kmp_internal_control_t *control = 2623 (kmp_internal_control_t *)__kmp_allocate( 2624 sizeof(kmp_internal_control_t)); 2625 2626 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2627 2628 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2629 2630 control->next = thread->th.th_team->t.t_control_stack_top; 2631 thread->th.th_team->t.t_control_stack_top = control; 2632 } 2633 } 2634 } 2635 2636 /* Changes set_nproc */ 2637 void __kmp_set_num_threads(int new_nth, int gtid) { 2638 kmp_info_t *thread; 2639 kmp_root_t *root; 2640 2641 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2642 KMP_DEBUG_ASSERT(__kmp_init_serial); 2643 2644 if (new_nth < 1) 2645 new_nth = 1; 2646 else if (new_nth > __kmp_max_nth) 2647 new_nth = __kmp_max_nth; 2648 2649 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2650 thread = __kmp_threads[gtid]; 2651 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2652 return; // nothing to do 2653 2654 __kmp_save_internal_controls(thread); 2655 2656 set__nproc(thread, new_nth); 2657 2658 // If this omp_set_num_threads() call will cause the hot team size to be 2659 // reduced (in the absence of a num_threads clause), then reduce it now, 2660 // rather than waiting for the next parallel region. 2661 root = thread->th.th_root; 2662 if (__kmp_init_parallel && (!root->r.r_active) && 2663 (root->r.r_hot_team->t.t_nproc > new_nth) 2664 #if KMP_NESTED_HOT_TEAMS 2665 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2666 #endif 2667 ) { 2668 kmp_team_t *hot_team = root->r.r_hot_team; 2669 int f; 2670 2671 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2672 2673 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2674 __kmp_resize_dist_barrier(hot_team, hot_team->t.t_nproc, new_nth); 2675 } 2676 // Release the extra threads we don't need any more. 2677 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2678 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2679 if (__kmp_tasking_mode != tskm_immediate_exec) { 2680 // When decreasing team size, threads no longer in the team should unref 2681 // task team. 2682 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2683 } 2684 __kmp_free_thread(hot_team->t.t_threads[f]); 2685 hot_team->t.t_threads[f] = NULL; 2686 } 2687 hot_team->t.t_nproc = new_nth; 2688 #if KMP_NESTED_HOT_TEAMS 2689 if (thread->th.th_hot_teams) { 2690 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2691 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2692 } 2693 #endif 2694 2695 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 2696 hot_team->t.b->update_num_threads(new_nth); 2697 __kmp_add_threads_to_team(hot_team, new_nth); 2698 } 2699 2700 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2701 2702 // Update the t_nproc field in the threads that are still active. 
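// (th_team_nproc is each thread's cached copy of its team size, so it is refreshed
// here alongside hot_team->t.t_nproc; the -1 stored in t_size_changed below appears
// to act as a sentinel marking a resize forced by omp_set_num_threads rather than
// by an ordinary fork.)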
2703 for (f = 0; f < new_nth; f++) { 2704 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2705 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2706 } 2707 // Special flag in case omp_set_num_threads() call 2708 hot_team->t.t_size_changed = -1; 2709 } 2710 } 2711 2712 /* Changes max_active_levels */ 2713 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2714 kmp_info_t *thread; 2715 2716 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2717 "%d = (%d)\n", 2718 gtid, max_active_levels)); 2719 KMP_DEBUG_ASSERT(__kmp_init_serial); 2720 2721 // validate max_active_levels 2722 if (max_active_levels < 0) { 2723 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2724 // We ignore this call if the user has specified a negative value. 2725 // The current setting won't be changed. The last valid setting will be 2726 // used. A warning will be issued (if warnings are allowed as controlled by 2727 // the KMP_WARNINGS env var). 2728 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2729 "max_active_levels for thread %d = (%d)\n", 2730 gtid, max_active_levels)); 2731 return; 2732 } 2733 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2734 // it's OK, the max_active_levels is within the valid range: [ 0; 2735 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2736 // We allow a zero value. (implementation defined behavior) 2737 } else { 2738 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2739 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2740 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2741 // Current upper limit is MAX_INT. (implementation defined behavior) 2742 // If the input exceeds the upper limit, we correct the input to be the 2743 // upper limit. (implementation defined behavior) 2744 // Actually, the flow should never get here until we use MAX_INT limit. 
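// Illustrative effect (assuming the standard omp_set_max_active_levels entry point
// forwards here): after omp_set_max_active_levels(1), a parallel region started
// inside an already active one is serialized via the "nthreads = 1" path in
// __kmp_fork_call above, because parent_team->t.t_active_level is then >=
// max_active_levels.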
2745 } 2746 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2747 "max_active_levels for thread %d = (%d)\n", 2748 gtid, max_active_levels)); 2749 2750 thread = __kmp_threads[gtid]; 2751 2752 __kmp_save_internal_controls(thread); 2753 2754 set__max_active_levels(thread, max_active_levels); 2755 } 2756 2757 /* Gets max_active_levels */ 2758 int __kmp_get_max_active_levels(int gtid) { 2759 kmp_info_t *thread; 2760 2761 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2762 KMP_DEBUG_ASSERT(__kmp_init_serial); 2763 2764 thread = __kmp_threads[gtid]; 2765 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2766 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2767 "curtask_maxaclevel=%d\n", 2768 gtid, thread->th.th_current_task, 2769 thread->th.th_current_task->td_icvs.max_active_levels)); 2770 return thread->th.th_current_task->td_icvs.max_active_levels; 2771 } 2772 2773 // nteams-var per-device ICV 2774 void __kmp_set_num_teams(int num_teams) { 2775 if (num_teams > 0) 2776 __kmp_nteams = num_teams; 2777 } 2778 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2779 // teams-thread-limit-var per-device ICV 2780 void __kmp_set_teams_thread_limit(int limit) { 2781 if (limit > 0) 2782 __kmp_teams_thread_limit = limit; 2783 } 2784 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2785 2786 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2787 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2788 2789 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2790 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2791 kmp_info_t *thread; 2792 kmp_sched_t orig_kind; 2793 // kmp_team_t *team; 2794 2795 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2796 gtid, (int)kind, chunk)); 2797 KMP_DEBUG_ASSERT(__kmp_init_serial); 2798 2799 // Check if the kind parameter is valid, correct if needed. 2800 // Valid parameters should fit in one of two intervals - standard or extended: 2801 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2802 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2803 orig_kind = kind; 2804 kind = __kmp_sched_without_mods(kind); 2805 2806 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2807 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2808 // TODO: Hint needs attention in case we change the default schedule. 2809 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2810 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2811 __kmp_msg_null); 2812 kind = kmp_sched_default; 2813 chunk = 0; // ignore chunk value in case of bad kind 2814 } 2815 2816 thread = __kmp_threads[gtid]; 2817 2818 __kmp_save_internal_controls(thread); 2819 2820 if (kind < kmp_sched_upper_std) { 2821 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2822 // differ static chunked vs. 
unchunked: chunk should be invalid to 2823 // indicate unchunked schedule (which is the default) 2824 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2825 } else { 2826 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2827 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2828 } 2829 } else { 2830 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2831 // kmp_sched_lower - 2 ]; 2832 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2833 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2834 kmp_sched_lower - 2]; 2835 } 2836 __kmp_sched_apply_mods_intkind( 2837 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2838 if (kind == kmp_sched_auto || chunk < 1) { 2839 // ignore parameter chunk for schedule auto 2840 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2841 } else { 2842 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2843 } 2844 } 2845 2846 /* Gets def_sched_var ICV values */ 2847 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2848 kmp_info_t *thread; 2849 enum sched_type th_type; 2850 2851 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2852 KMP_DEBUG_ASSERT(__kmp_init_serial); 2853 2854 thread = __kmp_threads[gtid]; 2855 2856 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2857 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2858 case kmp_sch_static: 2859 case kmp_sch_static_greedy: 2860 case kmp_sch_static_balanced: 2861 *kind = kmp_sched_static; 2862 __kmp_sched_apply_mods_stdkind(kind, th_type); 2863 *chunk = 0; // chunk was not set, try to show this fact via zero value 2864 return; 2865 case kmp_sch_static_chunked: 2866 *kind = kmp_sched_static; 2867 break; 2868 case kmp_sch_dynamic_chunked: 2869 *kind = kmp_sched_dynamic; 2870 break; 2871 case kmp_sch_guided_chunked: 2872 case kmp_sch_guided_iterative_chunked: 2873 case kmp_sch_guided_analytical_chunked: 2874 *kind = kmp_sched_guided; 2875 break; 2876 case kmp_sch_auto: 2877 *kind = kmp_sched_auto; 2878 break; 2879 case kmp_sch_trapezoidal: 2880 *kind = kmp_sched_trapezoidal; 2881 break; 2882 #if KMP_STATIC_STEAL_ENABLED 2883 case kmp_sch_static_steal: 2884 *kind = kmp_sched_static_steal; 2885 break; 2886 #endif 2887 default: 2888 KMP_FATAL(UnknownSchedulingType, th_type); 2889 } 2890 2891 __kmp_sched_apply_mods_stdkind(kind, th_type); 2892 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2893 } 2894 2895 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2896 2897 int ii, dd; 2898 kmp_team_t *team; 2899 kmp_info_t *thr; 2900 2901 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2902 KMP_DEBUG_ASSERT(__kmp_init_serial); 2903 2904 // validate level 2905 if (level == 0) 2906 return 0; 2907 if (level < 0) 2908 return -1; 2909 thr = __kmp_threads[gtid]; 2910 team = thr->th.th_team; 2911 ii = team->t.t_level; 2912 if (level > ii) 2913 return -1; 2914 2915 if (thr->th.th_teams_microtask) { 2916 // AC: we are in teams region where multiple nested teams have same level 2917 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2918 if (level <= 2919 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2920 KMP_DEBUG_ASSERT(ii >= tlevel); 2921 // AC: As we need to pass by the teams league, we need to artificially 2922 // increase ii 2923 if (ii == tlevel) { 2924 ii += 2; // three teams have same level 2925 } else { 2926 ii++; // two teams have same level 2927 } 2928 } 2929 } 2930 2931 if (ii == 
level) 2932 return __kmp_tid_from_gtid(gtid); 2933 2934 dd = team->t.t_serialized; 2935 level++; 2936 while (ii > level) { 2937 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2938 } 2939 if ((team->t.t_serialized) && (!dd)) { 2940 team = team->t.t_parent; 2941 continue; 2942 } 2943 if (ii > level) { 2944 team = team->t.t_parent; 2945 dd = team->t.t_serialized; 2946 ii--; 2947 } 2948 } 2949 2950 return (dd > 1) ? (0) : (team->t.t_master_tid); 2951 } 2952 2953 int __kmp_get_team_size(int gtid, int level) { 2954 2955 int ii, dd; 2956 kmp_team_t *team; 2957 kmp_info_t *thr; 2958 2959 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2960 KMP_DEBUG_ASSERT(__kmp_init_serial); 2961 2962 // validate level 2963 if (level == 0) 2964 return 1; 2965 if (level < 0) 2966 return -1; 2967 thr = __kmp_threads[gtid]; 2968 team = thr->th.th_team; 2969 ii = team->t.t_level; 2970 if (level > ii) 2971 return -1; 2972 2973 if (thr->th.th_teams_microtask) { 2974 // AC: we are in teams region where multiple nested teams have same level 2975 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2976 if (level <= 2977 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2978 KMP_DEBUG_ASSERT(ii >= tlevel); 2979 // AC: As we need to pass by the teams league, we need to artificially 2980 // increase ii 2981 if (ii == tlevel) { 2982 ii += 2; // three teams have same level 2983 } else { 2984 ii++; // two teams have same level 2985 } 2986 } 2987 } 2988 2989 while (ii > level) { 2990 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2991 } 2992 if (team->t.t_serialized && (!dd)) { 2993 team = team->t.t_parent; 2994 continue; 2995 } 2996 if (ii > level) { 2997 team = team->t.t_parent; 2998 ii--; 2999 } 3000 } 3001 3002 return team->t.t_nproc; 3003 } 3004 3005 kmp_r_sched_t __kmp_get_schedule_global() { 3006 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3007 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3008 // independently. So one can get the updated schedule here. 3009 3010 kmp_r_sched_t r_sched; 3011 3012 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3013 // __kmp_guided. __kmp_sched should keep original value, so that user can set 3014 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3015 // different roots (even in OMP 2.5) 3016 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 3017 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 3018 if (s == kmp_sch_static) { 3019 // replace STATIC with more detailed schedule (balanced or greedy) 3020 r_sched.r_sched_type = __kmp_static; 3021 } else if (s == kmp_sch_guided_chunked) { 3022 // replace GUIDED with more detailed schedule (iterative or analytical) 3023 r_sched.r_sched_type = __kmp_guided; 3024 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3025 r_sched.r_sched_type = __kmp_sched; 3026 } 3027 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 3028 3029 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3030 // __kmp_chunk may be wrong here (if it was not ever set) 3031 r_sched.chunk = KMP_DEFAULT_CHUNK; 3032 } else { 3033 r_sched.chunk = __kmp_chunk; 3034 } 3035 3036 return r_sched; 3037 } 3038 3039 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3040 at least argc number of *t_argv entries for the requested team. 
*/ 3041 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3042 3043 KMP_DEBUG_ASSERT(team); 3044 if (!realloc || argc > team->t.t_max_argc) { 3045 3046 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3047 "current entries=%d\n", 3048 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3049 /* if previously allocated heap space for args, free them */ 3050 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3051 __kmp_free((void *)team->t.t_argv); 3052 3053 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3054 /* use unused space in the cache line for arguments */ 3055 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3056 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3057 "argv entries\n", 3058 team->t.t_id, team->t.t_max_argc)); 3059 team->t.t_argv = &team->t.t_inline_argv[0]; 3060 if (__kmp_storage_map) { 3061 __kmp_print_storage_map_gtid( 3062 -1, &team->t.t_inline_argv[0], 3063 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3064 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3065 team->t.t_id); 3066 } 3067 } else { 3068 /* allocate space for arguments in the heap */ 3069 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3070 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3071 : 2 * argc; 3072 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3073 "argv entries\n", 3074 team->t.t_id, team->t.t_max_argc)); 3075 team->t.t_argv = 3076 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3077 if (__kmp_storage_map) { 3078 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3079 &team->t.t_argv[team->t.t_max_argc], 3080 sizeof(void *) * team->t.t_max_argc, 3081 "team_%d.t_argv", team->t.t_id); 3082 } 3083 } 3084 } 3085 } 3086 3087 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3088 int i; 3089 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3090 team->t.t_threads = 3091 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3092 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3093 sizeof(dispatch_shared_info_t) * num_disp_buff); 3094 team->t.t_dispatch = 3095 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3096 team->t.t_implicit_task_taskdata = 3097 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3098 team->t.t_max_nproc = max_nth; 3099 3100 /* setup dispatch buffers */ 3101 for (i = 0; i < num_disp_buff; ++i) { 3102 team->t.t_disp_buffer[i].buffer_index = i; 3103 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3104 } 3105 } 3106 3107 static void __kmp_free_team_arrays(kmp_team_t *team) { 3108 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3109 int i; 3110 for (i = 0; i < team->t.t_max_nproc; ++i) { 3111 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3112 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3113 team->t.t_dispatch[i].th_disp_buffer = NULL; 3114 } 3115 } 3116 #if KMP_USE_HIER_SCHED 3117 __kmp_dispatch_free_hierarchies(team); 3118 #endif 3119 __kmp_free(team->t.t_threads); 3120 __kmp_free(team->t.t_disp_buffer); 3121 __kmp_free(team->t.t_dispatch); 3122 __kmp_free(team->t.t_implicit_task_taskdata); 3123 team->t.t_threads = NULL; 3124 team->t.t_disp_buffer = NULL; 3125 team->t.t_dispatch = NULL; 3126 team->t.t_implicit_task_taskdata = 0; 3127 } 3128 3129 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3130 kmp_info_t **oldThreads = team->t.t_threads; 3131 3132 __kmp_free(team->t.t_disp_buffer); 3133 __kmp_free(team->t.t_dispatch); 3134 __kmp_free(team->t.t_implicit_task_taskdata); 3135 __kmp_allocate_team_arrays(team, max_nth); 3136 3137 KMP_MEMCPY(team->t.t_threads, oldThreads, 3138 team->t.t_nproc * sizeof(kmp_info_t *)); 3139 3140 __kmp_free(oldThreads); 3141 } 3142 3143 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3144 3145 kmp_r_sched_t r_sched = 3146 __kmp_get_schedule_global(); // get current state of scheduling globals 3147 3148 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3149 3150 kmp_internal_control_t g_icvs = { 3151 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3152 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3153 // adjustment of threads (per thread) 3154 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3155 // whether blocktime is explicitly set 3156 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3157 #if KMP_USE_MONITOR 3158 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3159 // intervals 3160 #endif 3161 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3162 // next parallel region (per thread) 3163 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3164 __kmp_cg_max_nth, // int thread_limit; 3165 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3166 // for max_active_levels 3167 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3168 // {sched,chunk} pair 3169 __kmp_nested_proc_bind.bind_types[0], 3170 __kmp_default_device, 3171 NULL // struct kmp_internal_control *next; 3172 }; 3173 3174 return g_icvs; 3175 } 3176 3177 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3178 3179 kmp_internal_control_t gx_icvs; 3180 gx_icvs.serial_nesting_level = 3181 0; // probably =team->t.t_serial 
like in save_inter_controls 3182 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3183 gx_icvs.next = NULL; 3184 3185 return gx_icvs; 3186 } 3187 3188 static void __kmp_initialize_root(kmp_root_t *root) { 3189 int f; 3190 kmp_team_t *root_team; 3191 kmp_team_t *hot_team; 3192 int hot_team_max_nth; 3193 kmp_r_sched_t r_sched = 3194 __kmp_get_schedule_global(); // get current state of scheduling globals 3195 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3196 KMP_DEBUG_ASSERT(root); 3197 KMP_ASSERT(!root->r.r_begin); 3198 3199 /* setup the root state structure */ 3200 __kmp_init_lock(&root->r.r_begin_lock); 3201 root->r.r_begin = FALSE; 3202 root->r.r_active = FALSE; 3203 root->r.r_in_parallel = 0; 3204 root->r.r_blocktime = __kmp_dflt_blocktime; 3205 #if KMP_AFFINITY_SUPPORTED 3206 root->r.r_affinity_assigned = FALSE; 3207 #endif 3208 3209 /* setup the root team for this task */ 3210 /* allocate the root team structure */ 3211 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3212 3213 root_team = 3214 __kmp_allocate_team(root, 3215 1, // new_nproc 3216 1, // max_nproc 3217 #if OMPT_SUPPORT 3218 ompt_data_none, // root parallel id 3219 #endif 3220 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3221 0 // argc 3222 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3223 ); 3224 #if USE_DEBUGGER 3225 // Non-NULL value should be assigned to make the debugger display the root 3226 // team. 3227 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3228 #endif 3229 3230 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3231 3232 root->r.r_root_team = root_team; 3233 root_team->t.t_control_stack_top = NULL; 3234 3235 /* initialize root team */ 3236 root_team->t.t_threads[0] = NULL; 3237 root_team->t.t_nproc = 1; 3238 root_team->t.t_serialized = 1; 3239 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3240 root_team->t.t_sched.sched = r_sched.sched; 3241 KA_TRACE( 3242 20, 3243 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3244 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3245 3246 /* setup the hot team for this task */ 3247 /* allocate the hot team structure */ 3248 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3249 3250 hot_team = 3251 __kmp_allocate_team(root, 3252 1, // new_nproc 3253 __kmp_dflt_team_nth_ub * 2, // max_nproc 3254 #if OMPT_SUPPORT 3255 ompt_data_none, // root parallel id 3256 #endif 3257 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3258 0 // argc 3259 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3260 ); 3261 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3262 3263 root->r.r_hot_team = hot_team; 3264 root_team->t.t_control_stack_top = NULL; 3265 3266 /* first-time initialization */ 3267 hot_team->t.t_parent = root_team; 3268 3269 /* initialize hot team */ 3270 hot_team_max_nth = hot_team->t.t_max_nproc; 3271 for (f = 0; f < hot_team_max_nth; ++f) { 3272 hot_team->t.t_threads[f] = NULL; 3273 } 3274 hot_team->t.t_nproc = 1; 3275 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3276 hot_team->t.t_sched.sched = r_sched.sched; 3277 hot_team->t.t_size_changed = 0; 3278 } 3279 3280 #ifdef KMP_DEBUG 3281 3282 typedef struct kmp_team_list_item { 3283 kmp_team_p const *entry; 3284 struct kmp_team_list_item *next; 3285 } kmp_team_list_item_t; 3286 typedef kmp_team_list_item_t *kmp_team_list_t; 3287 3288 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3289 kmp_team_list_t list, // List of teams. 3290 kmp_team_p const *team // Team to add. 3291 ) { 3292 3293 // List must terminate with item where both entry and next are NULL. 3294 // Team is added to the list only once. 3295 // List is sorted in ascending order by team id. 3296 // Team id is *not* a key. 3297 3298 kmp_team_list_t l; 3299 3300 KMP_DEBUG_ASSERT(list != NULL); 3301 if (team == NULL) { 3302 return; 3303 } 3304 3305 __kmp_print_structure_team_accum(list, team->t.t_parent); 3306 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3307 3308 // Search list for the team. 3309 l = list; 3310 while (l->next != NULL && l->entry != team) { 3311 l = l->next; 3312 } 3313 if (l->next != NULL) { 3314 return; // Team has been added before, exit. 3315 } 3316 3317 // Team is not found. Search list again for insertion point. 3318 l = list; 3319 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3320 l = l->next; 3321 } 3322 3323 // Insert team. 3324 { 3325 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3326 sizeof(kmp_team_list_item_t)); 3327 *item = *l; 3328 l->entry = team; 3329 l->next = item; 3330 } 3331 } 3332 3333 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3334 3335 ) { 3336 __kmp_printf("%s", title); 3337 if (team != NULL) { 3338 __kmp_printf("%2x %p\n", team->t.t_id, team); 3339 } else { 3340 __kmp_printf(" - (nil)\n"); 3341 } 3342 } 3343 3344 static void __kmp_print_structure_thread(char const *title, 3345 kmp_info_p const *thread) { 3346 __kmp_printf("%s", title); 3347 if (thread != NULL) { 3348 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3349 } else { 3350 __kmp_printf(" - (nil)\n"); 3351 } 3352 } 3353 3354 void __kmp_print_structure(void) { 3355 3356 kmp_team_list_t list; 3357 3358 // Initialize list of teams. 3359 list = 3360 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3361 list->entry = NULL; 3362 list->next = NULL; 3363 3364 __kmp_printf("\n------------------------------\nGlobal Thread " 3365 "Table\n------------------------------\n"); 3366 { 3367 int gtid; 3368 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3369 __kmp_printf("%2d", gtid); 3370 if (__kmp_threads != NULL) { 3371 __kmp_printf(" %p", __kmp_threads[gtid]); 3372 } 3373 if (__kmp_root != NULL) { 3374 __kmp_printf(" %p", __kmp_root[gtid]); 3375 } 3376 __kmp_printf("\n"); 3377 } 3378 } 3379 3380 // Print out __kmp_threads array. 
3381 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3382 "----------\n"); 3383 if (__kmp_threads != NULL) { 3384 int gtid; 3385 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3386 kmp_info_t const *thread = __kmp_threads[gtid]; 3387 if (thread != NULL) { 3388 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3389 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3390 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3391 __kmp_print_structure_team(" Serial Team: ", 3392 thread->th.th_serial_team); 3393 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3394 __kmp_print_structure_thread(" Primary: ", 3395 thread->th.th_team_master); 3396 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3397 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3398 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3399 __kmp_print_structure_thread(" Next in pool: ", 3400 thread->th.th_next_pool); 3401 __kmp_printf("\n"); 3402 __kmp_print_structure_team_accum(list, thread->th.th_team); 3403 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3404 } 3405 } 3406 } else { 3407 __kmp_printf("Threads array is not allocated.\n"); 3408 } 3409 3410 // Print out __kmp_root array. 3411 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3412 "--------\n"); 3413 if (__kmp_root != NULL) { 3414 int gtid; 3415 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3416 kmp_root_t const *root = __kmp_root[gtid]; 3417 if (root != NULL) { 3418 __kmp_printf("GTID %2d %p:\n", gtid, root); 3419 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3420 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3421 __kmp_print_structure_thread(" Uber Thread: ", 3422 root->r.r_uber_thread); 3423 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3424 __kmp_printf(" In Parallel: %2d\n", 3425 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3426 __kmp_printf("\n"); 3427 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3428 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3429 } 3430 } 3431 } else { 3432 __kmp_printf("Ubers array is not allocated.\n"); 3433 } 3434 3435 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3436 "--------\n"); 3437 while (list->next != NULL) { 3438 kmp_team_p const *team = list->entry; 3439 int i; 3440 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3441 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3442 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3443 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3444 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3445 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3446 for (i = 0; i < team->t.t_nproc; ++i) { 3447 __kmp_printf(" Thread %2d: ", i); 3448 __kmp_print_structure_thread("", team->t.t_threads[i]); 3449 } 3450 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3451 __kmp_printf("\n"); 3452 list = list->next; 3453 } 3454 3455 // Print out __kmp_thread_pool and __kmp_team_pool. 3456 __kmp_printf("\n------------------------------\nPools\n----------------------" 3457 "--------\n"); 3458 __kmp_print_structure_thread("Thread pool: ", 3459 CCAST(kmp_info_t *, __kmp_thread_pool)); 3460 __kmp_print_structure_team("Team pool: ", 3461 CCAST(kmp_team_t *, __kmp_team_pool)); 3462 __kmp_printf("\n"); 3463 3464 // Free team list. 
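// (A note on the shape of "list": __kmp_print_structure_team_accum keeps the
// list sentinel-terminated -- the last item always has entry == NULL and
// next == NULL -- and inserts in front of a node l without tracking the
// previous node by copying *l into a freshly allocated item and then
// overwriting l in place (l->entry = team; l->next = item;). The loop below
// therefore releases one item per accumulated team plus the trailing
// sentinel.)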
3465 while (list != NULL) { 3466 kmp_team_list_item_t *item = list; 3467 list = list->next; 3468 KMP_INTERNAL_FREE(item); 3469 } 3470 } 3471 3472 #endif 3473 3474 //--------------------------------------------------------------------------- 3475 // Stuff for per-thread fast random number generator 3476 // Table of primes 3477 static const unsigned __kmp_primes[] = { 3478 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3479 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3480 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3481 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3482 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3483 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3484 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3485 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3486 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3487 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3488 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3489 3490 //--------------------------------------------------------------------------- 3491 // __kmp_get_random: Get a random number using a linear congruential method. 3492 unsigned short __kmp_get_random(kmp_info_t *thread) { 3493 unsigned x = thread->th.th_x; 3494 unsigned short r = (unsigned short)(x >> 16); 3495 3496 thread->th.th_x = x * thread->th.th_a + 1; 3497 3498 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3499 thread->th.th_info.ds.ds_tid, r)); 3500 3501 return r; 3502 } 3503 //-------------------------------------------------------- 3504 // __kmp_init_random: Initialize a random number generator 3505 void __kmp_init_random(kmp_info_t *thread) { 3506 unsigned seed = thread->th.th_info.ds.ds_tid; 3507 3508 thread->th.th_a = 3509 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3510 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3511 KA_TRACE(30, 3512 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3513 } 3514 3515 #if KMP_OS_WINDOWS 3516 /* reclaim array entries for root threads that are already dead, returns number 3517 * reclaimed */ 3518 static int __kmp_reclaim_dead_roots(void) { 3519 int i, r = 0; 3520 3521 for (i = 0; i < __kmp_threads_capacity; ++i) { 3522 if (KMP_UBER_GTID(i) && 3523 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3524 !__kmp_root[i] 3525 ->r.r_active) { // AC: reclaim only roots died in non-active state 3526 r += __kmp_unregister_root_other_thread(i); 3527 } 3528 } 3529 return r; 3530 } 3531 #endif 3532 3533 /* This function attempts to create free entries in __kmp_threads and 3534 __kmp_root, and returns the number of free entries generated. 3535 3536 For Windows* OS static library, the first mechanism used is to reclaim array 3537 entries for root threads that are already dead. 3538 3539 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3540 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3541 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3542 threadprivate cache array has been created. Synchronization with 3543 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3544 3545 After any dead root reclamation, if the clipping value allows array expansion 3546 to result in the generation of a total of nNeed free slots, the function does 3547 that expansion. If not, nothing is done beyond the possible initial root 3548 thread reclamation. 3549 3550 If any argument is negative, the behavior is undefined. */ 3551 static int __kmp_expand_threads(int nNeed) { 3552 int added = 0; 3553 int minimumRequiredCapacity; 3554 int newCapacity; 3555 kmp_info_t **newThreads; 3556 kmp_root_t **newRoot; 3557 3558 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3559 // resizing __kmp_threads does not need additional protection if foreign 3560 // threads are present 3561 3562 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3563 /* only for Windows static library */ 3564 /* reclaim array entries for root threads that are already dead */ 3565 added = __kmp_reclaim_dead_roots(); 3566 3567 if (nNeed) { 3568 nNeed -= added; 3569 if (nNeed < 0) 3570 nNeed = 0; 3571 } 3572 #endif 3573 if (nNeed <= 0) 3574 return added; 3575 3576 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3577 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3578 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3579 // > __kmp_max_nth in one of two ways: 3580 // 3581 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3582 // may not be reused by another thread, so we may need to increase 3583 // __kmp_threads_capacity to __kmp_max_nth + 1. 3584 // 3585 // 2) New foreign root(s) are encountered. We always register new foreign 3586 // roots. This may cause a smaller # of threads to be allocated at 3587 // subsequent parallel regions, but the worker threads hang around (and 3588 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3589 // 3590 // Anyway, that is the reason for moving the check to see if 3591 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3592 // instead of having it performed here. -BB 3593 3594 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3595 3596 /* compute expansion headroom to check if we can expand */ 3597 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3598 /* possible expansion too small -- give up */ 3599 return added; 3600 } 3601 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3602 3603 newCapacity = __kmp_threads_capacity; 3604 do { 3605 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3606 : __kmp_sys_max_nth; 3607 } while (newCapacity < minimumRequiredCapacity); 3608 newThreads = (kmp_info_t **)__kmp_allocate( 3609 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3610 newRoot = 3611 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3612 KMP_MEMCPY(newThreads, __kmp_threads, 3613 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3614 KMP_MEMCPY(newRoot, __kmp_root, 3615 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3616 3617 kmp_info_t **temp_threads = __kmp_threads; 3618 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3619 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3620 __kmp_free(temp_threads); 3621 added += newCapacity - __kmp_threads_capacity; 3622 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3623 3624 if (newCapacity > __kmp_tp_capacity) { 3625 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3626 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3627 __kmp_threadprivate_resize_cache(newCapacity); 3628 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3629 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3630 } 3631 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3632 } 3633 3634 return added; 3635 } 3636 3637 /* Register the current thread as a root thread and obtain our gtid. We must 3638 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3639 thread that calls from __kmp_do_serial_initialize() */ 3640 int __kmp_register_root(int initial_thread) { 3641 kmp_info_t *root_thread; 3642 kmp_root_t *root; 3643 int gtid; 3644 int capacity; 3645 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3646 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3647 KMP_MB(); 3648 3649 /* 2007-03-02: 3650 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3651 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3652 work as expected -- it may return false (that means there is at least one 3653 empty slot in __kmp_threads array), but it is possible the only free slot 3654 is #0, which is reserved for initial thread and so cannot be used for this 3655 one. Following code workarounds this bug. 3656 3657 However, right solution seems to be not reserving slot #0 for initial 3658 thread because: 3659 (1) there is no magic in slot #0, 3660 (2) we cannot detect initial thread reliably (the first thread which does 3661 serial initialization may be not a real initial thread). 3662 */ 3663 capacity = __kmp_threads_capacity; 3664 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3665 --capacity; 3666 } 3667 3668 // If it is not for initializing the hidden helper team, we need to take 3669 // __kmp_hidden_helper_threads_num out of the capacity because it is included 3670 // in __kmp_threads_capacity. 
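// (Illustration only, with assumed numbers: if __kmp_threads_capacity is 64
// and __kmp_hidden_helper_threads_num is 8, a regular, non-hidden-helper
// registration effectively sees 64 - 8 = 56 usable slots -- one fewer still
// while slot 0 is being held back for the initial thread -- because gtids
// 1..8 are set aside for hidden helper threads and gtid 0 for the initial
// thread; see the slot layout described below.)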
3671 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3672 capacity -= __kmp_hidden_helper_threads_num; 3673 } 3674 3675 /* see if there are too many threads */ 3676 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3677 if (__kmp_tp_cached) { 3678 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3679 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3680 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3681 } else { 3682 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3683 __kmp_msg_null); 3684 } 3685 } 3686 3687 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3688 // 0: initial thread, also a regular OpenMP thread. 3689 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3690 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3691 // regular OpenMP threads. 3692 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3693 // Find an available thread slot for hidden helper thread. Slots for hidden 3694 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3695 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3696 gtid <= __kmp_hidden_helper_threads_num; 3697 gtid++) 3698 ; 3699 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3700 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3701 "hidden helper thread: T#%d\n", 3702 gtid)); 3703 } else { 3704 /* find an available thread slot */ 3705 // Don't reassign the zero slot since we need that to only be used by 3706 // initial thread. Slots for hidden helper threads should also be skipped. 3707 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3708 gtid = 0; 3709 } else { 3710 for (gtid = __kmp_hidden_helper_threads_num + 1; 3711 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3712 ; 3713 } 3714 KA_TRACE( 3715 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3716 KMP_ASSERT(gtid < __kmp_threads_capacity); 3717 } 3718 3719 /* update global accounting */ 3720 __kmp_all_nth++; 3721 TCW_4(__kmp_nth, __kmp_nth + 1); 3722 3723 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3724 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3725 if (__kmp_adjust_gtid_mode) { 3726 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3727 if (TCR_4(__kmp_gtid_mode) != 2) { 3728 TCW_4(__kmp_gtid_mode, 2); 3729 } 3730 } else { 3731 if (TCR_4(__kmp_gtid_mode) != 1) { 3732 TCW_4(__kmp_gtid_mode, 1); 3733 } 3734 } 3735 } 3736 3737 #ifdef KMP_ADJUST_BLOCKTIME 3738 /* Adjust blocktime to zero if necessary */ 3739 /* Middle initialization might not have occurred yet */ 3740 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3741 if (__kmp_nth > __kmp_avail_proc) { 3742 __kmp_zero_bt = TRUE; 3743 } 3744 } 3745 #endif /* KMP_ADJUST_BLOCKTIME */ 3746 3747 /* setup this new hierarchy */ 3748 if (!(root = __kmp_root[gtid])) { 3749 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3750 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3751 } 3752 3753 #if KMP_STATS_ENABLED 3754 // Initialize stats as soon as possible (right after gtid assignment). 
3755 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3756 __kmp_stats_thread_ptr->startLife(); 3757 KMP_SET_THREAD_STATE(SERIAL_REGION); 3758 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3759 #endif 3760 __kmp_initialize_root(root); 3761 3762 /* setup new root thread structure */ 3763 if (root->r.r_uber_thread) { 3764 root_thread = root->r.r_uber_thread; 3765 } else { 3766 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3767 if (__kmp_storage_map) { 3768 __kmp_print_thread_storage_map(root_thread, gtid); 3769 } 3770 root_thread->th.th_info.ds.ds_gtid = gtid; 3771 #if OMPT_SUPPORT 3772 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3773 #endif 3774 root_thread->th.th_root = root; 3775 if (__kmp_env_consistency_check) { 3776 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3777 } 3778 #if USE_FAST_MEMORY 3779 __kmp_initialize_fast_memory(root_thread); 3780 #endif /* USE_FAST_MEMORY */ 3781 3782 #if KMP_USE_BGET 3783 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3784 __kmp_initialize_bget(root_thread); 3785 #endif 3786 __kmp_init_random(root_thread); // Initialize random number generator 3787 } 3788 3789 /* setup the serial team held in reserve by the root thread */ 3790 if (!root_thread->th.th_serial_team) { 3791 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3792 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3793 root_thread->th.th_serial_team = __kmp_allocate_team( 3794 root, 1, 1, 3795 #if OMPT_SUPPORT 3796 ompt_data_none, // root parallel id 3797 #endif 3798 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3799 } 3800 KMP_ASSERT(root_thread->th.th_serial_team); 3801 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3802 root_thread->th.th_serial_team)); 3803 3804 /* drop root_thread into place */ 3805 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3806 3807 root->r.r_root_team->t.t_threads[0] = root_thread; 3808 root->r.r_hot_team->t.t_threads[0] = root_thread; 3809 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3810 // AC: the team created in reserve, not for execution (it is unused for now). 3811 root_thread->th.th_serial_team->t.t_serialized = 0; 3812 root->r.r_uber_thread = root_thread; 3813 3814 /* initialize the thread, get it ready to go */ 3815 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3816 TCW_4(__kmp_init_gtid, TRUE); 3817 3818 /* prepare the primary thread for get_gtid() */ 3819 __kmp_gtid_set_specific(gtid); 3820 3821 #if USE_ITT_BUILD 3822 __kmp_itt_thread_name(gtid); 3823 #endif /* USE_ITT_BUILD */ 3824 3825 #ifdef KMP_TDATA_GTID 3826 __kmp_gtid = gtid; 3827 #endif 3828 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3829 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3830 3831 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3832 "plain=%u\n", 3833 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3834 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3835 KMP_INIT_BARRIER_STATE)); 3836 { // Initialize barrier data. 
3837 int b;
3838 for (b = 0; b < bs_last_barrier; ++b) {
3839 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3840 #if USE_DEBUGGER
3841 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3842 #endif
3843 }
3844 }
3845 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3846 KMP_INIT_BARRIER_STATE);
3847
3848 #if KMP_AFFINITY_SUPPORTED
3849 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3850 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3851 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3852 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3853 #endif /* KMP_AFFINITY_SUPPORTED */
3854 root_thread->th.th_def_allocator = __kmp_def_allocator;
3855 root_thread->th.th_prev_level = 0;
3856 root_thread->th.th_prev_num_threads = 1;
3857
3858 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3859 tmp->cg_root = root_thread;
3860 tmp->cg_thread_limit = __kmp_cg_max_nth;
3861 tmp->cg_nthreads = 1;
3862 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3863 " cg_nthreads init to 1\n",
3864 root_thread, tmp));
3865 tmp->up = NULL;
3866 root_thread->th.th_cg_roots = tmp;
3867
3868 __kmp_root_counter++;
3869
3870 #if OMPT_SUPPORT
3871 if (!initial_thread && ompt_enabled.enabled) {
3872
3873 kmp_info_t *root_thread = ompt_get_thread();
3874
3875 ompt_set_thread_state(root_thread, ompt_state_overhead);
3876
3877 if (ompt_enabled.ompt_callback_thread_begin) {
3878 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3879 ompt_thread_initial, __ompt_get_thread_data_internal());
3880 }
3881 ompt_data_t *task_data;
3882 ompt_data_t *parallel_data;
3883 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3884 NULL);
3885 if (ompt_enabled.ompt_callback_implicit_task) {
3886 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3887 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3888 }
3889
3890 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3891 }
3892 #endif
3893 #if OMPD_SUPPORT
3894 if (ompd_state & OMPD_ENABLE_BP)
3895 ompd_bp_thread_begin();
3896 #endif
3897
3898 KMP_MB();
3899 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3900
3901 return gtid;
3902 }
3903
3904 #if KMP_NESTED_HOT_TEAMS
3905 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3906 const int max_level) {
3907 int i, n, nth;
3908 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3909 if (!hot_teams || !hot_teams[level].hot_team) {
3910 return 0;
3911 }
3912 KMP_DEBUG_ASSERT(level < max_level);
3913 kmp_team_t *team = hot_teams[level].hot_team;
3914 nth = hot_teams[level].hot_team_nth;
3915 n = nth - 1; // primary thread is not freed
3916 if (level < max_level - 1) {
3917 for (i = 0; i < nth; ++i) {
3918 kmp_info_t *th = team->t.t_threads[i];
3919 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3920 if (i > 0 && th->th.th_hot_teams) {
3921 __kmp_free(th->th.th_hot_teams);
3922 th->th.th_hot_teams = NULL;
3923 }
3924 }
3925 }
3926 __kmp_free_team(root, team, NULL);
3927 return n;
3928 }
3929 #endif
3930
3931 // Resets a root thread and clears its root and hot teams.
3932 // Returns the number of __kmp_threads entries directly and indirectly freed.
3933 static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3934 kmp_team_t *root_team = root->r.r_root_team;
3935 kmp_team_t *hot_team = root->r.r_hot_team;
3936 int n = hot_team->t.t_nproc;
3937 int i;
3938
3939 KMP_DEBUG_ASSERT(!root->r.r_active);
3940
3941 root->r.r_root_team = NULL;
3942 root->r.r_hot_team = NULL;
3943 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3944 // before call to __kmp_free_team().
3945 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3946 #if KMP_NESTED_HOT_TEAMS
3947 if (__kmp_hot_teams_max_level >
3948 0) { // need to free nested hot teams and their threads if any
3949 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3950 kmp_info_t *th = hot_team->t.t_threads[i];
3951 if (__kmp_hot_teams_max_level > 1) {
3952 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3953 }
3954 if (th->th.th_hot_teams) {
3955 __kmp_free(th->th.th_hot_teams);
3956 th->th.th_hot_teams = NULL;
3957 }
3958 }
3959 }
3960 #endif
3961 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3962
3963 // Before we can reap the thread, we need to make certain that all other
3964 // threads in the teams that had this root as ancestor have stopped trying to
3965 // steal tasks.
3966 if (__kmp_tasking_mode != tskm_immediate_exec) {
3967 __kmp_wait_to_unref_task_teams();
3968 }
3969
3970 #if KMP_OS_WINDOWS
3971 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3972 KA_TRACE(
3973 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3974 "\n",
3975 (LPVOID) & (root->r.r_uber_thread->th),
3976 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3977 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3978 #endif /* KMP_OS_WINDOWS */
3979
3980 #if OMPD_SUPPORT
3981 if (ompd_state & OMPD_ENABLE_BP)
3982 ompd_bp_thread_end();
3983 #endif
3984
3985 #if OMPT_SUPPORT
3986 ompt_data_t *task_data;
3987 ompt_data_t *parallel_data;
3988 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3989 NULL);
3990 if (ompt_enabled.ompt_callback_implicit_task) {
3991 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3992 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3993 }
3994 if (ompt_enabled.ompt_callback_thread_end) {
3995 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3996 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3997 }
3998 #endif
3999
4000 TCW_4(__kmp_nth,
4001 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
4002 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
4003 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
4004 " to %d\n",
4005 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
4006 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
4007 if (i == 1) {
4008 // need to free contention group structure
4009 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
4010 root->r.r_uber_thread->th.th_cg_roots->cg_root);
4011 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
4012 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
4013 root->r.r_uber_thread->th.th_cg_roots = NULL;
4014 }
4015 __kmp_reap_thread(root->r.r_uber_thread, 1);
4016
4017 // We cannot put root thread to __kmp_thread_pool, so we have to reap it
4018 // instead of freeing.
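// (Two details worth spelling out: "i = ...cg_nthreads--" above captures the
// value before the post-decrement, so i == 1 means this root was the last
// member of its contention group and the kmp_cg_root_t node can be released.
// And the thread pool only holds runtime-created workers parked waiting for
// new work; an uber/root thread is backed by a user or foreign OS thread the
// runtime does not own, so its runtime state is reaped here rather than
// parked for later reuse.)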
4019 root->r.r_uber_thread = NULL; 4020 /* mark root as no longer in use */ 4021 root->r.r_begin = FALSE; 4022 4023 return n; 4024 } 4025 4026 void __kmp_unregister_root_current_thread(int gtid) { 4027 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 4028 /* this lock should be ok, since unregister_root_current_thread is never 4029 called during an abort, only during a normal close. furthermore, if you 4030 have the forkjoin lock, you should never try to get the initz lock */ 4031 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4032 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4033 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4034 "exiting T#%d\n", 4035 gtid)); 4036 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4037 return; 4038 } 4039 kmp_root_t *root = __kmp_root[gtid]; 4040 4041 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4042 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4043 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4044 KMP_ASSERT(root->r.r_active == FALSE); 4045 4046 KMP_MB(); 4047 4048 kmp_info_t *thread = __kmp_threads[gtid]; 4049 kmp_team_t *team = thread->th.th_team; 4050 kmp_task_team_t *task_team = thread->th.th_task_team; 4051 4052 // we need to wait for the proxy tasks before finishing the thread 4053 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4054 #if OMPT_SUPPORT 4055 // the runtime is shutting down so we won't report any events 4056 thread->th.ompt_thread_info.state = ompt_state_undefined; 4057 #endif 4058 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4059 } 4060 4061 __kmp_reset_root(gtid, root); 4062 4063 KMP_MB(); 4064 KC_TRACE(10, 4065 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4066 4067 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4068 } 4069 4070 #if KMP_OS_WINDOWS 4071 /* __kmp_forkjoin_lock must be already held 4072 Unregisters a root thread that is not the current thread. Returns the number 4073 of __kmp_threads entries freed as a result. 
*/ 4074 static int __kmp_unregister_root_other_thread(int gtid) { 4075 kmp_root_t *root = __kmp_root[gtid]; 4076 int r; 4077 4078 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4079 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4080 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4081 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4082 KMP_ASSERT(root->r.r_active == FALSE); 4083 4084 r = __kmp_reset_root(gtid, root); 4085 KC_TRACE(10, 4086 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4087 return r; 4088 } 4089 #endif 4090 4091 #if KMP_DEBUG 4092 void __kmp_task_info() { 4093 4094 kmp_int32 gtid = __kmp_entry_gtid(); 4095 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4096 kmp_info_t *this_thr = __kmp_threads[gtid]; 4097 kmp_team_t *steam = this_thr->th.th_serial_team; 4098 kmp_team_t *team = this_thr->th.th_team; 4099 4100 __kmp_printf( 4101 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4102 "ptask=%p\n", 4103 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4104 team->t.t_implicit_task_taskdata[tid].td_parent); 4105 } 4106 #endif // KMP_DEBUG 4107 4108 /* TODO optimize with one big memclr, take out what isn't needed, split 4109 responsibility to workers as much as possible, and delay initialization of 4110 features as much as possible */ 4111 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4112 int tid, int gtid) { 4113 /* this_thr->th.th_info.ds.ds_gtid is setup in 4114 kmp_allocate_thread/create_worker. 4115 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4116 KMP_DEBUG_ASSERT(this_thr != NULL); 4117 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4118 KMP_DEBUG_ASSERT(team); 4119 KMP_DEBUG_ASSERT(team->t.t_threads); 4120 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4121 kmp_info_t *master = team->t.t_threads[0]; 4122 KMP_DEBUG_ASSERT(master); 4123 KMP_DEBUG_ASSERT(master->th.th_root); 4124 4125 KMP_MB(); 4126 4127 TCW_SYNC_PTR(this_thr->th.th_team, team); 4128 4129 this_thr->th.th_info.ds.ds_tid = tid; 4130 this_thr->th.th_set_nproc = 0; 4131 if (__kmp_tasking_mode != tskm_immediate_exec) 4132 // When tasking is possible, threads are not safe to reap until they are 4133 // done tasking; this will be set when tasking code is exited in wait 4134 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4135 else // no tasking --> always safe to reap 4136 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4137 this_thr->th.th_set_proc_bind = proc_bind_default; 4138 #if KMP_AFFINITY_SUPPORTED 4139 this_thr->th.th_new_place = this_thr->th.th_current_place; 4140 #endif 4141 this_thr->th.th_root = master->th.th_root; 4142 4143 /* setup the thread's cache of the team structure */ 4144 this_thr->th.th_team_nproc = team->t.t_nproc; 4145 this_thr->th.th_team_master = master; 4146 this_thr->th.th_team_serialized = team->t.t_serialized; 4147 4148 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4149 4150 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4151 tid, gtid, this_thr, this_thr->th.th_current_task)); 4152 4153 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4154 team, tid, TRUE); 4155 4156 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4157 tid, gtid, this_thr, this_thr->th.th_current_task)); 4158 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4159 // __kmp_initialize_team()? 
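// (Roughly, the block below wires this thread to its slot in the team's
// t_dispatch array and sizes the private dispatch buffers: a real team gets
// __kmp_dispatch_num_buffers entries so that back-to-back dynamically
// scheduled (e.g. nowait) loops can cycle through buffers while slower
// threads still reference an earlier one, while a team with t_max_nproc == 1
// gets a single entry, since no other thread can still be using an older
// buffer.)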
4160 4161 /* TODO no worksharing in speculative threads */ 4162 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4163 4164 this_thr->th.th_local.this_construct = 0; 4165 4166 if (!this_thr->th.th_pri_common) { 4167 this_thr->th.th_pri_common = 4168 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4169 if (__kmp_storage_map) { 4170 __kmp_print_storage_map_gtid( 4171 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4172 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4173 } 4174 this_thr->th.th_pri_head = NULL; 4175 } 4176 4177 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4178 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4179 // Make new thread's CG root same as primary thread's 4180 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4181 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4182 if (tmp) { 4183 // worker changes CG, need to check if old CG should be freed 4184 int i = tmp->cg_nthreads--; 4185 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4186 " on node %p of thread %p to %d\n", 4187 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4188 if (i == 1) { 4189 __kmp_free(tmp); // last thread left CG --> free it 4190 } 4191 } 4192 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4193 // Increment new thread's CG root's counter to add the new thread 4194 this_thr->th.th_cg_roots->cg_nthreads++; 4195 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4196 " node %p of thread %p to %d\n", 4197 this_thr, this_thr->th.th_cg_roots, 4198 this_thr->th.th_cg_roots->cg_root, 4199 this_thr->th.th_cg_roots->cg_nthreads)); 4200 this_thr->th.th_current_task->td_icvs.thread_limit = 4201 this_thr->th.th_cg_roots->cg_thread_limit; 4202 } 4203 4204 /* Initialize dynamic dispatch */ 4205 { 4206 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4207 // Use team max_nproc since this will never change for the team. 4208 size_t disp_size = 4209 sizeof(dispatch_private_info_t) * 4210 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4211 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4212 team->t.t_max_nproc)); 4213 KMP_ASSERT(dispatch); 4214 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4215 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4216 4217 dispatch->th_disp_index = 0; 4218 dispatch->th_doacross_buf_idx = 0; 4219 if (!dispatch->th_disp_buffer) { 4220 dispatch->th_disp_buffer = 4221 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4222 4223 if (__kmp_storage_map) { 4224 __kmp_print_storage_map_gtid( 4225 gtid, &dispatch->th_disp_buffer[0], 4226 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4227 ? 
1 4228 : __kmp_dispatch_num_buffers], 4229 disp_size, 4230 "th_%d.th_dispatch.th_disp_buffer " 4231 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4232 gtid, team->t.t_id, gtid); 4233 } 4234 } else { 4235 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4236 } 4237 4238 dispatch->th_dispatch_pr_current = 0; 4239 dispatch->th_dispatch_sh_current = 0; 4240 4241 dispatch->th_deo_fcn = 0; /* ORDERED */ 4242 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4243 } 4244 4245 this_thr->th.th_next_pool = NULL; 4246 4247 if (!this_thr->th.th_task_state_memo_stack) { 4248 size_t i; 4249 this_thr->th.th_task_state_memo_stack = 4250 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4251 this_thr->th.th_task_state_top = 0; 4252 this_thr->th.th_task_state_stack_sz = 4; 4253 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4254 ++i) // zero init the stack 4255 this_thr->th.th_task_state_memo_stack[i] = 0; 4256 } 4257 4258 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4259 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4260 4261 KMP_MB(); 4262 } 4263 4264 /* allocate a new thread for the requesting team. this is only called from 4265 within a forkjoin critical section. we will first try to get an available 4266 thread from the thread pool. if none is available, we will fork a new one 4267 assuming we are able to create a new one. this should be assured, as the 4268 caller should check on this first. */ 4269 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4270 int new_tid) { 4271 kmp_team_t *serial_team; 4272 kmp_info_t *new_thr; 4273 int new_gtid; 4274 4275 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4276 KMP_DEBUG_ASSERT(root && team); 4277 #if !KMP_NESTED_HOT_TEAMS 4278 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4279 #endif 4280 KMP_MB(); 4281 4282 /* first, try to get one from the thread pool */ 4283 if (__kmp_thread_pool) { 4284 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4285 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4286 if (new_thr == __kmp_thread_pool_insert_pt) { 4287 __kmp_thread_pool_insert_pt = NULL; 4288 } 4289 TCW_4(new_thr->th.th_in_pool, FALSE); 4290 __kmp_suspend_initialize_thread(new_thr); 4291 __kmp_lock_suspend_mx(new_thr); 4292 if (new_thr->th.th_active_in_pool == TRUE) { 4293 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4294 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4295 new_thr->th.th_active_in_pool = FALSE; 4296 } 4297 __kmp_unlock_suspend_mx(new_thr); 4298 4299 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4300 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4301 KMP_ASSERT(!new_thr->th.th_team); 4302 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4303 4304 /* setup the thread structure */ 4305 __kmp_initialize_info(new_thr, team, new_tid, 4306 new_thr->th.th_info.ds.ds_gtid); 4307 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4308 4309 TCW_4(__kmp_nth, __kmp_nth + 1); 4310 4311 new_thr->th.th_task_state = 0; 4312 new_thr->th.th_task_state_top = 0; 4313 new_thr->th.th_task_state_stack_sz = 4; 4314 4315 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 4316 // Make sure pool thread has transitioned to waiting on own thread struct 4317 KMP_DEBUG_ASSERT(new_thr->th.th_used_in_team.load() == 0); 4318 // Thread activated in __kmp_allocate_team when increasing team size 4319 } 4320 4321 #ifdef KMP_ADJUST_BLOCKTIME 4322 /* Adjust blocktime back to zero if necessary */ 4323 /* Middle initialization might not have occurred yet */ 4324 
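// (Rationale: once the runtime is oversubscribed -- more live threads than
// __kmp_avail_proc -- spinning out the full blocktime at barriers only takes
// cycles away from runnable threads, so the effective blocktime is forced to
// zero and idle threads give up the processor promptly.)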
if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4325 if (__kmp_nth > __kmp_avail_proc) { 4326 __kmp_zero_bt = TRUE; 4327 } 4328 } 4329 #endif /* KMP_ADJUST_BLOCKTIME */ 4330 4331 #if KMP_DEBUG 4332 // If thread entered pool via __kmp_free_thread, wait_flag should != 4333 // KMP_BARRIER_PARENT_FLAG. 4334 int b; 4335 kmp_balign_t *balign = new_thr->th.th_bar; 4336 for (b = 0; b < bs_last_barrier; ++b) 4337 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4338 #endif 4339 4340 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4341 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4342 4343 KMP_MB(); 4344 return new_thr; 4345 } 4346 4347 /* no, well fork a new one */ 4348 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4349 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4350 4351 #if KMP_USE_MONITOR 4352 // If this is the first worker thread the RTL is creating, then also 4353 // launch the monitor thread. We try to do this as early as possible. 4354 if (!TCR_4(__kmp_init_monitor)) { 4355 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4356 if (!TCR_4(__kmp_init_monitor)) { 4357 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4358 TCW_4(__kmp_init_monitor, 1); 4359 __kmp_create_monitor(&__kmp_monitor); 4360 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4361 #if KMP_OS_WINDOWS 4362 // AC: wait until monitor has started. This is a fix for CQ232808. 4363 // The reason is that if the library is loaded/unloaded in a loop with 4364 // small (parallel) work in between, then there is high probability that 4365 // monitor thread started after the library shutdown. At shutdown it is 4366 // too late to cope with the problem, because when the primary thread is 4367 // in DllMain (process detach) the monitor has no chances to start (it is 4368 // blocked), and primary thread has no means to inform the monitor that 4369 // the library has gone, because all the memory which the monitor can 4370 // access is going to be released/reset. 4371 while (TCR_4(__kmp_init_monitor) < 2) { 4372 KMP_YIELD(TRUE); 4373 } 4374 KF_TRACE(10, ("after monitor thread has started\n")); 4375 #endif 4376 } 4377 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4378 } 4379 #endif 4380 4381 KMP_MB(); 4382 4383 { 4384 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4385 ? 1 4386 : __kmp_hidden_helper_threads_num + 1; 4387 4388 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4389 ++new_gtid) { 4390 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4391 } 4392 4393 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4394 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4395 } 4396 } 4397 4398 /* allocate space for it. 
*/ 4399 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4400 4401 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4402 4403 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4404 // suppress race conditions detection on synchronization flags in debug mode 4405 // this helps to analyze library internals eliminating false positives 4406 __itt_suppress_mark_range( 4407 __itt_suppress_range, __itt_suppress_threading_errors, 4408 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4409 __itt_suppress_mark_range( 4410 __itt_suppress_range, __itt_suppress_threading_errors, 4411 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4412 #if KMP_OS_WINDOWS 4413 __itt_suppress_mark_range( 4414 __itt_suppress_range, __itt_suppress_threading_errors, 4415 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4416 #else 4417 __itt_suppress_mark_range(__itt_suppress_range, 4418 __itt_suppress_threading_errors, 4419 &new_thr->th.th_suspend_init_count, 4420 sizeof(new_thr->th.th_suspend_init_count)); 4421 #endif 4422 // TODO: check if we need to also suppress b_arrived flags 4423 __itt_suppress_mark_range(__itt_suppress_range, 4424 __itt_suppress_threading_errors, 4425 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4426 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4427 __itt_suppress_mark_range(__itt_suppress_range, 4428 __itt_suppress_threading_errors, 4429 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4430 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4431 __itt_suppress_mark_range(__itt_suppress_range, 4432 __itt_suppress_threading_errors, 4433 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4434 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4435 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4436 if (__kmp_storage_map) { 4437 __kmp_print_thread_storage_map(new_thr, new_gtid); 4438 } 4439 4440 // add the reserve serialized team, initialized from the team's primary thread 4441 { 4442 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4443 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4444 new_thr->th.th_serial_team = serial_team = 4445 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4446 #if OMPT_SUPPORT 4447 ompt_data_none, // root parallel id 4448 #endif 4449 proc_bind_default, &r_icvs, 4450 0 USE_NESTED_HOT_ARG(NULL)); 4451 } 4452 KMP_ASSERT(serial_team); 4453 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4454 // execution (it is unused for now). 
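// (Every thread keeps this one-slot serial team in reserve: it is the team
// installed when the thread executes a serialized parallel region -- for
// example a nested parallel that ends up running on a single thread -- so
// entering that path does not normally require allocating a new team.)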
4455 serial_team->t.t_threads[0] = new_thr; 4456 KF_TRACE(10, 4457 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4458 new_thr)); 4459 4460 /* setup the thread structures */ 4461 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4462 4463 #if USE_FAST_MEMORY 4464 __kmp_initialize_fast_memory(new_thr); 4465 #endif /* USE_FAST_MEMORY */ 4466 4467 #if KMP_USE_BGET 4468 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4469 __kmp_initialize_bget(new_thr); 4470 #endif 4471 4472 __kmp_init_random(new_thr); // Initialize random number generator 4473 4474 /* Initialize these only once when thread is grabbed for a team allocation */ 4475 KA_TRACE(20, 4476 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4477 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4478 4479 int b; 4480 kmp_balign_t *balign = new_thr->th.th_bar; 4481 for (b = 0; b < bs_last_barrier; ++b) { 4482 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4483 balign[b].bb.team = NULL; 4484 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4485 balign[b].bb.use_oncore_barrier = 0; 4486 } 4487 4488 TCW_PTR(new_thr->th.th_sleep_loc, NULL); 4489 new_thr->th.th_sleep_loc_type = flag_unset; 4490 4491 new_thr->th.th_spin_here = FALSE; 4492 new_thr->th.th_next_waiting = 0; 4493 #if KMP_OS_UNIX 4494 new_thr->th.th_blocking = false; 4495 #endif 4496 4497 #if KMP_AFFINITY_SUPPORTED 4498 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4499 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4500 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4501 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4502 #endif 4503 new_thr->th.th_def_allocator = __kmp_def_allocator; 4504 new_thr->th.th_prev_level = 0; 4505 new_thr->th.th_prev_num_threads = 1; 4506 4507 TCW_4(new_thr->th.th_in_pool, FALSE); 4508 new_thr->th.th_active_in_pool = FALSE; 4509 TCW_4(new_thr->th.th_active, TRUE); 4510 4511 /* adjust the global counters */ 4512 __kmp_all_nth++; 4513 __kmp_nth++; 4514 4515 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4516 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4517 if (__kmp_adjust_gtid_mode) { 4518 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4519 if (TCR_4(__kmp_gtid_mode) != 2) { 4520 TCW_4(__kmp_gtid_mode, 2); 4521 } 4522 } else { 4523 if (TCR_4(__kmp_gtid_mode) != 1) { 4524 TCW_4(__kmp_gtid_mode, 1); 4525 } 4526 } 4527 } 4528 4529 #ifdef KMP_ADJUST_BLOCKTIME 4530 /* Adjust blocktime back to zero if necessary */ 4531 /* Middle initialization might not have occurred yet */ 4532 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4533 if (__kmp_nth > __kmp_avail_proc) { 4534 __kmp_zero_bt = TRUE; 4535 } 4536 } 4537 #endif /* KMP_ADJUST_BLOCKTIME */ 4538 4539 /* actually fork it and create the new worker thread */ 4540 KF_TRACE( 4541 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4542 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4543 KF_TRACE(10, 4544 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4545 4546 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4547 new_gtid)); 4548 KMP_MB(); 4549 return new_thr; 4550 } 4551 4552 /* Reinitialize team for reuse. 4553 The hot team code calls this case at every fork barrier, so EPCC barrier 4554 test are extremely sensitive to changes in it, esp. writes to the team 4555 struct, which cause a cache invalidation in all threads. 
4556 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! */ 4557 static void __kmp_reinitialize_team(kmp_team_t *team, 4558 kmp_internal_control_t *new_icvs, 4559 ident_t *loc) { 4560 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4561 team->t.t_threads[0], team)); 4562 KMP_DEBUG_ASSERT(team && new_icvs); 4563 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4564 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4565 4566 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4567 // Copy ICVs to the primary thread's implicit taskdata 4568 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4569 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4570 4571 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4572 team->t.t_threads[0], team)); 4573 } 4574 4575 /* Initialize the team data structure. 4576 This assumes the t_threads and t_max_nproc are already set. 4577 Also, we don't touch the arguments */ 4578 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4579 kmp_internal_control_t *new_icvs, 4580 ident_t *loc) { 4581 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4582 4583 /* verify */ 4584 KMP_DEBUG_ASSERT(team); 4585 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4586 KMP_DEBUG_ASSERT(team->t.t_threads); 4587 KMP_MB(); 4588 4589 team->t.t_master_tid = 0; /* not needed */ 4590 /* team->t.t_master_bar; not needed */ 4591 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4592 team->t.t_nproc = new_nproc; 4593 4594 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4595 team->t.t_next_pool = NULL; 4596 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4597 * up hot team */ 4598 4599 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4600 team->t.t_invoke = NULL; /* not needed */ 4601 4602 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4603 team->t.t_sched.sched = new_icvs->sched.sched; 4604 4605 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4606 team->t.t_fp_control_saved = FALSE; /* not needed */ 4607 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4608 team->t.t_mxcsr = 0; /* not needed */ 4609 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4610 4611 team->t.t_construct = 0; 4612 4613 team->t.t_ordered.dt.t_value = 0; 4614 team->t.t_master_active = FALSE; 4615 4616 #ifdef KMP_DEBUG 4617 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4618 #endif 4619 #if KMP_OS_WINDOWS 4620 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4621 #endif 4622 4623 team->t.t_control_stack_top = NULL; 4624 4625 __kmp_reinitialize_team(team, new_icvs, loc); 4626 4627 KMP_MB(); 4628 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4629 } 4630 4631 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4632 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/ 4633 static void 4634 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4635 if (KMP_AFFINITY_CAPABLE()) { 4636 int status; 4637 if (old_mask != NULL) { 4638 status = __kmp_get_system_affinity(old_mask, TRUE); 4639 int error = errno; 4640 if (status != 0) { 4641 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4642 __kmp_msg_null); 4643 } 4644 } 4645 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4646 } 4647 } 4648 #endif 4649 4650 #if KMP_AFFINITY_SUPPORTED 4651 4652 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4653 // It calculates the worker + primary thread's partition based upon the parent 4654 // thread's partition, and binds each worker to a thread in their partition. 4655 // The primary thread's partition should already include its current binding. 4656 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4657 // Do not partition places for the hidden helper team 4658 if (KMP_HIDDEN_HELPER_TEAM(team)) 4659 return; 4660 // Copy the primary thread's place partition to the team struct 4661 kmp_info_t *master_th = team->t.t_threads[0]; 4662 KMP_DEBUG_ASSERT(master_th != NULL); 4663 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4664 int first_place = master_th->th.th_first_place; 4665 int last_place = master_th->th.th_last_place; 4666 int masters_place = master_th->th.th_current_place; 4667 team->t.t_first_place = first_place; 4668 team->t.t_last_place = last_place; 4669 4670 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4671 "bound to place %d partition = [%d,%d]\n", 4672 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4673 team->t.t_id, masters_place, first_place, last_place)); 4674 4675 switch (proc_bind) { 4676 4677 case proc_bind_default: 4678 // Serial teams might have the proc_bind policy set to proc_bind_default. 4679 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 
4680 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4681 break; 4682 4683 case proc_bind_primary: { 4684 int f; 4685 int n_th = team->t.t_nproc; 4686 for (f = 1; f < n_th; f++) { 4687 kmp_info_t *th = team->t.t_threads[f]; 4688 KMP_DEBUG_ASSERT(th != NULL); 4689 th->th.th_first_place = first_place; 4690 th->th.th_last_place = last_place; 4691 th->th.th_new_place = masters_place; 4692 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4693 team->t.t_display_affinity != 1) { 4694 team->t.t_display_affinity = 1; 4695 } 4696 4697 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4698 "partition = [%d,%d]\n", 4699 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4700 f, masters_place, first_place, last_place)); 4701 } 4702 } break; 4703 4704 case proc_bind_close: { 4705 int f; 4706 int n_th = team->t.t_nproc; 4707 int n_places; 4708 if (first_place <= last_place) { 4709 n_places = last_place - first_place + 1; 4710 } else { 4711 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4712 } 4713 if (n_th <= n_places) { 4714 int place = masters_place; 4715 for (f = 1; f < n_th; f++) { 4716 kmp_info_t *th = team->t.t_threads[f]; 4717 KMP_DEBUG_ASSERT(th != NULL); 4718 4719 if (place == last_place) { 4720 place = first_place; 4721 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4722 place = 0; 4723 } else { 4724 place++; 4725 } 4726 th->th.th_first_place = first_place; 4727 th->th.th_last_place = last_place; 4728 th->th.th_new_place = place; 4729 if (__kmp_display_affinity && place != th->th.th_current_place && 4730 team->t.t_display_affinity != 1) { 4731 team->t.t_display_affinity = 1; 4732 } 4733 4734 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4735 "partition = [%d,%d]\n", 4736 __kmp_gtid_from_thread(team->t.t_threads[f]), 4737 team->t.t_id, f, place, first_place, last_place)); 4738 } 4739 } else { 4740 int S, rem, gap, s_count; 4741 S = n_th / n_places; 4742 s_count = 0; 4743 rem = n_th - (S * n_places); 4744 gap = rem > 0 ? 
n_places / rem : n_places; 4745 int place = masters_place; 4746 int gap_ct = gap; 4747 for (f = 0; f < n_th; f++) { 4748 kmp_info_t *th = team->t.t_threads[f]; 4749 KMP_DEBUG_ASSERT(th != NULL); 4750 4751 th->th.th_first_place = first_place; 4752 th->th.th_last_place = last_place; 4753 th->th.th_new_place = place; 4754 if (__kmp_display_affinity && place != th->th.th_current_place && 4755 team->t.t_display_affinity != 1) { 4756 team->t.t_display_affinity = 1; 4757 } 4758 s_count++; 4759 4760 if ((s_count == S) && rem && (gap_ct == gap)) { 4761 // do nothing, add an extra thread to place on next iteration 4762 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4763 // we added an extra thread to this place; move to next place 4764 if (place == last_place) { 4765 place = first_place; 4766 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4767 place = 0; 4768 } else { 4769 place++; 4770 } 4771 s_count = 0; 4772 gap_ct = 1; 4773 rem--; 4774 } else if (s_count == S) { // place full; don't add extra 4775 if (place == last_place) { 4776 place = first_place; 4777 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4778 place = 0; 4779 } else { 4780 place++; 4781 } 4782 gap_ct++; 4783 s_count = 0; 4784 } 4785 4786 KA_TRACE(100, 4787 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4788 "partition = [%d,%d]\n", 4789 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4790 th->th.th_new_place, first_place, last_place)); 4791 } 4792 KMP_DEBUG_ASSERT(place == masters_place); 4793 } 4794 } break; 4795 4796 case proc_bind_spread: { 4797 int f; 4798 int n_th = team->t.t_nproc; 4799 int n_places; 4800 int thidx; 4801 if (first_place <= last_place) { 4802 n_places = last_place - first_place + 1; 4803 } else { 4804 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4805 } 4806 if (n_th <= n_places) { 4807 int place = -1; 4808 4809 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4810 int S = n_places / n_th; 4811 int s_count, rem, gap, gap_ct; 4812 4813 place = masters_place; 4814 rem = n_places - n_th * S; 4815 gap = rem ? 
n_th / rem : 1; 4816 gap_ct = gap; 4817 thidx = n_th; 4818 if (update_master_only == 1) 4819 thidx = 1; 4820 for (f = 0; f < thidx; f++) { 4821 kmp_info_t *th = team->t.t_threads[f]; 4822 KMP_DEBUG_ASSERT(th != NULL); 4823 4824 th->th.th_first_place = place; 4825 th->th.th_new_place = place; 4826 if (__kmp_display_affinity && place != th->th.th_current_place && 4827 team->t.t_display_affinity != 1) { 4828 team->t.t_display_affinity = 1; 4829 } 4830 s_count = 1; 4831 while (s_count < S) { 4832 if (place == last_place) { 4833 place = first_place; 4834 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4835 place = 0; 4836 } else { 4837 place++; 4838 } 4839 s_count++; 4840 } 4841 if (rem && (gap_ct == gap)) { 4842 if (place == last_place) { 4843 place = first_place; 4844 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4845 place = 0; 4846 } else { 4847 place++; 4848 } 4849 rem--; 4850 gap_ct = 0; 4851 } 4852 th->th.th_last_place = place; 4853 gap_ct++; 4854 4855 if (place == last_place) { 4856 place = first_place; 4857 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4858 place = 0; 4859 } else { 4860 place++; 4861 } 4862 4863 KA_TRACE(100, 4864 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4865 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4866 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4867 f, th->th.th_new_place, th->th.th_first_place, 4868 th->th.th_last_place, __kmp_affinity_num_masks)); 4869 } 4870 } else { 4871 /* Having uniform space of available computation places I can create 4872 T partitions of round(P/T) size and put threads into the first 4873 place of each partition. */ 4874 double current = static_cast<double>(masters_place); 4875 double spacing = 4876 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4877 int first, last; 4878 kmp_info_t *th; 4879 4880 thidx = n_th + 1; 4881 if (update_master_only == 1) 4882 thidx = 1; 4883 for (f = 0; f < thidx; f++) { 4884 first = static_cast<int>(current); 4885 last = static_cast<int>(current + spacing) - 1; 4886 KMP_DEBUG_ASSERT(last >= first); 4887 if (first >= n_places) { 4888 if (masters_place) { 4889 first -= n_places; 4890 last -= n_places; 4891 if (first == (masters_place + 1)) { 4892 KMP_DEBUG_ASSERT(f == n_th); 4893 first--; 4894 } 4895 if (last == masters_place) { 4896 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4897 last--; 4898 } 4899 } else { 4900 KMP_DEBUG_ASSERT(f == n_th); 4901 first = 0; 4902 last = 0; 4903 } 4904 } 4905 if (last >= n_places) { 4906 last = (n_places - 1); 4907 } 4908 place = first; 4909 current += spacing; 4910 if (f < n_th) { 4911 KMP_DEBUG_ASSERT(0 <= first); 4912 KMP_DEBUG_ASSERT(n_places > first); 4913 KMP_DEBUG_ASSERT(0 <= last); 4914 KMP_DEBUG_ASSERT(n_places > last); 4915 KMP_DEBUG_ASSERT(last_place >= first_place); 4916 th = team->t.t_threads[f]; 4917 KMP_DEBUG_ASSERT(th); 4918 th->th.th_first_place = first; 4919 th->th.th_new_place = place; 4920 th->th.th_last_place = last; 4921 if (__kmp_display_affinity && place != th->th.th_current_place && 4922 team->t.t_display_affinity != 1) { 4923 team->t.t_display_affinity = 1; 4924 } 4925 KA_TRACE(100, 4926 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4927 "partition = [%d,%d], spacing = %.4f\n", 4928 __kmp_gtid_from_thread(team->t.t_threads[f]), 4929 team->t.t_id, f, th->th.th_new_place, 4930 th->th.th_first_place, th->th.th_last_place, spacing)); 4931 } 4932 } 4933 } 4934 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4935 } else { 4936 int S, rem, gap, 
s_count; 4937 S = n_th / n_places; 4938 s_count = 0; 4939 rem = n_th - (S * n_places); 4940 gap = rem > 0 ? n_places / rem : n_places; 4941 int place = masters_place; 4942 int gap_ct = gap; 4943 thidx = n_th; 4944 if (update_master_only == 1) 4945 thidx = 1; 4946 for (f = 0; f < thidx; f++) { 4947 kmp_info_t *th = team->t.t_threads[f]; 4948 KMP_DEBUG_ASSERT(th != NULL); 4949 4950 th->th.th_first_place = place; 4951 th->th.th_last_place = place; 4952 th->th.th_new_place = place; 4953 if (__kmp_display_affinity && place != th->th.th_current_place && 4954 team->t.t_display_affinity != 1) { 4955 team->t.t_display_affinity = 1; 4956 } 4957 s_count++; 4958 4959 if ((s_count == S) && rem && (gap_ct == gap)) { 4960 // do nothing, add an extra thread to place on next iteration 4961 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4962 // we added an extra thread to this place; move on to next place 4963 if (place == last_place) { 4964 place = first_place; 4965 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4966 place = 0; 4967 } else { 4968 place++; 4969 } 4970 s_count = 0; 4971 gap_ct = 1; 4972 rem--; 4973 } else if (s_count == S) { // place is full; don't add extra thread 4974 if (place == last_place) { 4975 place = first_place; 4976 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4977 place = 0; 4978 } else { 4979 place++; 4980 } 4981 gap_ct++; 4982 s_count = 0; 4983 } 4984 4985 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4986 "partition = [%d,%d]\n", 4987 __kmp_gtid_from_thread(team->t.t_threads[f]), 4988 team->t.t_id, f, th->th.th_new_place, 4989 th->th.th_first_place, th->th.th_last_place)); 4990 } 4991 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4992 } 4993 } break; 4994 4995 default: 4996 break; 4997 } 4998 4999 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 5000 } 5001 5002 #endif // KMP_AFFINITY_SUPPORTED 5003 5004 /* allocate a new team data structure to use. take one off of the free pool if 5005 available */ 5006 kmp_team_t * 5007 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 5008 #if OMPT_SUPPORT 5009 ompt_data_t ompt_parallel_data, 5010 #endif 5011 kmp_proc_bind_t new_proc_bind, 5012 kmp_internal_control_t *new_icvs, 5013 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5014 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 5015 int f; 5016 kmp_team_t *team; 5017 int use_hot_team = !root->r.r_active; 5018 int level = 0; 5019 5020 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 5021 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 5022 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 5023 KMP_MB(); 5024 5025 #if KMP_NESTED_HOT_TEAMS 5026 kmp_hot_team_ptr_t *hot_teams; 5027 if (master) { 5028 team = master->th.th_team; 5029 level = team->t.t_active_level; 5030 if (master->th.th_teams_microtask) { // in teams construct? 
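// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the runtime, never compiled): a minimal,
// self-contained analogue of the S/rem/gap bookkeeping used by the
// proc_bind_close and proc_bind_spread branches of __kmp_partition_places
// above when there are more threads than places. Places are numbered
// 0..n_places-1 and the walk starts at place 0 here; the real code walks a
// [first_place, last_place] window over __kmp_affinity_num_masks starting at
// the primary thread's place. All names below are local to the sketch.
#if 0
#include <vector>

static std::vector<int> sketch_distribute_close(int n_th, int n_places) {
  std::vector<int> place_of(n_th);
  int S = n_th / n_places;       // base number of threads per place
  int rem = n_th - S * n_places; // how many places must take one extra thread
  int gap = rem > 0 ? n_places / rem : n_places; // spacing of "extra" places
  int place = 0, s_count = 0, gap_ct = gap;
  for (int f = 0; f < n_th; ++f) {
    place_of[f] = place;
    ++s_count;
    if (s_count == S && rem && gap_ct == gap) {
      // keep this place open: it receives one extra thread next iteration
    } else if (s_count == S + 1 && rem && gap_ct == gap) {
      place = (place + 1) % n_places; // extra thread placed; advance
      s_count = 0;
      gap_ct = 1;
      --rem;
    } else if (s_count == S) { // place full and no extra due here; advance
      place = (place + 1) % n_places;
      ++gap_ct;
      s_count = 0;
    }
  }
  // Example: n_th = 7, n_places = 3 gives S = 2, rem = 1, gap = 3. Place 0
  // receives three threads, places 1 and 2 receive two each, and the cursor
  // ends back at the starting place, matching the assertion
  // place == masters_place above.
  return place_of;
}
#endif
// ---------------------------------------------------------------------------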
5031 if (master->th.th_teams_size.nteams > 1 && 5032 ( // #teams > 1 5033 team->t.t_pkfn == 5034 (microtask_t)__kmp_teams_master || // inner fork of the teams 5035 master->th.th_teams_level < 5036 team->t.t_level)) { // or nested parallel inside the teams 5037 ++level; // not increment if #teams==1, or for outer fork of the teams; 5038 // increment otherwise 5039 } 5040 } 5041 hot_teams = master->th.th_hot_teams; 5042 if (level < __kmp_hot_teams_max_level && hot_teams && 5043 hot_teams[level].hot_team) { 5044 // hot team has already been allocated for given level 5045 use_hot_team = 1; 5046 } else { 5047 use_hot_team = 0; 5048 } 5049 } else { 5050 // check we won't access uninitialized hot_teams, just in case 5051 KMP_DEBUG_ASSERT(new_nproc == 1); 5052 } 5053 #endif 5054 // Optimization to use a "hot" team 5055 if (use_hot_team && new_nproc > 1) { 5056 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5057 #if KMP_NESTED_HOT_TEAMS 5058 team = hot_teams[level].hot_team; 5059 #else 5060 team = root->r.r_hot_team; 5061 #endif 5062 #if KMP_DEBUG 5063 if (__kmp_tasking_mode != tskm_immediate_exec) { 5064 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5065 "task_team[1] = %p before reinit\n", 5066 team->t.t_task_team[0], team->t.t_task_team[1])); 5067 } 5068 #endif 5069 5070 if (team->t.t_nproc != new_nproc && 5071 __kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5072 // Distributed barrier may need a resize 5073 int old_nthr = team->t.t_nproc; 5074 __kmp_resize_dist_barrier(team, old_nthr, new_nproc); 5075 } 5076 5077 // Has the number of threads changed? 5078 /* Let's assume the most common case is that the number of threads is 5079 unchanged, and put that case first. */ 5080 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5081 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5082 // This case can mean that omp_set_num_threads() was called and the hot 5083 // team size was already reduced, so we check the special flag 5084 if (team->t.t_size_changed == -1) { 5085 team->t.t_size_changed = 1; 5086 } else { 5087 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5088 } 5089 5090 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5091 kmp_r_sched_t new_sched = new_icvs->sched; 5092 // set primary thread's schedule as new run-time schedule 5093 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5094 5095 __kmp_reinitialize_team(team, new_icvs, 5096 root->r.r_uber_thread->th.th_ident); 5097 5098 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5099 team->t.t_threads[0], team)); 5100 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5101 5102 #if KMP_AFFINITY_SUPPORTED 5103 if ((team->t.t_size_changed == 0) && 5104 (team->t.t_proc_bind == new_proc_bind)) { 5105 if (new_proc_bind == proc_bind_spread) { 5106 __kmp_partition_places( 5107 team, 1); // add flag to update only master for spread 5108 } 5109 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5110 "proc_bind = %d, partition = [%d,%d]\n", 5111 team->t.t_id, new_proc_bind, team->t.t_first_place, 5112 team->t.t_last_place)); 5113 } else { 5114 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5115 __kmp_partition_places(team); 5116 } 5117 #else 5118 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5119 #endif /* KMP_AFFINITY_SUPPORTED */ 5120 } else if (team->t.t_nproc > new_nproc) { 5121 KA_TRACE(20, 5122 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5123 new_nproc)); 5124 5125 
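// Illustrative aside (not part of the runtime, never compiled): the hot-team
// reuse path above funnels most field updates through KMP_CHECK_UPDATE.
// Assuming that macro stores only when the value actually differs -- as its
// uses here suggest -- the idiom looks roughly like the sketch below; the
// point is presumably to avoid rewriting team fields that workers may be
// reading concurrently when nothing has changed.
#if 0
static inline void sketch_check_update(int &dst, int src) {
  if (dst != src) // skip the store entirely when the value is unchanged,
    dst = src;    // leaving the shared team cache line untouched
}
#endif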
team->t.t_size_changed = 1; 5126 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5127 // Barrier size already reduced earlier in this function 5128 // Activate team threads via th_used_in_team 5129 __kmp_add_threads_to_team(team, new_nproc); 5130 } 5131 #if KMP_NESTED_HOT_TEAMS 5132 if (__kmp_hot_teams_mode == 0) { 5133 // AC: saved number of threads should correspond to team's value in this 5134 // mode, can be bigger in mode 1, when hot team has threads in reserve 5135 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5136 hot_teams[level].hot_team_nth = new_nproc; 5137 #endif // KMP_NESTED_HOT_TEAMS 5138 /* release the extra threads we don't need any more */ 5139 for (f = new_nproc; f < team->t.t_nproc; f++) { 5140 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5141 if (__kmp_tasking_mode != tskm_immediate_exec) { 5142 // When decreasing team size, threads no longer in the team should 5143 // unref task team. 5144 team->t.t_threads[f]->th.th_task_team = NULL; 5145 } 5146 __kmp_free_thread(team->t.t_threads[f]); 5147 team->t.t_threads[f] = NULL; 5148 } 5149 #if KMP_NESTED_HOT_TEAMS 5150 } // (__kmp_hot_teams_mode == 0) 5151 else { 5152 // When keeping extra threads in team, switch threads to wait on own 5153 // b_go flag 5154 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5155 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5156 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5157 for (int b = 0; b < bs_last_barrier; ++b) { 5158 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5159 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5160 } 5161 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5162 } 5163 } 5164 } 5165 #endif // KMP_NESTED_HOT_TEAMS 5166 team->t.t_nproc = new_nproc; 5167 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5168 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5169 __kmp_reinitialize_team(team, new_icvs, 5170 root->r.r_uber_thread->th.th_ident); 5171 5172 // Update remaining threads 5173 for (f = 0; f < new_nproc; ++f) { 5174 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5175 } 5176 5177 // restore the current task state of the primary thread: should be the 5178 // implicit task 5179 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5180 team->t.t_threads[0], team)); 5181 5182 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5183 5184 #ifdef KMP_DEBUG 5185 for (f = 0; f < team->t.t_nproc; f++) { 5186 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5187 team->t.t_threads[f]->th.th_team_nproc == 5188 team->t.t_nproc); 5189 } 5190 #endif 5191 5192 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5193 #if KMP_AFFINITY_SUPPORTED 5194 __kmp_partition_places(team); 5195 #endif 5196 } else { // team->t.t_nproc < new_nproc 5197 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5198 kmp_affin_mask_t *old_mask; 5199 if (KMP_AFFINITY_CAPABLE()) { 5200 KMP_CPU_ALLOC(old_mask); 5201 } 5202 #endif 5203 5204 KA_TRACE(20, 5205 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5206 new_nproc)); 5207 int old_nproc = team->t.t_nproc; // save old value and use to update only 5208 team->t.t_size_changed = 1; 5209 5210 #if KMP_NESTED_HOT_TEAMS 5211 int avail_threads = hot_teams[level].hot_team_nth; 5212 if (new_nproc < avail_threads) 5213 avail_threads = new_nproc; 5214 kmp_info_t **other_threads = team->t.t_threads; 5215 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5216 // Adjust barrier data of reserved threads (if any) of the 
team 5217 // Other data will be set in __kmp_initialize_info() below. 5218 int b; 5219 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5220 for (b = 0; b < bs_last_barrier; ++b) { 5221 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5222 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5223 #if USE_DEBUGGER 5224 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5225 #endif 5226 } 5227 } 5228 if (hot_teams[level].hot_team_nth >= new_nproc) { 5229 // we have all needed threads in reserve, no need to allocate any 5230 // this only possible in mode 1, cannot have reserved threads in mode 0 5231 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5232 team->t.t_nproc = new_nproc; // just get reserved threads involved 5233 } else { 5234 // We may have some threads in reserve, but not enough; 5235 // get reserved threads involved if any. 5236 team->t.t_nproc = hot_teams[level].hot_team_nth; 5237 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5238 #endif // KMP_NESTED_HOT_TEAMS 5239 if (team->t.t_max_nproc < new_nproc) { 5240 /* reallocate larger arrays */ 5241 __kmp_reallocate_team_arrays(team, new_nproc); 5242 __kmp_reinitialize_team(team, new_icvs, NULL); 5243 } 5244 5245 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5246 /* Temporarily set full mask for primary thread before creation of 5247 workers. The reason is that workers inherit the affinity from the 5248 primary thread, so if a lot of workers are created on the single 5249 core quickly, they don't get a chance to set their own affinity for 5250 a long time. */ 5251 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5252 #endif 5253 5254 /* allocate new threads for the hot team */ 5255 for (f = team->t.t_nproc; f < new_nproc; f++) { 5256 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5257 KMP_DEBUG_ASSERT(new_worker); 5258 team->t.t_threads[f] = new_worker; 5259 5260 KA_TRACE(20, 5261 ("__kmp_allocate_team: team %d init T#%d arrived: " 5262 "join=%llu, plain=%llu\n", 5263 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5264 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5265 team->t.t_bar[bs_plain_barrier].b_arrived)); 5266 5267 { // Initialize barrier data for new threads. 5268 int b; 5269 kmp_balign_t *balign = new_worker->th.th_bar; 5270 for (b = 0; b < bs_last_barrier; ++b) { 5271 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5272 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5273 KMP_BARRIER_PARENT_FLAG); 5274 #if USE_DEBUGGER 5275 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5276 #endif 5277 } 5278 } 5279 } 5280 5281 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5282 if (KMP_AFFINITY_CAPABLE()) { 5283 /* Restore initial primary thread's affinity mask */ 5284 __kmp_set_system_affinity(old_mask, TRUE); 5285 KMP_CPU_FREE(old_mask); 5286 } 5287 #endif 5288 #if KMP_NESTED_HOT_TEAMS 5289 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5290 #endif // KMP_NESTED_HOT_TEAMS 5291 if (__kmp_barrier_release_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5292 // Barrier size already increased earlier in this function 5293 // Activate team threads via th_used_in_team 5294 __kmp_add_threads_to_team(team, new_nproc); 5295 } 5296 /* make sure everyone is syncronized */ 5297 // new threads below 5298 __kmp_initialize_team(team, new_nproc, new_icvs, 5299 root->r.r_uber_thread->th.th_ident); 5300 5301 /* reinitialize the threads */ 5302 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5303 for (f = 0; f < team->t.t_nproc; ++f) 5304 __kmp_initialize_info(team->t.t_threads[f], team, f, 5305 __kmp_gtid_from_tid(f, team)); 5306 5307 if (level) { // set th_task_state for new threads in nested hot team 5308 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5309 // only need to set the th_task_state for the new threads. th_task_state 5310 // for primary thread will not be accurate until after this in 5311 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5312 // get the correct value. 5313 for (f = old_nproc; f < team->t.t_nproc; ++f) 5314 team->t.t_threads[f]->th.th_task_state = 5315 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5316 } else { // set th_task_state for new threads in non-nested hot team 5317 // copy primary thread's state 5318 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5319 for (f = old_nproc; f < team->t.t_nproc; ++f) 5320 team->t.t_threads[f]->th.th_task_state = old_state; 5321 } 5322 5323 #ifdef KMP_DEBUG 5324 for (f = 0; f < team->t.t_nproc; ++f) { 5325 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5326 team->t.t_threads[f]->th.th_team_nproc == 5327 team->t.t_nproc); 5328 } 5329 #endif 5330 5331 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5332 #if KMP_AFFINITY_SUPPORTED 5333 __kmp_partition_places(team); 5334 #endif 5335 } // Check changes in number of threads 5336 5337 kmp_info_t *master = team->t.t_threads[0]; 5338 if (master->th.th_teams_microtask) { 5339 for (f = 1; f < new_nproc; ++f) { 5340 // propagate teams construct specific info to workers 5341 kmp_info_t *thr = team->t.t_threads[f]; 5342 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5343 thr->th.th_teams_level = master->th.th_teams_level; 5344 thr->th.th_teams_size = master->th.th_teams_size; 5345 } 5346 } 5347 #if KMP_NESTED_HOT_TEAMS 5348 if (level) { 5349 // Sync barrier state for nested hot teams, not needed for outermost hot 5350 // team. 5351 for (f = 1; f < new_nproc; ++f) { 5352 kmp_info_t *thr = team->t.t_threads[f]; 5353 int b; 5354 kmp_balign_t *balign = thr->th.th_bar; 5355 for (b = 0; b < bs_last_barrier; ++b) { 5356 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5357 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5358 #if USE_DEBUGGER 5359 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5360 #endif 5361 } 5362 } 5363 } 5364 #endif // KMP_NESTED_HOT_TEAMS 5365 5366 /* reallocate space for arguments if necessary */ 5367 __kmp_alloc_argv_entries(argc, team, TRUE); 5368 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5369 // The hot team re-uses the previous task team, 5370 // if untouched during the previous release->gather phase. 
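// Illustrative aside (not part of the runtime, never compiled): a much
// simplified analogue of the nested hot-team bookkeeping used above, where a
// team is cached per nesting level (hot_teams[level].hot_team) and reused --
// resized up or down -- instead of being freed and reallocated around every
// parallel region. The types and names below are local to the sketch and only
// stand in for kmp_team_t / kmp_hot_team_ptr_t.
#if 0
struct sketch_team { int nthreads; };
struct sketch_hot_slot { sketch_team *team; int cached_nth; };

static sketch_team *sketch_get_hot_team(sketch_hot_slot *slots, int max_level,
                                        int level, int want_nth) {
  if (level < max_level && slots[level].team != nullptr) {
    sketch_team *t = slots[level].team; // reuse the cached ("hot") team,
    t->nthreads = want_nth;             // adjusting its size as needed
    return t;
  }
  sketch_team *t = new sketch_team{want_nth}; // otherwise build a fresh team
  if (level < max_level) {
    slots[level].team = t; // and remember it for the next region at this level
    slots[level].cached_nth = want_nth;
  }
  return t;
}
#endif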
5371 5372 KF_TRACE(10, (" hot_team = %p\n", team)); 5373 5374 #if KMP_DEBUG 5375 if (__kmp_tasking_mode != tskm_immediate_exec) { 5376 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5377 "task_team[1] = %p after reinit\n", 5378 team->t.t_task_team[0], team->t.t_task_team[1])); 5379 } 5380 #endif 5381 5382 #if OMPT_SUPPORT 5383 __ompt_team_assign_id(team, ompt_parallel_data); 5384 #endif 5385 5386 KMP_MB(); 5387 5388 return team; 5389 } 5390 5391 /* next, let's try to take one from the team pool */ 5392 KMP_MB(); 5393 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5394 /* TODO: consider resizing undersized teams instead of reaping them, now 5395 that we have a resizing mechanism */ 5396 if (team->t.t_max_nproc >= max_nproc) { 5397 /* take this team from the team pool */ 5398 __kmp_team_pool = team->t.t_next_pool; 5399 5400 if (max_nproc > 1 && 5401 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5402 if (!team->t.b) { // Allocate barrier structure 5403 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5404 } 5405 } 5406 5407 /* setup the team for fresh use */ 5408 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5409 5410 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5411 "task_team[1] %p to NULL\n", 5412 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5413 team->t.t_task_team[0] = NULL; 5414 team->t.t_task_team[1] = NULL; 5415 5416 /* reallocate space for arguments if necessary */ 5417 __kmp_alloc_argv_entries(argc, team, TRUE); 5418 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5419 5420 KA_TRACE( 5421 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5422 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5423 { // Initialize barrier data. 5424 int b; 5425 for (b = 0; b < bs_last_barrier; ++b) { 5426 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5427 #if USE_DEBUGGER 5428 team->t.t_bar[b].b_master_arrived = 0; 5429 team->t.t_bar[b].b_team_arrived = 0; 5430 #endif 5431 } 5432 } 5433 5434 team->t.t_proc_bind = new_proc_bind; 5435 5436 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5437 team->t.t_id)); 5438 5439 #if OMPT_SUPPORT 5440 __ompt_team_assign_id(team, ompt_parallel_data); 5441 #endif 5442 5443 KMP_MB(); 5444 5445 return team; 5446 } 5447 5448 /* reap team if it is too small, then loop back and check the next one */ 5449 // not sure if this is wise, but, will be redone during the hot-teams 5450 // rewrite. 5451 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5452 team = __kmp_reap_team(team); 5453 __kmp_team_pool = team; 5454 } 5455 5456 /* nothing available in the pool, no matter, make a new team! 
*/ 5457 KMP_MB(); 5458 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5459 5460 /* and set it up */ 5461 team->t.t_max_nproc = max_nproc; 5462 if (max_nproc > 1 && 5463 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5464 // Allocate barrier structure 5465 team->t.b = distributedBarrier::allocate(__kmp_dflt_team_nth_ub); 5466 } 5467 5468 /* NOTE well, for some reason allocating one big buffer and dividing it up 5469 seems to really hurt performance a lot on the P4, so, let's not use this */ 5470 __kmp_allocate_team_arrays(team, max_nproc); 5471 5472 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5473 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5474 5475 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5476 "%p to NULL\n", 5477 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5478 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5479 // memory, no need to duplicate 5480 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5481 // memory, no need to duplicate 5482 5483 if (__kmp_storage_map) { 5484 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5485 } 5486 5487 /* allocate space for arguments */ 5488 __kmp_alloc_argv_entries(argc, team, FALSE); 5489 team->t.t_argc = argc; 5490 5491 KA_TRACE(20, 5492 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5493 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5494 { // Initialize barrier data. 5495 int b; 5496 for (b = 0; b < bs_last_barrier; ++b) { 5497 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5498 #if USE_DEBUGGER 5499 team->t.t_bar[b].b_master_arrived = 0; 5500 team->t.t_bar[b].b_team_arrived = 0; 5501 #endif 5502 } 5503 } 5504 5505 team->t.t_proc_bind = new_proc_bind; 5506 5507 #if OMPT_SUPPORT 5508 __ompt_team_assign_id(team, ompt_parallel_data); 5509 team->t.ompt_serialized_team_info = NULL; 5510 #endif 5511 5512 KMP_MB(); 5513 5514 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5515 team->t.t_id)); 5516 5517 return team; 5518 } 5519 5520 /* TODO implement hot-teams at all levels */ 5521 /* TODO implement lazy thread release on demand (disband request) */ 5522 5523 /* free the team. return it to the team pool. release all the threads 5524 * associated with it */ 5525 void __kmp_free_team(kmp_root_t *root, 5526 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5527 int f; 5528 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5529 team->t.t_id)); 5530 5531 /* verify state */ 5532 KMP_DEBUG_ASSERT(root); 5533 KMP_DEBUG_ASSERT(team); 5534 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5535 KMP_DEBUG_ASSERT(team->t.t_threads); 5536 5537 int use_hot_team = team == root->r.r_hot_team; 5538 #if KMP_NESTED_HOT_TEAMS 5539 int level; 5540 if (master) { 5541 level = team->t.t_active_level - 1; 5542 if (master->th.th_teams_microtask) { // in teams construct? 
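// Illustrative aside (not part of the runtime, never compiled): the team-pool
// scan in __kmp_allocate_team above reuses any free team whose capacity
// (t_max_nproc) is large enough, reaps undersized teams as it encounters
// them, and only then falls back to allocating a fresh team. A minimal
// free-list analogue, with names local to the sketch:
#if 0
struct sketch_pool_team {
  int max_nproc;
  sketch_pool_team *next;
};

static sketch_pool_team *sketch_team_pool = nullptr;

static sketch_pool_team *sketch_allocate_team(int max_nproc) {
  while (sketch_team_pool != nullptr) {
    sketch_pool_team *t = sketch_team_pool;
    sketch_team_pool = t->next;
    if (t->max_nproc >= max_nproc) { // big enough: take it off the pool
      t->next = nullptr;
      return t;
    }
    delete t; // too small: reap it (the runtime calls __kmp_reap_team here)
  }
  return new sketch_pool_team{max_nproc, nullptr}; // pool empty: make a new one
}
#endif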
5543 if (master->th.th_teams_size.nteams > 1) { 5544 ++level; // level was not increased in teams construct for 5545 // team_of_masters 5546 } 5547 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5548 master->th.th_teams_level == team->t.t_level) { 5549 ++level; // level was not increased in teams construct for 5550 // team_of_workers before the parallel 5551 } // team->t.t_level will be increased inside parallel 5552 } 5553 #if KMP_DEBUG 5554 kmp_hot_team_ptr_t *hot_teams = master->th.th_hot_teams; 5555 #endif 5556 if (level < __kmp_hot_teams_max_level) { 5557 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5558 use_hot_team = 1; 5559 } 5560 } 5561 #endif // KMP_NESTED_HOT_TEAMS 5562 5563 /* team is done working */ 5564 TCW_SYNC_PTR(team->t.t_pkfn, 5565 NULL); // Important for Debugging Support Library. 5566 #if KMP_OS_WINDOWS 5567 team->t.t_copyin_counter = 0; // init counter for possible reuse 5568 #endif 5569 // Do not reset pointer to parent team to NULL for hot teams. 5570 5571 /* if we are non-hot team, release our threads */ 5572 if (!use_hot_team) { 5573 if (__kmp_tasking_mode != tskm_immediate_exec) { 5574 // Wait for threads to reach reapable state 5575 for (f = 1; f < team->t.t_nproc; ++f) { 5576 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5577 kmp_info_t *th = team->t.t_threads[f]; 5578 volatile kmp_uint32 *state = &th->th.th_reap_state; 5579 while (*state != KMP_SAFE_TO_REAP) { 5580 #if KMP_OS_WINDOWS 5581 // On Windows a thread can be killed at any time, check this 5582 DWORD ecode; 5583 if (!__kmp_is_thread_alive(th, &ecode)) { 5584 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5585 break; 5586 } 5587 #endif 5588 // first check if thread is sleeping 5589 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5590 if (fl.is_sleeping()) 5591 fl.resume(__kmp_gtid_from_thread(th)); 5592 KMP_CPU_PAUSE(); 5593 } 5594 } 5595 5596 // Delete task teams 5597 int tt_idx; 5598 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5599 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5600 if (task_team != NULL) { 5601 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5602 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5603 team->t.t_threads[f]->th.th_task_team = NULL; 5604 } 5605 KA_TRACE( 5606 20, 5607 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5608 __kmp_get_gtid(), task_team, team->t.t_id)); 5609 #if KMP_NESTED_HOT_TEAMS 5610 __kmp_free_task_team(master, task_team); 5611 #endif 5612 team->t.t_task_team[tt_idx] = NULL; 5613 } 5614 } 5615 } 5616 5617 // Reset pointer to parent team only for non-hot teams. 
5618 team->t.t_parent = NULL; 5619 team->t.t_level = 0; 5620 team->t.t_active_level = 0; 5621 5622 /* free the worker threads */ 5623 for (f = 1; f < team->t.t_nproc; ++f) { 5624 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5625 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5626 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 5627 1, 2); 5628 } 5629 __kmp_free_thread(team->t.t_threads[f]); 5630 } 5631 5632 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5633 if (team->t.b) { 5634 // wake up thread at old location 5635 team->t.b->go_release(); 5636 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5637 for (f = 1; f < team->t.t_nproc; ++f) { 5638 if (team->t.b->sleep[f].sleep) { 5639 __kmp_atomic_resume_64( 5640 team->t.t_threads[f]->th.th_info.ds.ds_gtid, 5641 (kmp_atomic_flag_64<> *)NULL); 5642 } 5643 } 5644 } 5645 // Wait for threads to be removed from team 5646 for (int f = 1; f < team->t.t_nproc; ++f) { 5647 while (team->t.t_threads[f]->th.th_used_in_team.load() != 0) 5648 KMP_CPU_PAUSE(); 5649 } 5650 } 5651 } 5652 5653 for (f = 1; f < team->t.t_nproc; ++f) { 5654 team->t.t_threads[f] = NULL; 5655 } 5656 5657 if (team->t.t_max_nproc > 1 && 5658 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 5659 distributedBarrier::deallocate(team->t.b); 5660 team->t.b = NULL; 5661 } 5662 /* put the team back in the team pool */ 5663 /* TODO limit size of team pool, call reap_team if pool too large */ 5664 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5665 __kmp_team_pool = (volatile kmp_team_t *)team; 5666 } else { // Check if team was created for primary threads in teams construct 5667 // See if first worker is a CG root 5668 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5669 team->t.t_threads[1]->th.th_cg_roots); 5670 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5671 // Clean up the CG root nodes on workers so that this team can be re-used 5672 for (f = 1; f < team->t.t_nproc; ++f) { 5673 kmp_info_t *thr = team->t.t_threads[f]; 5674 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5675 thr->th.th_cg_roots->cg_root == thr); 5676 // Pop current CG root off list 5677 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5678 thr->th.th_cg_roots = tmp->up; 5679 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5680 " up to node %p. cg_nthreads was %d\n", 5681 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5682 int i = tmp->cg_nthreads--; 5683 if (i == 1) { 5684 __kmp_free(tmp); // free CG if we are the last thread in it 5685 } 5686 // Restore current task's thread_limit from CG root 5687 if (thr->th.th_cg_roots) 5688 thr->th.th_current_task->td_icvs.thread_limit = 5689 thr->th.th_cg_roots->cg_thread_limit; 5690 } 5691 } 5692 } 5693 5694 KMP_MB(); 5695 } 5696 5697 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5698 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5699 kmp_team_t *next_pool = team->t.t_next_pool; 5700 5701 KMP_DEBUG_ASSERT(team); 5702 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5703 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5704 KMP_DEBUG_ASSERT(team->t.t_threads); 5705 KMP_DEBUG_ASSERT(team->t.t_argv); 5706 5707 /* TODO clean the threads that are a part of this? */ 5708 5709 /* free stuff */ 5710 __kmp_free_team_arrays(team); 5711 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5712 __kmp_free((void *)team->t.t_argv); 5713 __kmp_free(team); 5714 5715 KMP_MB(); 5716 return next_pool; 5717 } 5718 5719 // Free the thread. 
Don't reap it, just place it on the pool of available 5720 // threads. 5721 // 5722 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5723 // binding for the affinity mechanism to be useful. 5724 // 5725 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5726 // However, we want to avoid a potential performance problem by always 5727 // scanning through the list to find the correct point at which to insert 5728 // the thread (potential N**2 behavior). To do this we keep track of the 5729 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5730 // With single-level parallelism, threads will always be added to the tail 5731 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5732 // parallelism, all bets are off and we may need to scan through the entire 5733 // free list. 5734 // 5735 // This change also has a potentially large performance benefit, for some 5736 // applications. Previously, as threads were freed from the hot team, they 5737 // would be placed back on the free list in inverse order. If the hot team 5738 // grew back to it's original size, then the freed thread would be placed 5739 // back on the hot team in reverse order. This could cause bad cache 5740 // locality problems on programs where the size of the hot team regularly 5741 // grew and shrunk. 5742 // 5743 // Now, for single-level parallelism, the OMP tid is always == gtid. 5744 void __kmp_free_thread(kmp_info_t *this_th) { 5745 int gtid; 5746 kmp_info_t **scan; 5747 5748 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5749 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5750 5751 KMP_DEBUG_ASSERT(this_th); 5752 5753 // When moving thread to pool, switch thread to wait on own b_go flag, and 5754 // uninitialized (NULL team). 5755 int b; 5756 kmp_balign_t *balign = this_th->th.th_bar; 5757 for (b = 0; b < bs_last_barrier; ++b) { 5758 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5759 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5760 balign[b].bb.team = NULL; 5761 balign[b].bb.leaf_kids = 0; 5762 } 5763 this_th->th.th_task_state = 0; 5764 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5765 5766 /* put thread back on the free pool */ 5767 TCW_PTR(this_th->th.th_team, NULL); 5768 TCW_PTR(this_th->th.th_root, NULL); 5769 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5770 5771 while (this_th->th.th_cg_roots) { 5772 this_th->th.th_cg_roots->cg_nthreads--; 5773 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5774 " %p of thread %p to %d\n", 5775 this_th, this_th->th.th_cg_roots, 5776 this_th->th.th_cg_roots->cg_root, 5777 this_th->th.th_cg_roots->cg_nthreads)); 5778 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5779 if (tmp->cg_root == this_th) { // Thread is a cg_root 5780 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5781 KA_TRACE( 5782 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5783 this_th->th.th_cg_roots = tmp->up; 5784 __kmp_free(tmp); 5785 } else { // Worker thread 5786 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5787 __kmp_free(tmp); 5788 } 5789 this_th->th.th_cg_roots = NULL; 5790 break; 5791 } 5792 } 5793 5794 /* If the implicit task assigned to this thread can be used by other threads 5795 * -> multiple threads can share the data and try to free the task at 5796 * __kmp_reap_thread at exit. 
This duplicate use of the task data can happen 5797 * with higher probability when hot team is disabled but can occurs even when 5798 * the hot team is enabled */ 5799 __kmp_free_implicit_task(this_th); 5800 this_th->th.th_current_task = NULL; 5801 5802 // If the __kmp_thread_pool_insert_pt is already past the new insert 5803 // point, then we need to re-scan the entire list. 5804 gtid = this_th->th.th_info.ds.ds_gtid; 5805 if (__kmp_thread_pool_insert_pt != NULL) { 5806 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5807 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5808 __kmp_thread_pool_insert_pt = NULL; 5809 } 5810 } 5811 5812 // Scan down the list to find the place to insert the thread. 5813 // scan is the address of a link in the list, possibly the address of 5814 // __kmp_thread_pool itself. 5815 // 5816 // In the absence of nested parallelism, the for loop will have 0 iterations. 5817 if (__kmp_thread_pool_insert_pt != NULL) { 5818 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5819 } else { 5820 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5821 } 5822 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5823 scan = &((*scan)->th.th_next_pool)) 5824 ; 5825 5826 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5827 // to its address. 5828 TCW_PTR(this_th->th.th_next_pool, *scan); 5829 __kmp_thread_pool_insert_pt = *scan = this_th; 5830 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5831 (this_th->th.th_info.ds.ds_gtid < 5832 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5833 TCW_4(this_th->th.th_in_pool, TRUE); 5834 __kmp_suspend_initialize_thread(this_th); 5835 __kmp_lock_suspend_mx(this_th); 5836 if (this_th->th.th_active == TRUE) { 5837 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5838 this_th->th.th_active_in_pool = TRUE; 5839 } 5840 #if KMP_DEBUG 5841 else { 5842 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5843 } 5844 #endif 5845 __kmp_unlock_suspend_mx(this_th); 5846 5847 TCW_4(__kmp_nth, __kmp_nth - 1); 5848 5849 #ifdef KMP_ADJUST_BLOCKTIME 5850 /* Adjust blocktime back to user setting or default if necessary */ 5851 /* Middle initialization might never have occurred */ 5852 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5853 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5854 if (__kmp_nth <= __kmp_avail_proc) { 5855 __kmp_zero_bt = FALSE; 5856 } 5857 } 5858 #endif /* KMP_ADJUST_BLOCKTIME */ 5859 5860 KMP_MB(); 5861 } 5862 5863 /* ------------------------------------------------------------------------ */ 5864 5865 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5866 #if OMP_PROFILING_SUPPORT 5867 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5868 // TODO: add a configuration option for time granularity 5869 if (ProfileTraceFile) 5870 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5871 #endif 5872 5873 int gtid = this_thr->th.th_info.ds.ds_gtid; 5874 /* void *stack_data;*/ 5875 kmp_team_t **volatile pteam; 5876 5877 KMP_MB(); 5878 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5879 5880 if (__kmp_env_consistency_check) { 5881 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
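// Illustrative aside (not part of the runtime, never compiled): a minimal
// analogue of the sorted pool insertion performed by __kmp_free_thread above.
// The pool stays ordered by gtid, and __kmp_thread_pool_insert_pt caches the
// last insertion point so the common single-level case appends near the tail
// without rescanning from the head. Names below are local to the sketch.
#if 0
struct sketch_node {
  int gtid;
  sketch_node *next;
};

static sketch_node *sketch_pool_head = nullptr; // stands in for __kmp_thread_pool
static sketch_node *sketch_insert_pt = nullptr; // ... __kmp_thread_pool_insert_pt

static void sketch_pool_insert(sketch_node *n) {
  // If the cached point already lies past the new node, rescan from the head
  // (the nested-parallelism case noted in the comment above).
  if (sketch_insert_pt != nullptr && sketch_insert_pt->gtid > n->gtid)
    sketch_insert_pt = nullptr;

  sketch_node **scan =
      sketch_insert_pt ? &sketch_insert_pt->next : &sketch_pool_head;
  while (*scan != nullptr && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;

  n->next = *scan; // splice in, keeping the list sorted by gtid
  *scan = n;
  sketch_insert_pt = n; // remember where we stopped for the next insertion
}
#endif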
5882 } 5883 5884 #if OMPD_SUPPORT 5885 if (ompd_state & OMPD_ENABLE_BP) 5886 ompd_bp_thread_begin(); 5887 #endif 5888 5889 #if OMPT_SUPPORT 5890 ompt_data_t *thread_data = nullptr; 5891 if (ompt_enabled.enabled) { 5892 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5893 *thread_data = ompt_data_none; 5894 5895 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5896 this_thr->th.ompt_thread_info.wait_id = 0; 5897 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5898 this_thr->th.ompt_thread_info.parallel_flags = 0; 5899 if (ompt_enabled.ompt_callback_thread_begin) { 5900 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5901 ompt_thread_worker, thread_data); 5902 } 5903 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5904 } 5905 #endif 5906 5907 /* This is the place where threads wait for work */ 5908 while (!TCR_4(__kmp_global.g.g_done)) { 5909 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5910 KMP_MB(); 5911 5912 /* wait for work to do */ 5913 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5914 5915 /* No tid yet since not part of a team */ 5916 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5917 5918 #if OMPT_SUPPORT 5919 if (ompt_enabled.enabled) { 5920 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5921 } 5922 #endif 5923 5924 pteam = &this_thr->th.th_team; 5925 5926 /* have we been allocated? */ 5927 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5928 /* we were just woken up, so run our new task */ 5929 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5930 int rc; 5931 KA_TRACE(20, 5932 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5933 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5934 (*pteam)->t.t_pkfn)); 5935 5936 updateHWFPControl(*pteam); 5937 5938 #if OMPT_SUPPORT 5939 if (ompt_enabled.enabled) { 5940 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5941 } 5942 #endif 5943 5944 rc = (*pteam)->t.t_invoke(gtid); 5945 KMP_ASSERT(rc); 5946 5947 KMP_MB(); 5948 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5949 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5950 (*pteam)->t.t_pkfn)); 5951 } 5952 #if OMPT_SUPPORT 5953 if (ompt_enabled.enabled) { 5954 /* no frame set while outside task */ 5955 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5956 5957 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5958 } 5959 #endif 5960 /* join barrier after parallel region */ 5961 __kmp_join_barrier(gtid); 5962 } 5963 } 5964 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5965 5966 #if OMPD_SUPPORT 5967 if (ompd_state & OMPD_ENABLE_BP) 5968 ompd_bp_thread_end(); 5969 #endif 5970 5971 #if OMPT_SUPPORT 5972 if (ompt_enabled.ompt_callback_thread_end) { 5973 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5974 } 5975 #endif 5976 5977 this_thr->th.th_task_team = NULL; 5978 /* run the destructors for the threadprivate data for this thread */ 5979 __kmp_common_destroy_gtid(gtid); 5980 5981 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5982 KMP_MB(); 5983 5984 #if OMP_PROFILING_SUPPORT 5985 llvm::timeTraceProfilerFinishThread(); 5986 #endif 5987 return this_thr; 5988 } 5989 5990 /* ------------------------------------------------------------------------ */ 5991 5992 void __kmp_internal_end_dest(void *specific_gtid) { 5993 // Make sure no significant bits are lost 5994 int gtid; 5995 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5996 5997 KA_TRACE(30, 
("__kmp_internal_end_dest: T#%d\n", gtid)); 5998 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5999 * this is because 0 is reserved for the nothing-stored case */ 6000 6001 __kmp_internal_end_thread(gtid); 6002 } 6003 6004 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 6005 6006 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 6007 __kmp_internal_end_atexit(); 6008 } 6009 6010 #endif 6011 6012 /* [Windows] josh: when the atexit handler is called, there may still be more 6013 than one thread alive */ 6014 void __kmp_internal_end_atexit(void) { 6015 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 6016 /* [Windows] 6017 josh: ideally, we want to completely shutdown the library in this atexit 6018 handler, but stat code that depends on thread specific data for gtid fails 6019 because that data becomes unavailable at some point during the shutdown, so 6020 we call __kmp_internal_end_thread instead. We should eventually remove the 6021 dependency on __kmp_get_specific_gtid in the stat code and use 6022 __kmp_internal_end_library to cleanly shutdown the library. 6023 6024 // TODO: Can some of this comment about GVS be removed? 6025 I suspect that the offending stat code is executed when the calling thread 6026 tries to clean up a dead root thread's data structures, resulting in GVS 6027 code trying to close the GVS structures for that thread, but since the stat 6028 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 6029 the calling thread is cleaning up itself instead of another thread, it get 6030 confused. This happens because allowing a thread to unregister and cleanup 6031 another thread is a recent modification for addressing an issue. 6032 Based on the current design (20050722), a thread may end up 6033 trying to unregister another thread only if thread death does not trigger 6034 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 6035 thread specific data destructor function to detect thread death. For 6036 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 6037 is nothing. Thus, the workaround is applicable only for Windows static 6038 stat library. */ 6039 __kmp_internal_end_library(-1); 6040 #if KMP_OS_WINDOWS 6041 __kmp_close_console(); 6042 #endif 6043 } 6044 6045 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 6046 // It is assumed __kmp_forkjoin_lock is acquired. 6047 6048 int gtid; 6049 6050 KMP_DEBUG_ASSERT(thread != NULL); 6051 6052 gtid = thread->th.th_info.ds.ds_gtid; 6053 6054 if (!is_root) { 6055 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 6056 /* Assume the threads are at the fork barrier here */ 6057 KA_TRACE( 6058 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 6059 gtid)); 6060 if (__kmp_barrier_gather_pattern[bs_forkjoin_barrier] == bp_dist_bar) { 6061 while ( 6062 !KMP_COMPARE_AND_STORE_ACQ32(&(thread->th.th_used_in_team), 0, 3)) 6063 KMP_CPU_PAUSE(); 6064 __kmp_resume_32(gtid, (kmp_flag_32<false, false> *)NULL); 6065 } else { 6066 /* Need release fence here to prevent seg faults for tree forkjoin 6067 barrier (GEH) */ 6068 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 6069 thread); 6070 __kmp_release_64(&flag); 6071 } 6072 } 6073 6074 // Terminate OS thread. 6075 __kmp_reap_worker(thread); 6076 6077 // The thread was killed asynchronously. If it was actively 6078 // spinning in the thread pool, decrement the global count. 
6079 // 6080 // There is a small timing hole here - if the worker thread was just waking 6081 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 6082 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 6083 // the global counter might not get updated. 6084 // 6085 // Currently, this can only happen as the library is unloaded, 6086 // so there are no harmful side effects. 6087 if (thread->th.th_active_in_pool) { 6088 thread->th.th_active_in_pool = FALSE; 6089 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 6090 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 6091 } 6092 } 6093 6094 __kmp_free_implicit_task(thread); 6095 6096 // Free the fast memory for tasking 6097 #if USE_FAST_MEMORY 6098 __kmp_free_fast_memory(thread); 6099 #endif /* USE_FAST_MEMORY */ 6100 6101 __kmp_suspend_uninitialize_thread(thread); 6102 6103 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 6104 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 6105 6106 --__kmp_all_nth; 6107 // __kmp_nth was decremented when thread is added to the pool. 6108 6109 #ifdef KMP_ADJUST_BLOCKTIME 6110 /* Adjust blocktime back to user setting or default if necessary */ 6111 /* Middle initialization might never have occurred */ 6112 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6113 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6114 if (__kmp_nth <= __kmp_avail_proc) { 6115 __kmp_zero_bt = FALSE; 6116 } 6117 } 6118 #endif /* KMP_ADJUST_BLOCKTIME */ 6119 6120 /* free the memory being used */ 6121 if (__kmp_env_consistency_check) { 6122 if (thread->th.th_cons) { 6123 __kmp_free_cons_stack(thread->th.th_cons); 6124 thread->th.th_cons = NULL; 6125 } 6126 } 6127 6128 if (thread->th.th_pri_common != NULL) { 6129 __kmp_free(thread->th.th_pri_common); 6130 thread->th.th_pri_common = NULL; 6131 } 6132 6133 if (thread->th.th_task_state_memo_stack != NULL) { 6134 __kmp_free(thread->th.th_task_state_memo_stack); 6135 thread->th.th_task_state_memo_stack = NULL; 6136 } 6137 6138 #if KMP_USE_BGET 6139 if (thread->th.th_local.bget_data != NULL) { 6140 __kmp_finalize_bget(thread); 6141 } 6142 #endif 6143 6144 #if KMP_AFFINITY_SUPPORTED 6145 if (thread->th.th_affin_mask != NULL) { 6146 KMP_CPU_FREE(thread->th.th_affin_mask); 6147 thread->th.th_affin_mask = NULL; 6148 } 6149 #endif /* KMP_AFFINITY_SUPPORTED */ 6150 6151 #if KMP_USE_HIER_SCHED 6152 if (thread->th.th_hier_bar_data != NULL) { 6153 __kmp_free(thread->th.th_hier_bar_data); 6154 thread->th.th_hier_bar_data = NULL; 6155 } 6156 #endif 6157 6158 __kmp_reap_team(thread->th.th_serial_team); 6159 thread->th.th_serial_team = NULL; 6160 __kmp_free(thread); 6161 6162 KMP_MB(); 6163 6164 } // __kmp_reap_thread 6165 6166 static void __kmp_internal_end(void) { 6167 int i; 6168 6169 /* First, unregister the library */ 6170 __kmp_unregister_library(); 6171 6172 #if KMP_OS_WINDOWS 6173 /* In Win static library, we can't tell when a root actually dies, so we 6174 reclaim the data structures for any root threads that have died but not 6175 unregistered themselves, in order to shut down cleanly. 6176 In Win dynamic library we also can't tell when a thread dies. */ 6177 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6178 // dead roots 6179 #endif 6180 6181 for (i = 0; i < __kmp_threads_capacity; i++) 6182 if (__kmp_root[i]) 6183 if (__kmp_root[i]->r.r_active) 6184 break; 6185 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6186 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6187 6188 if (i < __kmp_threads_capacity) { 6189 #if KMP_USE_MONITOR 6190 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6191 KMP_MB(); /* Flush all pending memory write invalidates. */ 6192 6193 // Need to check that monitor was initialized before reaping it. If we are 6194 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6195 // __kmp_monitor will appear to contain valid data, but it is only valid in 6196 // the parent process, not the child. 6197 // New behavior (201008): instead of keying off of the flag 6198 // __kmp_init_parallel, the monitor thread creation is keyed off 6199 // of the new flag __kmp_init_monitor. 6200 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6201 if (TCR_4(__kmp_init_monitor)) { 6202 __kmp_reap_monitor(&__kmp_monitor); 6203 TCW_4(__kmp_init_monitor, 0); 6204 } 6205 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6206 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6207 #endif // KMP_USE_MONITOR 6208 } else { 6209 /* TODO move this to cleanup code */ 6210 #ifdef KMP_DEBUG 6211 /* make sure that everything has properly ended */ 6212 for (i = 0; i < __kmp_threads_capacity; i++) { 6213 if (__kmp_root[i]) { 6214 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6215 // there can be uber threads alive here 6216 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6217 } 6218 } 6219 #endif 6220 6221 KMP_MB(); 6222 6223 // Reap the worker threads. 6224 // This is valid for now, but be careful if threads are reaped sooner. 6225 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6226 // Get the next thread from the pool. 6227 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6228 __kmp_thread_pool = thread->th.th_next_pool; 6229 // Reap it. 6230 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6231 thread->th.th_next_pool = NULL; 6232 thread->th.th_in_pool = FALSE; 6233 __kmp_reap_thread(thread, 0); 6234 } 6235 __kmp_thread_pool_insert_pt = NULL; 6236 6237 // Reap teams. 6238 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6239 // Get the next team from the pool. 6240 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6241 __kmp_team_pool = team->t.t_next_pool; 6242 // Reap it. 6243 team->t.t_next_pool = NULL; 6244 __kmp_reap_team(team); 6245 } 6246 6247 __kmp_reap_task_teams(); 6248 6249 #if KMP_OS_UNIX 6250 // Threads that are not reaped should not access any resources since they 6251 // are going to be deallocated soon, so the shutdown sequence should wait 6252 // until all threads either exit the final spin-waiting loop or begin 6253 // sleeping after the given blocktime. 6254 for (i = 0; i < __kmp_threads_capacity; i++) { 6255 kmp_info_t *thr = __kmp_threads[i]; 6256 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6257 KMP_CPU_PAUSE(); 6258 } 6259 #endif 6260 6261 for (i = 0; i < __kmp_threads_capacity; ++i) { 6262 // TBD: Add some checking... 6263 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6264 } 6265 6266 /* Make sure all threadprivate destructors get run by joining with all 6267 worker threads before resetting this flag */ 6268 TCW_SYNC_4(__kmp_init_common, FALSE); 6269 6270 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6271 KMP_MB(); 6272 6273 #if KMP_USE_MONITOR 6274 // See note above: One of the possible fixes for CQ138434 / CQ140126 6275 // 6276 // FIXME: push both code fragments down and CSE them? 
6277 // push them into __kmp_cleanup() ? 6278 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6279 if (TCR_4(__kmp_init_monitor)) { 6280 __kmp_reap_monitor(&__kmp_monitor); 6281 TCW_4(__kmp_init_monitor, 0); 6282 } 6283 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6284 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6285 #endif 6286 } /* else !__kmp_global.t_active */ 6287 TCW_4(__kmp_init_gtid, FALSE); 6288 KMP_MB(); /* Flush all pending memory write invalidates. */ 6289 6290 __kmp_cleanup(); 6291 #if OMPT_SUPPORT 6292 ompt_fini(); 6293 #endif 6294 } 6295 6296 void __kmp_internal_end_library(int gtid_req) { 6297 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6298 /* this shouldn't be a race condition because __kmp_internal_end() is the 6299 only place to clear __kmp_serial_init */ 6300 /* we'll check this later too, after we get the lock */ 6301 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6302 // redundant, because the next check will work in any case. 6303 if (__kmp_global.g.g_abort) { 6304 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6305 /* TODO abort? */ 6306 return; 6307 } 6308 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6309 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6310 return; 6311 } 6312 6313 // If hidden helper team has been initialized, we need to deinit it 6314 if (TCR_4(__kmp_init_hidden_helper) && 6315 !TCR_4(__kmp_hidden_helper_team_done)) { 6316 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6317 // First release the main thread to let it continue its work 6318 __kmp_hidden_helper_main_thread_release(); 6319 // Wait until the hidden helper team has been destroyed 6320 __kmp_hidden_helper_threads_deinitz_wait(); 6321 } 6322 6323 KMP_MB(); /* Flush all pending memory write invalidates. */ 6324 /* find out who we are and what we should do */ 6325 { 6326 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6327 KA_TRACE( 6328 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6329 if (gtid == KMP_GTID_SHUTDOWN) { 6330 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6331 "already shutdown\n")); 6332 return; 6333 } else if (gtid == KMP_GTID_MONITOR) { 6334 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6335 "registered, or system shutdown\n")); 6336 return; 6337 } else if (gtid == KMP_GTID_DNE) { 6338 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6339 "shutdown\n")); 6340 /* we don't know who we are, but we may still shutdown the library */ 6341 } else if (KMP_UBER_GTID(gtid)) { 6342 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6343 if (__kmp_root[gtid]->r.r_active) { 6344 __kmp_global.g.g_abort = -1; 6345 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6346 __kmp_unregister_library(); 6347 KA_TRACE(10, 6348 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6349 gtid)); 6350 return; 6351 } else { 6352 KA_TRACE( 6353 10, 6354 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6355 __kmp_unregister_root_current_thread(gtid); 6356 } 6357 } else { 6358 /* worker threads may call this function through the atexit handler, if they 6359 * call exit() */ 6360 /* For now, skip the usual subsequent processing and just dump the debug buffer. 
6361 TODO: do a thorough shutdown instead */ 6362 #ifdef DUMP_DEBUG_ON_EXIT 6363 if (__kmp_debug_buf) 6364 __kmp_dump_debug_buffer(); 6365 #endif 6366 // added unregister library call here when we switch to shm linux 6367 // if we don't, it will leave lots of files in /dev/shm 6368 // cleanup shared memory file before exiting. 6369 __kmp_unregister_library(); 6370 return; 6371 } 6372 } 6373 /* synchronize the termination process */ 6374 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6375 6376 /* have we already finished */ 6377 if (__kmp_global.g.g_abort) { 6378 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6379 /* TODO abort? */ 6380 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6381 return; 6382 } 6383 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6384 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6385 return; 6386 } 6387 6388 /* We need this lock to enforce mutex between this reading of 6389 __kmp_threads_capacity and the writing by __kmp_register_root. 6390 Alternatively, we can use a counter of roots that is atomically updated by 6391 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6392 __kmp_internal_end_*. */ 6393 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6394 6395 /* now we can safely conduct the actual termination */ 6396 __kmp_internal_end(); 6397 6398 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6399 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6400 6401 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6402 6403 #ifdef DUMP_DEBUG_ON_EXIT 6404 if (__kmp_debug_buf) 6405 __kmp_dump_debug_buffer(); 6406 #endif 6407 6408 #if KMP_OS_WINDOWS 6409 __kmp_close_console(); 6410 #endif 6411 6412 __kmp_fini_allocator(); 6413 6414 } // __kmp_internal_end_library 6415 6416 void __kmp_internal_end_thread(int gtid_req) { 6417 int i; 6418 6419 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6420 /* this shouldn't be a race condition because __kmp_internal_end() is the 6421 * only place to clear __kmp_serial_init */ 6422 /* we'll check this later too, after we get the lock */ 6423 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6424 // redundant, because the next check will work in any case. 6425 if (__kmp_global.g.g_abort) { 6426 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6427 /* TODO abort? */ 6428 return; 6429 } 6430 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6431 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6432 return; 6433 } 6434 6435 // If hidden helper team has been initialized, we need to deinit it 6436 if (TCR_4(__kmp_init_hidden_helper) && 6437 !TCR_4(__kmp_hidden_helper_team_done)) { 6438 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6439 // First release the main thread to let it continue its work 6440 __kmp_hidden_helper_main_thread_release(); 6441 // Wait until the hidden helper team has been destroyed 6442 __kmp_hidden_helper_threads_deinitz_wait(); 6443 } 6444 6445 KMP_MB(); /* Flush all pending memory write invalidates. */ 6446 6447 /* find out who we are and what we should do */ 6448 { 6449 int gtid = (gtid_req >= 0) ? 
gtid_req : __kmp_gtid_get_specific(); 6450 KA_TRACE(10, 6451 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6452 if (gtid == KMP_GTID_SHUTDOWN) { 6453 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6454 "already shutdown\n")); 6455 return; 6456 } else if (gtid == KMP_GTID_MONITOR) { 6457 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6458 "registered, or system shutdown\n")); 6459 return; 6460 } else if (gtid == KMP_GTID_DNE) { 6461 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6462 "shutdown\n")); 6463 return; 6464 /* we don't know who we are */ 6465 } else if (KMP_UBER_GTID(gtid)) { 6466 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6467 if (__kmp_root[gtid]->r.r_active) { 6468 __kmp_global.g.g_abort = -1; 6469 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6470 KA_TRACE(10, 6471 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6472 gtid)); 6473 return; 6474 } else { 6475 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6476 gtid)); 6477 __kmp_unregister_root_current_thread(gtid); 6478 } 6479 } else { 6480 /* just a worker thread, let's leave */ 6481 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6482 6483 if (gtid >= 0) { 6484 __kmp_threads[gtid]->th.th_task_team = NULL; 6485 } 6486 6487 KA_TRACE(10, 6488 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6489 gtid)); 6490 return; 6491 } 6492 } 6493 #if KMP_DYNAMIC_LIB 6494 if (__kmp_pause_status != kmp_hard_paused) 6495 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6496 // because we will better shutdown later in the library destructor. 6497 { 6498 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6499 return; 6500 } 6501 #endif 6502 /* synchronize the termination process */ 6503 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6504 6505 /* have we already finished */ 6506 if (__kmp_global.g.g_abort) { 6507 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6508 /* TODO abort? */ 6509 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6510 return; 6511 } 6512 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6513 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6514 return; 6515 } 6516 6517 /* We need this lock to enforce mutex between this reading of 6518 __kmp_threads_capacity and the writing by __kmp_register_root. 6519 Alternatively, we can use a counter of roots that is atomically updated by 6520 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6521 __kmp_internal_end_*. */ 6522 6523 /* should we finish the run-time? are all siblings done? 
*/ 6524 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6525 6526 for (i = 0; i < __kmp_threads_capacity; ++i) { 6527 if (KMP_UBER_GTID(i)) { 6528 KA_TRACE( 6529 10, 6530 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6531 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6532 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6533 return; 6534 } 6535 } 6536 6537 /* now we can safely conduct the actual termination */ 6538 6539 __kmp_internal_end(); 6540 6541 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6542 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6543 6544 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6545 6546 #ifdef DUMP_DEBUG_ON_EXIT 6547 if (__kmp_debug_buf) 6548 __kmp_dump_debug_buffer(); 6549 #endif 6550 } // __kmp_internal_end_thread 6551 6552 // ----------------------------------------------------------------------------- 6553 // Library registration stuff. 6554 6555 static long __kmp_registration_flag = 0; 6556 // Random value used to indicate library initialization. 6557 static char *__kmp_registration_str = NULL; 6558 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6559 6560 static inline char *__kmp_reg_status_name() { 6561 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6562 each thread. If registration and unregistration go in different threads 6563 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6564 env var can not be found, because the name will contain different pid. */ 6565 // macOS* complains about name being too long with additional getuid() 6566 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6567 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6568 (int)getuid()); 6569 #else 6570 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6571 #endif 6572 } // __kmp_reg_status_get 6573 6574 void __kmp_register_library_startup(void) { 6575 6576 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6577 int done = 0; 6578 union { 6579 double dtime; 6580 long ltime; 6581 } time; 6582 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6583 __kmp_initialize_system_tick(); 6584 #endif 6585 __kmp_read_system_time(&time.dtime); 6586 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6587 __kmp_registration_str = 6588 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6589 __kmp_registration_flag, KMP_LIBRARY_FILE); 6590 6591 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6592 __kmp_registration_str)); 6593 6594 while (!done) { 6595 6596 char *value = NULL; // Actual value of the environment variable. 6597 6598 #if defined(KMP_USE_SHM) 6599 char *shm_name = __kmp_str_format("/%s", name); 6600 int shm_preexist = 0; 6601 char *data1; 6602 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6603 if ((fd1 == -1) && (errno == EEXIST)) { 6604 // file didn't open because it already exists. 6605 // try opening existing file 6606 fd1 = shm_open(shm_name, O_RDWR, 0666); 6607 if (fd1 == -1) { // file didn't open 6608 // error out here 6609 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6610 __kmp_msg_null); 6611 } else { 6612 // able to open existing file 6613 shm_preexist = 1; 6614 } 6615 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6616 // already exists. 6617 // error out here. 
6618 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6619 __kmp_msg_null); 6620 } 6621 if (shm_preexist == 0) { 6622 // we created SHM now set size 6623 if (ftruncate(fd1, SHM_SIZE) == -1) { 6624 // error occured setting size; 6625 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6626 KMP_ERR(errno), __kmp_msg_null); 6627 } 6628 } 6629 data1 = 6630 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6631 if (data1 == MAP_FAILED) { 6632 // failed to map shared memory 6633 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6634 __kmp_msg_null); 6635 } 6636 if (shm_preexist == 0) { // set data to SHM, set value 6637 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6638 } 6639 // Read value from either what we just wrote or existing file. 6640 value = __kmp_str_format("%s", data1); // read value from SHM 6641 munmap(data1, SHM_SIZE); 6642 close(fd1); 6643 #else // Windows and unix with static library 6644 // Set environment variable, but do not overwrite if it is exist. 6645 __kmp_env_set(name, __kmp_registration_str, 0); 6646 // read value to see if it got set 6647 value = __kmp_env_get(name); 6648 #endif 6649 6650 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6651 done = 1; // Ok, environment variable set successfully, exit the loop. 6652 } else { 6653 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6654 // Check whether it alive or dead. 6655 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6656 char *tail = value; 6657 char *flag_addr_str = NULL; 6658 char *flag_val_str = NULL; 6659 char const *file_name = NULL; 6660 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6661 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6662 file_name = tail; 6663 if (tail != NULL) { 6664 unsigned long *flag_addr = 0; 6665 unsigned long flag_val = 0; 6666 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6667 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6668 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6669 // First, check whether environment-encoded address is mapped into 6670 // addr space. 6671 // If so, dereference it to see if it still has the right value. 6672 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6673 neighbor = 1; 6674 } else { 6675 // If not, then we know the other copy of the library is no longer 6676 // running. 6677 neighbor = 2; 6678 } 6679 } 6680 } 6681 switch (neighbor) { 6682 case 0: // Cannot parse environment variable -- neighbor status unknown. 6683 // Assume it is the incompatible format of future version of the 6684 // library. Assume the other library is alive. 6685 // WARN( ... ); // TODO: Issue a warning. 6686 file_name = "unknown library"; 6687 KMP_FALLTHROUGH(); 6688 // Attention! Falling to the next case. That's intentional. 6689 case 1: { // Neighbor is alive. 6690 // Check it is allowed. 6691 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6692 if (!__kmp_str_match_true(duplicate_ok)) { 6693 // That's not allowed. Issue fatal error. 6694 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6695 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6696 } 6697 KMP_INTERNAL_FREE(duplicate_ok); 6698 __kmp_duplicate_library_ok = 1; 6699 done = 1; // Exit the loop. 6700 } break; 6701 case 2: { // Neighbor is dead. 6702 6703 #if defined(KMP_USE_SHM) 6704 // close shared memory. 
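/* [Editor's note -- illustrative sketch, not part of the runtime] The
   registration value written above has the form "%p-%lx-%s": the address of
   this copy's __kmp_registration_flag, the flag's value, and the library file
   name. A neighbor is considered alive only if that address is still mapped
   in the current process and still holds the recorded value; otherwise the
   stale shared-memory segment left by the dead copy is unlinked below so the
   next loop iteration can register cleanly. A minimal standalone sketch of
   the create-or-attach pattern used earlier in this function, assuming POSIX
   shm_open()/ftruncate(); the helper name is hypothetical: */
#if 0
// Needs <sys/mman.h>, <sys/stat.h>, <fcntl.h>, <unistd.h>, <errno.h>.
static int example_create_or_attach_shm(const char *name, size_t size,
                                        int *preexisted) {
  *preexisted = 0;
  int fd = shm_open(name, O_CREAT | O_EXCL | O_RDWR, 0666); // try to create
  if (fd == -1 && errno == EEXIST) { // lost the race: attach to existing one
    fd = shm_open(name, O_RDWR, 0666);
    *preexisted = 1;
  }
  if (fd != -1 && !*preexisted && ftruncate(fd, (off_t)size) == -1) {
    close(fd); // could not size the freshly created segment
    return -1;
  }
  return fd; // caller mmap()s the segment and eventually close()s the fd
}
#endif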
6705 shm_unlink(shm_name); // this removes file in /dev/shm 6706 #else 6707 // Clear the variable and try to register library again. 6708 __kmp_env_unset(name); 6709 #endif 6710 } break; 6711 default: { 6712 KMP_DEBUG_ASSERT(0); 6713 } break; 6714 } 6715 } 6716 KMP_INTERNAL_FREE((void *)value); 6717 #if defined(KMP_USE_SHM) 6718 KMP_INTERNAL_FREE((void *)shm_name); 6719 #endif 6720 } // while 6721 KMP_INTERNAL_FREE((void *)name); 6722 6723 } // func __kmp_register_library_startup 6724 6725 void __kmp_unregister_library(void) { 6726 6727 char *name = __kmp_reg_status_name(); 6728 char *value = NULL; 6729 6730 #if defined(KMP_USE_SHM) 6731 char *shm_name = __kmp_str_format("/%s", name); 6732 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6733 if (fd1 == -1) { 6734 // file did not open. return. 6735 return; 6736 } 6737 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6738 if (data1 != MAP_FAILED) { 6739 value = __kmp_str_format("%s", data1); // read value from SHM 6740 munmap(data1, SHM_SIZE); 6741 } 6742 close(fd1); 6743 #else 6744 value = __kmp_env_get(name); 6745 #endif 6746 6747 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6748 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6749 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6750 // Ok, this is our variable. Delete it. 6751 #if defined(KMP_USE_SHM) 6752 shm_unlink(shm_name); // this removes file in /dev/shm 6753 #else 6754 __kmp_env_unset(name); 6755 #endif 6756 } 6757 6758 #if defined(KMP_USE_SHM) 6759 KMP_INTERNAL_FREE(shm_name); 6760 #endif 6761 6762 KMP_INTERNAL_FREE(__kmp_registration_str); 6763 KMP_INTERNAL_FREE(value); 6764 KMP_INTERNAL_FREE(name); 6765 6766 __kmp_registration_flag = 0; 6767 __kmp_registration_str = NULL; 6768 6769 } // __kmp_unregister_library 6770 6771 // End of Library registration stuff. 6772 // ----------------------------------------------------------------------------- 6773 6774 #if KMP_MIC_SUPPORTED 6775 6776 static void __kmp_check_mic_type() { 6777 kmp_cpuid_t cpuid_state = {0}; 6778 kmp_cpuid_t *cs_p = &cpuid_state; 6779 __kmp_x86_cpuid(1, 0, cs_p); 6780 // We don't support mic1 at the moment 6781 if ((cs_p->eax & 0xff0) == 0xB10) { 6782 __kmp_mic_type = mic2; 6783 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6784 __kmp_mic_type = mic3; 6785 } else { 6786 __kmp_mic_type = non_mic; 6787 } 6788 } 6789 6790 #endif /* KMP_MIC_SUPPORTED */ 6791 6792 #if KMP_HAVE_UMWAIT 6793 static void __kmp_user_level_mwait_init() { 6794 struct kmp_cpuid buf; 6795 __kmp_x86_cpuid(7, 0, &buf); 6796 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6797 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6798 __kmp_umwait_enabled)); 6799 } 6800 #elif KMP_HAVE_MWAIT 6801 #ifndef AT_INTELPHIUSERMWAIT 6802 // Spurious, non-existent value that should always fail to return anything. 6803 // Will be replaced with the correct value when we know that. 6804 #define AT_INTELPHIUSERMWAIT 10000 6805 #endif 6806 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6807 // earlier OS is used to build the RTL, we'll use the following internal 6808 // function when the entry is not found. 6809 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6810 unsigned long getauxval(unsigned long) { return 0; } 6811 6812 static void __kmp_user_level_mwait_init() { 6813 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6814 // use them to find if the user-level mwait is enabled. 
Otherwise, forcibly 6815 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6816 // KMP_USER_LEVEL_MWAIT was set to TRUE. 6817 if (__kmp_mic_type == mic3) { 6818 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6819 if ((res & 0x1) || __kmp_user_level_mwait) { 6820 __kmp_mwait_enabled = TRUE; 6821 if (__kmp_user_level_mwait) { 6822 KMP_INFORM(EnvMwaitWarn); 6823 } 6824 } else { 6825 __kmp_mwait_enabled = FALSE; 6826 } 6827 } 6828 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6829 "__kmp_mwait_enabled = %d\n", 6830 __kmp_mic_type, __kmp_mwait_enabled)); 6831 } 6832 #endif /* KMP_HAVE_UMWAIT */ 6833 6834 static void __kmp_do_serial_initialize(void) { 6835 int i, gtid; 6836 size_t size; 6837 6838 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6839 6840 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6841 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6842 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6843 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6844 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6845 6846 #if OMPT_SUPPORT 6847 ompt_pre_init(); 6848 #endif 6849 #if OMPD_SUPPORT 6850 __kmp_env_dump(); 6851 ompd_init(); 6852 #endif 6853 6854 __kmp_validate_locks(); 6855 6856 /* Initialize internal memory allocator */ 6857 __kmp_init_allocator(); 6858 6859 /* Register the library startup via an environment variable and check to see 6860 whether another copy of the library is already registered. */ 6861 6862 __kmp_register_library_startup(); 6863 6864 /* TODO reinitialization of library */ 6865 if (TCR_4(__kmp_global.g.g_done)) { 6866 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6867 } 6868 6869 __kmp_global.g.g_abort = 0; 6870 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6871 6872 /* initialize the locks */ 6873 #if KMP_USE_ADAPTIVE_LOCKS 6874 #if KMP_DEBUG_ADAPTIVE_LOCKS 6875 __kmp_init_speculative_stats(); 6876 #endif 6877 #endif 6878 #if KMP_STATS_ENABLED 6879 __kmp_stats_init(); 6880 #endif 6881 __kmp_init_lock(&__kmp_global_lock); 6882 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6883 __kmp_init_lock(&__kmp_debug_lock); 6884 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6885 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6886 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6887 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6888 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6889 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6890 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6891 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6892 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6893 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6894 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6895 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6896 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6897 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6898 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6899 #if KMP_USE_MONITOR 6900 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6901 #endif 6902 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6903 6904 /* conduct initialization and initial setup of configuration */ 6905 6906 __kmp_runtime_initialize(); 6907 6908 #if KMP_MIC_SUPPORTED 6909 __kmp_check_mic_type(); 6910 #endif 6911 6912 // Some global variable initialization moved here from kmp_env_initialize() 6913 #ifdef KMP_DEBUG 6914 kmp_diag = 0; 6915 #endif 6916 __kmp_abort_delay = 0; 6917 6918 // From __kmp_init_dflt_team_nth() 6919 /* assume the entire machine will be used */ 6920 __kmp_dflt_team_nth_ub = 
__kmp_xproc; 6921 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6922 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6923 } 6924 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6925 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6926 } 6927 __kmp_max_nth = __kmp_sys_max_nth; 6928 __kmp_cg_max_nth = __kmp_sys_max_nth; 6929 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6930 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6931 __kmp_teams_max_nth = __kmp_sys_max_nth; 6932 } 6933 6934 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6935 // part 6936 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6937 #if KMP_USE_MONITOR 6938 __kmp_monitor_wakeups = 6939 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6940 __kmp_bt_intervals = 6941 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6942 #endif 6943 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6944 __kmp_library = library_throughput; 6945 // From KMP_SCHEDULE initialization 6946 __kmp_static = kmp_sch_static_balanced; 6947 // AC: do not use analytical here, because it is non-monotonous 6948 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6949 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6950 // need to repeat assignment 6951 // Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch 6952 // bit control and barrier method control parts 6953 #if KMP_FAST_REDUCTION_BARRIER 6954 #define kmp_reduction_barrier_gather_bb ((int)1) 6955 #define kmp_reduction_barrier_release_bb ((int)1) 6956 #define kmp_reduction_barrier_gather_pat __kmp_barrier_gather_pat_dflt 6957 #define kmp_reduction_barrier_release_pat __kmp_barrier_release_pat_dflt 6958 #endif // KMP_FAST_REDUCTION_BARRIER 6959 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6960 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6961 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6962 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6963 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6964 #if KMP_FAST_REDUCTION_BARRIER 6965 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6966 // lin_64 ): hyper,1 6967 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6968 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6969 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6970 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6971 } 6972 #endif // KMP_FAST_REDUCTION_BARRIER 6973 } 6974 #if KMP_FAST_REDUCTION_BARRIER 6975 #undef kmp_reduction_barrier_release_pat 6976 #undef kmp_reduction_barrier_gather_pat 6977 #undef kmp_reduction_barrier_release_bb 6978 #undef kmp_reduction_barrier_gather_bb 6979 #endif // KMP_FAST_REDUCTION_BARRIER 6980 #if KMP_MIC_SUPPORTED 6981 if (__kmp_mic_type == mic2) { // KNC 6982 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6983 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6984 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6985 1; // forkjoin release 6986 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6987 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6988 } 6989 #if KMP_FAST_REDUCTION_BARRIER 6990 if (__kmp_mic_type == mic2) { // KNC 6991 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6992 
__kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6993 } 6994 #endif // KMP_FAST_REDUCTION_BARRIER 6995 #endif // KMP_MIC_SUPPORTED 6996 6997 // From KMP_CHECKS initialization 6998 #ifdef KMP_DEBUG 6999 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 7000 #else 7001 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 7002 #endif 7003 7004 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 7005 __kmp_foreign_tp = TRUE; 7006 7007 __kmp_global.g.g_dynamic = FALSE; 7008 __kmp_global.g.g_dynamic_mode = dynamic_default; 7009 7010 __kmp_init_nesting_mode(); 7011 7012 __kmp_env_initialize(NULL); 7013 7014 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 7015 __kmp_user_level_mwait_init(); 7016 #endif 7017 // Print all messages in message catalog for testing purposes. 7018 #ifdef KMP_DEBUG 7019 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 7020 if (__kmp_str_match_true(val)) { 7021 kmp_str_buf_t buffer; 7022 __kmp_str_buf_init(&buffer); 7023 __kmp_i18n_dump_catalog(&buffer); 7024 __kmp_printf("%s", buffer.str); 7025 __kmp_str_buf_free(&buffer); 7026 } 7027 __kmp_env_free(&val); 7028 #endif 7029 7030 __kmp_threads_capacity = 7031 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 7032 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 7033 __kmp_tp_capacity = __kmp_default_tp_capacity( 7034 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 7035 7036 // If the library is shut down properly, both pools must be NULL. Just in 7037 // case, set them to NULL -- some memory may leak, but subsequent code will 7038 // work even if pools are not freed. 7039 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 7040 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 7041 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 7042 __kmp_thread_pool = NULL; 7043 __kmp_thread_pool_insert_pt = NULL; 7044 __kmp_team_pool = NULL; 7045 7046 /* Allocate all of the variable sized records */ 7047 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 7048 * expandable */ 7049 /* Since allocation is cache-aligned, just add extra padding at the end */ 7050 size = 7051 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 7052 CACHE_LINE; 7053 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 7054 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 7055 sizeof(kmp_info_t *) * __kmp_threads_capacity); 7056 7057 /* init thread counts */ 7058 KMP_DEBUG_ASSERT(__kmp_all_nth == 7059 0); // Asserts fail if the library is reinitializing and 7060 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 7061 __kmp_all_nth = 0; 7062 __kmp_nth = 0; 7063 7064 /* setup the uber master thread and hierarchy */ 7065 gtid = __kmp_register_root(TRUE); 7066 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 7067 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7068 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 7069 7070 KMP_MB(); /* Flush all pending memory write invalidates. */ 7071 7072 __kmp_common_initialize(); 7073 7074 #if KMP_OS_UNIX 7075 /* invoke the child fork handler */ 7076 __kmp_register_atfork(); 7077 #endif 7078 7079 #if !KMP_DYNAMIC_LIB 7080 { 7081 /* Invoke the exit handler when the program finishes, only for static 7082 library. For dynamic library, we already have _fini and DllMain. 
*/ 7083 int rc = atexit(__kmp_internal_end_atexit); 7084 if (rc != 0) { 7085 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 7086 __kmp_msg_null); 7087 } 7088 } 7089 #endif 7090 7091 #if KMP_HANDLE_SIGNALS 7092 #if KMP_OS_UNIX 7093 /* NOTE: make sure that this is called before the user installs their own 7094 signal handlers so that the user handlers are called first. this way they 7095 can return false, not call our handler, avoid terminating the library, and 7096 continue execution where they left off. */ 7097 __kmp_install_signals(FALSE); 7098 #endif /* KMP_OS_UNIX */ 7099 #if KMP_OS_WINDOWS 7100 __kmp_install_signals(TRUE); 7101 #endif /* KMP_OS_WINDOWS */ 7102 #endif 7103 7104 /* we have finished the serial initialization */ 7105 __kmp_init_counter++; 7106 7107 __kmp_init_serial = TRUE; 7108 7109 if (__kmp_settings) { 7110 __kmp_env_print(); 7111 } 7112 7113 if (__kmp_display_env || __kmp_display_env_verbose) { 7114 __kmp_env_print_2(); 7115 } 7116 7117 #if OMPT_SUPPORT 7118 ompt_post_init(); 7119 #endif 7120 7121 KMP_MB(); 7122 7123 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7124 } 7125 7126 void __kmp_serial_initialize(void) { 7127 if (__kmp_init_serial) { 7128 return; 7129 } 7130 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7131 if (__kmp_init_serial) { 7132 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7133 return; 7134 } 7135 __kmp_do_serial_initialize(); 7136 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7137 } 7138 7139 static void __kmp_do_middle_initialize(void) { 7140 int i, j; 7141 int prev_dflt_team_nth; 7142 7143 if (!__kmp_init_serial) { 7144 __kmp_do_serial_initialize(); 7145 } 7146 7147 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7148 7149 // Save the previous value for the __kmp_dflt_team_nth so that 7150 // we can avoid some reinitialization if it hasn't changed. 7151 prev_dflt_team_nth = __kmp_dflt_team_nth; 7152 7153 #if KMP_AFFINITY_SUPPORTED 7154 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7155 // number of cores on the machine. 7156 __kmp_affinity_initialize(); 7157 7158 #endif /* KMP_AFFINITY_SUPPORTED */ 7159 7160 KMP_ASSERT(__kmp_xproc > 0); 7161 if (__kmp_avail_proc == 0) { 7162 __kmp_avail_proc = __kmp_xproc; 7163 } 7164 7165 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7166 // correct them now 7167 j = 0; 7168 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7169 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7170 __kmp_avail_proc; 7171 j++; 7172 } 7173 7174 if (__kmp_dflt_team_nth == 0) { 7175 #ifdef KMP_DFLT_NTH_CORES 7176 // Default #threads = #cores 7177 __kmp_dflt_team_nth = __kmp_ncores; 7178 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7179 "__kmp_ncores (%d)\n", 7180 __kmp_dflt_team_nth)); 7181 #else 7182 // Default #threads = #available OS procs 7183 __kmp_dflt_team_nth = __kmp_avail_proc; 7184 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7185 "__kmp_avail_proc(%d)\n", 7186 __kmp_dflt_team_nth)); 7187 #endif /* KMP_DFLT_NTH_CORES */ 7188 } 7189 7190 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7191 __kmp_dflt_team_nth = KMP_MIN_NTH; 7192 } 7193 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7194 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7195 } 7196 7197 if (__kmp_nesting_mode > 0) 7198 __kmp_set_nesting_mode_threads(); 7199 7200 // There's no harm in continuing if the following check fails, 7201 // but it indicates an error in the previous logic. 
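/* [Editor's note -- worked example with assumed numbers] On a machine where
   __kmp_xproc == 16 and all processors are usable, serial initialization set
   __kmp_dflt_team_nth_ub to 16. If neither OMP_NUM_THREADS nor
   omp_set_num_threads() supplied a value, the code above picks
   __kmp_dflt_team_nth = __kmp_ncores (when KMP_DFLT_NTH_CORES is defined) or
   __kmp_avail_proc otherwise, then clamps it into
   [KMP_MIN_NTH, __kmp_sys_max_nth]. The chosen default is expected to stay at
   or below the upper bound computed earlier, which the assertion below checks. */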
7202 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7203 7204 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7205 // Run through the __kmp_threads array and set the num threads icv for each 7206 // root thread that is currently registered with the RTL (which has not 7207 // already explicitly set its nthreads-var with a call to 7208 // omp_set_num_threads()). 7209 for (i = 0; i < __kmp_threads_capacity; i++) { 7210 kmp_info_t *thread = __kmp_threads[i]; 7211 if (thread == NULL) 7212 continue; 7213 if (thread->th.th_current_task->td_icvs.nproc != 0) 7214 continue; 7215 7216 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7217 } 7218 } 7219 KA_TRACE( 7220 20, 7221 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7222 __kmp_dflt_team_nth)); 7223 7224 #ifdef KMP_ADJUST_BLOCKTIME 7225 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7226 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7227 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7228 if (__kmp_nth > __kmp_avail_proc) { 7229 __kmp_zero_bt = TRUE; 7230 } 7231 } 7232 #endif /* KMP_ADJUST_BLOCKTIME */ 7233 7234 /* we have finished middle initialization */ 7235 TCW_SYNC_4(__kmp_init_middle, TRUE); 7236 7237 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7238 } 7239 7240 void __kmp_middle_initialize(void) { 7241 if (__kmp_init_middle) { 7242 return; 7243 } 7244 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7245 if (__kmp_init_middle) { 7246 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7247 return; 7248 } 7249 __kmp_do_middle_initialize(); 7250 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7251 } 7252 7253 void __kmp_parallel_initialize(void) { 7254 int gtid = __kmp_entry_gtid(); // this might be a new root 7255 7256 /* synchronize parallel initialization (for sibling) */ 7257 if (TCR_4(__kmp_init_parallel)) 7258 return; 7259 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7260 if (TCR_4(__kmp_init_parallel)) { 7261 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7262 return; 7263 } 7264 7265 /* TODO reinitialization after we have already shut down */ 7266 if (TCR_4(__kmp_global.g.g_done)) { 7267 KA_TRACE( 7268 10, 7269 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7270 __kmp_infinite_loop(); 7271 } 7272 7273 /* jc: The lock __kmp_initz_lock is already held, so calling 7274 __kmp_serial_initialize would cause a deadlock. So we call 7275 __kmp_do_serial_initialize directly. */ 7276 if (!__kmp_init_middle) { 7277 __kmp_do_middle_initialize(); 7278 } 7279 __kmp_assign_root_init_mask(); 7280 __kmp_resume_if_hard_paused(); 7281 7282 /* begin initialization */ 7283 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7284 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7285 7286 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7287 // Save the FP control regs. 7288 // Worker threads will set theirs to these values at thread startup. 
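/* [Editor's note -- illustrative sketch, not part of the runtime] The idea is
   to snapshot the primary thread's floating-point control state once, at
   parallel initialization, so workers can adopt identical rounding and
   exception-mask settings when they start. The runtime uses its own wrappers
   (below, additionally masking with KMP_X86_MXCSR_MASK); a minimal standalone
   illustration with the SSE intrinsics _mm_getcsr/_mm_setcsr from
   <xmmintrin.h> (names are hypothetical): */
#if 0
static unsigned example_saved_mxcsr;  // captured once by the primary thread
static void example_capture_fp_state() { example_saved_mxcsr = _mm_getcsr(); }
static void example_adopt_fp_state() { _mm_setcsr(example_saved_mxcsr); } // worker
#endif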
7289 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7290 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7291 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7292 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7293 7294 #if KMP_OS_UNIX 7295 #if KMP_HANDLE_SIGNALS 7296 /* must be after __kmp_serial_initialize */ 7297 __kmp_install_signals(TRUE); 7298 #endif 7299 #endif 7300 7301 __kmp_suspend_initialize(); 7302 7303 #if defined(USE_LOAD_BALANCE) 7304 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7305 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7306 } 7307 #else 7308 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7309 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7310 } 7311 #endif 7312 7313 if (__kmp_version) { 7314 __kmp_print_version_2(); 7315 } 7316 7317 /* we have finished parallel initialization */ 7318 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7319 7320 KMP_MB(); 7321 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7322 7323 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7324 } 7325 7326 void __kmp_hidden_helper_initialize() { 7327 if (TCR_4(__kmp_init_hidden_helper)) 7328 return; 7329 7330 // __kmp_parallel_initialize is required before we initialize hidden helper 7331 if (!TCR_4(__kmp_init_parallel)) 7332 __kmp_parallel_initialize(); 7333 7334 // Double check. Note that this double check should not be placed before 7335 // __kmp_parallel_initialize as it will cause dead lock. 7336 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7337 if (TCR_4(__kmp_init_hidden_helper)) { 7338 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7339 return; 7340 } 7341 7342 // Set the count of hidden helper tasks to be executed to zero 7343 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7344 7345 // Set the global variable indicating that we're initializing hidden helper 7346 // team/threads 7347 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7348 7349 // Platform independent initialization 7350 __kmp_do_initialize_hidden_helper_threads(); 7351 7352 // Wait here for the finish of initialization of hidden helper teams 7353 __kmp_hidden_helper_threads_initz_wait(); 7354 7355 // We have finished hidden helper initialization 7356 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7357 7358 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7359 } 7360 7361 /* ------------------------------------------------------------------------ */ 7362 7363 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7364 kmp_team_t *team) { 7365 kmp_disp_t *dispatch; 7366 7367 KMP_MB(); 7368 7369 /* none of the threads have encountered any constructs, yet. */ 7370 this_thr->th.th_local.this_construct = 0; 7371 #if KMP_CACHE_MANAGE 7372 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7373 #endif /* KMP_CACHE_MANAGE */ 7374 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7375 KMP_DEBUG_ASSERT(dispatch); 7376 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7377 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7378 // this_thr->th.th_info.ds.ds_tid ] ); 7379 7380 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7381 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7382 if (__kmp_env_consistency_check) 7383 __kmp_push_parallel(gtid, team->t.t_ident); 7384 7385 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7386 } 7387 7388 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7389 kmp_team_t *team) { 7390 if (__kmp_env_consistency_check) 7391 __kmp_pop_parallel(gtid, team->t.t_ident); 7392 7393 __kmp_finish_implicit_task(this_thr); 7394 } 7395 7396 int __kmp_invoke_task_func(int gtid) { 7397 int rc; 7398 int tid = __kmp_tid_from_gtid(gtid); 7399 kmp_info_t *this_thr = __kmp_threads[gtid]; 7400 kmp_team_t *team = this_thr->th.th_team; 7401 7402 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7403 #if USE_ITT_BUILD 7404 if (__itt_stack_caller_create_ptr) { 7405 // inform ittnotify about entering user's code 7406 if (team->t.t_stack_id != NULL) { 7407 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7408 } else { 7409 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7410 __kmp_itt_stack_callee_enter( 7411 (__itt_caller)team->t.t_parent->t.t_stack_id); 7412 } 7413 } 7414 #endif /* USE_ITT_BUILD */ 7415 #if INCLUDE_SSC_MARKS 7416 SSC_MARK_INVOKING(); 7417 #endif 7418 7419 #if OMPT_SUPPORT 7420 void *dummy; 7421 void **exit_frame_p; 7422 ompt_data_t *my_task_data; 7423 ompt_data_t *my_parallel_data; 7424 int ompt_team_size; 7425 7426 if (ompt_enabled.enabled) { 7427 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7428 .ompt_task_info.frame.exit_frame.ptr); 7429 } else { 7430 exit_frame_p = &dummy; 7431 } 7432 7433 my_task_data = 7434 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7435 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7436 if (ompt_enabled.ompt_callback_implicit_task) { 7437 ompt_team_size = team->t.t_nproc; 7438 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7439 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7440 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7441 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7442 } 7443 #endif 7444 7445 #if KMP_STATS_ENABLED 7446 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7447 if (previous_state == stats_state_e::TEAMS_REGION) { 7448 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7449 } else { 7450 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7451 } 7452 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7453 #endif 7454 7455 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7456 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7457 #if OMPT_SUPPORT 7458 , 7459 exit_frame_p 7460 #endif 7461 ); 7462 #if OMPT_SUPPORT 7463 *exit_frame_p = NULL; 7464 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7465 #endif 7466 7467 #if KMP_STATS_ENABLED 7468 if (previous_state == stats_state_e::TEAMS_REGION) { 7469 KMP_SET_THREAD_STATE(previous_state); 7470 } 7471 KMP_POP_PARTITIONED_TIMER(); 7472 #endif 7473 7474 #if USE_ITT_BUILD 7475 if (__itt_stack_caller_create_ptr) { 7476 // inform ittnotify about leaving user's code 7477 if (team->t.t_stack_id != NULL) { 7478 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7479 } else { 7480 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7481 __kmp_itt_stack_callee_leave( 7482 (__itt_caller)team->t.t_parent->t.t_stack_id); 7483 } 7484 } 7485 #endif /* USE_ITT_BUILD */ 7486 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7487 7488 return rc; 7489 } 7490 7491 void __kmp_teams_master(int gtid) { 7492 // This routine is called by all primary threads in teams construct 7493 kmp_info_t *thr = __kmp_threads[gtid]; 7494 kmp_team_t *team = thr->th.th_team; 7495 ident_t *loc = team->t.t_ident; 7496 
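/* [Editor's note -- usage illustration, not part of the runtime] This routine
   runs once per league primary thread when the application executes a teams
   construct; each primary then forks the inner parallel region via
   __kmp_fork_call() below. A user-level fragment that would exercise this
   path (clause values are arbitrary examples, do_work is a placeholder):

     #pragma omp teams num_teams(4) thread_limit(8)
     {
       #pragma omp parallel
       do_work(omp_get_team_num(), omp_get_thread_num());
     }
*/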
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7497 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7498 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7499 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7500 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7501 7502 // This thread is a new CG root. Set up the proper variables. 7503 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7504 tmp->cg_root = thr; // Make thr the CG root 7505 // Init to thread limit stored when league primary threads were forked 7506 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7507 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7508 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7509 " cg_nthreads to 1\n", 7510 thr, tmp)); 7511 tmp->up = thr->th.th_cg_roots; 7512 thr->th.th_cg_roots = tmp; 7513 7514 // Launch league of teams now, but not let workers execute 7515 // (they hang on fork barrier until next parallel) 7516 #if INCLUDE_SSC_MARKS 7517 SSC_MARK_FORKING(); 7518 #endif 7519 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7520 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7521 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7522 #if INCLUDE_SSC_MARKS 7523 SSC_MARK_JOINING(); 7524 #endif 7525 // If the team size was reduced from the limit, set it to the new size 7526 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7527 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7528 // AC: last parameter "1" eliminates join barrier which won't work because 7529 // worker threads are in a fork barrier waiting for more parallel regions 7530 __kmp_join_call(loc, gtid 7531 #if OMPT_SUPPORT 7532 , 7533 fork_context_intel 7534 #endif 7535 , 7536 1); 7537 } 7538 7539 int __kmp_invoke_teams_master(int gtid) { 7540 kmp_info_t *this_thr = __kmp_threads[gtid]; 7541 kmp_team_t *team = this_thr->th.th_team; 7542 #if KMP_DEBUG 7543 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7544 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7545 (void *)__kmp_teams_master); 7546 #endif 7547 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7548 #if OMPT_SUPPORT 7549 int tid = __kmp_tid_from_gtid(gtid); 7550 ompt_data_t *task_data = 7551 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7552 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7553 if (ompt_enabled.ompt_callback_implicit_task) { 7554 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7555 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7556 ompt_task_initial); 7557 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7558 } 7559 #endif 7560 __kmp_teams_master(gtid); 7561 #if OMPT_SUPPORT 7562 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7563 #endif 7564 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7565 return 1; 7566 } 7567 7568 /* this sets the requested number of threads for the next parallel region 7569 encountered by this team. 
since this should be enclosed in the forkjoin 7570 critical section it should avoid race conditions with asymmetrical nested 7571 parallelism */ 7572 7573 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7574 kmp_info_t *thr = __kmp_threads[gtid]; 7575 7576 if (num_threads > 0) 7577 thr->th.th_set_nproc = num_threads; 7578 } 7579 7580 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7581 int num_threads) { 7582 KMP_DEBUG_ASSERT(thr); 7583 // Remember the number of threads for inner parallel regions 7584 if (!TCR_4(__kmp_init_middle)) 7585 __kmp_middle_initialize(); // get internal globals calculated 7586 __kmp_assign_root_init_mask(); 7587 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7588 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7589 7590 if (num_threads == 0) { 7591 if (__kmp_teams_thread_limit > 0) { 7592 num_threads = __kmp_teams_thread_limit; 7593 } else { 7594 num_threads = __kmp_avail_proc / num_teams; 7595 } 7596 // adjust num_threads w/o warning as it is not user setting 7597 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7598 // no thread_limit clause specified - do not change thread-limit-var ICV 7599 if (num_threads > __kmp_dflt_team_nth) { 7600 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7601 } 7602 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7603 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7604 } // prevent team size to exceed thread-limit-var 7605 if (num_teams * num_threads > __kmp_teams_max_nth) { 7606 num_threads = __kmp_teams_max_nth / num_teams; 7607 } 7608 if (num_threads == 0) { 7609 num_threads = 1; 7610 } 7611 } else { 7612 // This thread will be the primary thread of the league primary threads 7613 // Store new thread limit; old limit is saved in th_cg_roots list 7614 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7615 // num_threads = min(num_threads, nthreads-var) 7616 if (num_threads > __kmp_dflt_team_nth) { 7617 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7618 } 7619 if (num_teams * num_threads > __kmp_teams_max_nth) { 7620 int new_threads = __kmp_teams_max_nth / num_teams; 7621 if (new_threads == 0) { 7622 new_threads = 1; 7623 } 7624 if (new_threads != num_threads) { 7625 if (!__kmp_reserve_warn) { // user asked for too many threads 7626 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7627 __kmp_msg(kmp_ms_warning, 7628 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7629 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7630 } 7631 } 7632 num_threads = new_threads; 7633 } 7634 } 7635 thr->th.th_teams_size.nth = num_threads; 7636 } 7637 7638 /* this sets the requested number of teams for the teams region and/or 7639 the number of threads for the next parallel region encountered */ 7640 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7641 int num_threads) { 7642 kmp_info_t *thr = __kmp_threads[gtid]; 7643 KMP_DEBUG_ASSERT(num_teams >= 0); 7644 KMP_DEBUG_ASSERT(num_threads >= 0); 7645 7646 if (num_teams == 0) { 7647 if (__kmp_nteams > 0) { 7648 num_teams = __kmp_nteams; 7649 } else { 7650 num_teams = 1; // default number of teams is 1. 7651 } 7652 } 7653 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
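/* [Editor's note -- worked example with assumed numbers] If __kmp_teams_max_nth
   were 256 and the application asked for num_teams(1024), the branch below
   warns once and reduces the team count to 256. The per-team thread count is
   then derived in __kmp_push_thread_limit() above: with no thread_limit
   clause it starts from KMP_TEAMS_THREAD_LIMIT or __kmp_avail_proc/num_teams,
   is clamped by nthreads-var, thread-limit-var and __kmp_teams_max_nth, and
   has a floor of 1. */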
7654 if (!__kmp_reserve_warn) { 7655 __kmp_reserve_warn = 1; 7656 __kmp_msg(kmp_ms_warning, 7657 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7658 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7659 } 7660 num_teams = __kmp_teams_max_nth; 7661 } 7662 // Set number of teams (number of threads in the outer "parallel" of the 7663 // teams) 7664 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7665 7666 __kmp_push_thread_limit(thr, num_teams, num_threads); 7667 } 7668 7669 /* This sets the requested number of teams for the teams region and/or 7670 the number of threads for the next parallel region encountered */ 7671 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7672 int num_teams_ub, int num_threads) { 7673 kmp_info_t *thr = __kmp_threads[gtid]; 7674 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7675 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7676 KMP_DEBUG_ASSERT(num_threads >= 0); 7677 7678 if (num_teams_lb > num_teams_ub) { 7679 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7680 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7681 } 7682 7683 int num_teams = 1; // defalt number of teams is 1. 7684 7685 if (num_teams_lb == 0 && num_teams_ub > 0) 7686 num_teams_lb = num_teams_ub; 7687 7688 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7689 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7690 if (num_teams > __kmp_teams_max_nth) { 7691 if (!__kmp_reserve_warn) { 7692 __kmp_reserve_warn = 1; 7693 __kmp_msg(kmp_ms_warning, 7694 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7695 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7696 } 7697 num_teams = __kmp_teams_max_nth; 7698 } 7699 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7700 num_teams = num_teams_ub; 7701 } else { // num_teams_lb <= num_teams <= num_teams_ub 7702 if (num_threads == 0) { 7703 if (num_teams_ub > __kmp_teams_max_nth) { 7704 num_teams = num_teams_lb; 7705 } else { 7706 num_teams = num_teams_ub; 7707 } 7708 } else { 7709 num_teams = (num_threads > __kmp_teams_max_nth) 7710 ? num_teams 7711 : __kmp_teams_max_nth / num_threads; 7712 if (num_teams < num_teams_lb) { 7713 num_teams = num_teams_lb; 7714 } else if (num_teams > num_teams_ub) { 7715 num_teams = num_teams_ub; 7716 } 7717 } 7718 } 7719 // Set number of teams (number of threads in the outer "parallel" of the 7720 // teams) 7721 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7722 7723 __kmp_push_thread_limit(thr, num_teams, num_threads); 7724 } 7725 7726 // Set the proc_bind var to use in the following parallel region. 7727 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7728 kmp_info_t *thr = __kmp_threads[gtid]; 7729 thr->th.th_set_proc_bind = proc_bind; 7730 } 7731 7732 /* Launch the worker threads into the microtask. */ 7733 7734 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7735 kmp_info_t *this_thr = __kmp_threads[gtid]; 7736 7737 #ifdef KMP_DEBUG 7738 int f; 7739 #endif /* KMP_DEBUG */ 7740 7741 KMP_DEBUG_ASSERT(team); 7742 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7743 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7744 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7745 7746 team->t.t_construct = 0; /* no single directives seen yet */ 7747 team->t.t_ordered.dt.t_value = 7748 0; /* thread 0 enters the ordered section first */ 7749 7750 /* Reset the identifiers on the dispatch buffer */ 7751 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7752 if (team->t.t_max_nproc > 1) { 7753 int i; 7754 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7755 team->t.t_disp_buffer[i].buffer_index = i; 7756 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7757 } 7758 } else { 7759 team->t.t_disp_buffer[0].buffer_index = 0; 7760 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7761 } 7762 7763 KMP_MB(); /* Flush all pending memory write invalidates. */ 7764 KMP_ASSERT(this_thr->th.th_team == team); 7765 7766 #ifdef KMP_DEBUG 7767 for (f = 0; f < team->t.t_nproc; f++) { 7768 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7769 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7770 } 7771 #endif /* KMP_DEBUG */ 7772 7773 /* release the worker threads so they may begin working */ 7774 __kmp_fork_barrier(gtid, 0); 7775 } 7776 7777 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7778 kmp_info_t *this_thr = __kmp_threads[gtid]; 7779 7780 KMP_DEBUG_ASSERT(team); 7781 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7782 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7783 KMP_MB(); /* Flush all pending memory write invalidates. */ 7784 7785 /* Join barrier after fork */ 7786 7787 #ifdef KMP_DEBUG 7788 if (__kmp_threads[gtid] && 7789 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7790 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7791 __kmp_threads[gtid]); 7792 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7793 "team->t.t_nproc=%d\n", 7794 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7795 team->t.t_nproc); 7796 __kmp_print_structure(); 7797 } 7798 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7799 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7800 #endif /* KMP_DEBUG */ 7801 7802 __kmp_join_barrier(gtid); /* wait for everyone */ 7803 #if OMPT_SUPPORT 7804 if (ompt_enabled.enabled && 7805 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7806 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7807 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7808 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7809 #if OMPT_OPTIONAL 7810 void *codeptr = NULL; 7811 if (KMP_MASTER_TID(ds_tid) && 7812 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7813 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7814 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7815 7816 if (ompt_enabled.ompt_callback_sync_region_wait) { 7817 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7818 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7819 codeptr); 7820 } 7821 if (ompt_enabled.ompt_callback_sync_region) { 7822 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7823 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7824 codeptr); 7825 } 7826 #endif 7827 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7828 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7829 ompt_scope_end, NULL, task_data, 0, ds_tid, 7830 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7831 } 7832 } 7833 #endif 7834 7835 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7836 KMP_ASSERT(this_thr->th.th_team == team); 7837 } 7838 7839 /* ------------------------------------------------------------------------ */ 7840 7841 #ifdef USE_LOAD_BALANCE 7842 7843 // Return the worker threads actively spinning in the hot team, if we 7844 // are at the outermost level of parallelism. Otherwise, return 0. 7845 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7846 int i; 7847 int retval; 7848 kmp_team_t *hot_team; 7849 7850 if (root->r.r_active) { 7851 return 0; 7852 } 7853 hot_team = root->r.r_hot_team; 7854 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7855 return hot_team->t.t_nproc - 1; // Don't count primary thread 7856 } 7857 7858 // Skip the primary thread - it is accounted for elsewhere. 7859 retval = 0; 7860 for (i = 1; i < hot_team->t.t_nproc; i++) { 7861 if (hot_team->t.t_threads[i]->th.th_active) { 7862 retval++; 7863 } 7864 } 7865 return retval; 7866 } 7867 7868 // Perform an automatic adjustment to the number of 7869 // threads used by the next parallel region. 7870 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7871 int retval; 7872 int pool_active; 7873 int hot_team_active; 7874 int team_curr_active; 7875 int system_active; 7876 7877 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7878 set_nproc)); 7879 KMP_DEBUG_ASSERT(root); 7880 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7881 ->th.th_current_task->td_icvs.dynamic == TRUE); 7882 KMP_DEBUG_ASSERT(set_nproc > 1); 7883 7884 if (set_nproc == 1) { 7885 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7886 return 1; 7887 } 7888 7889 // Threads that are active in the thread pool, active in the hot team for this 7890 // particular root (if we are at the outer par level), and the currently 7891 // executing thread (to become the primary thread) are available to add to the 7892 // new team, but are currently contributing to the system load, and must be 7893 // accounted for. 7894 pool_active = __kmp_thread_pool_active_nth; 7895 hot_team_active = __kmp_active_hot_team_nproc(root); 7896 team_curr_active = pool_active + hot_team_active + 1; 7897 7898 // Check the system load. 7899 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7900 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7901 "hot team active = %d\n", 7902 system_active, pool_active, hot_team_active)); 7903 7904 if (system_active < 0) { 7905 // There was an error reading the necessary info from /proc, so use the 7906 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7907 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7908 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7909 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7910 7911 // Make this call behave like the thread limit algorithm. 7912 retval = __kmp_avail_proc - __kmp_nth + 7913 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7914 if (retval > set_nproc) { 7915 retval = set_nproc; 7916 } 7917 if (retval < KMP_MIN_NTH) { 7918 retval = KMP_MIN_NTH; 7919 } 7920 7921 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7922 retval)); 7923 return retval; 7924 } 7925 7926 // There is a slight delay in the load balance algorithm in detecting new 7927 // running procs. The real system load at this instant should be at least as 7928 // large as the #active omp thread that are available to add to the team. 
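/* [Editor's note -- worked example with assumed numbers] Suppose
   __kmp_avail_proc == 8, one idle pooled thread and one active hot-team worker
   exist, so team_curr_active == 1 + 1 + 1 == 3, and the system currently
   reports system_active == 6 runnable threads. The code below then proposes
   retval = 8 - 6 + 3 = 5 threads for the new team, subsequently clamped to
   [KMP_MIN_NTH, set_nproc]. */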
7929 if (system_active < team_curr_active) { 7930 system_active = team_curr_active; 7931 } 7932 retval = __kmp_avail_proc - system_active + team_curr_active; 7933 if (retval > set_nproc) { 7934 retval = set_nproc; 7935 } 7936 if (retval < KMP_MIN_NTH) { 7937 retval = KMP_MIN_NTH; 7938 } 7939 7940 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7941 return retval; 7942 } // __kmp_load_balance_nproc() 7943 7944 #endif /* USE_LOAD_BALANCE */ 7945 7946 /* ------------------------------------------------------------------------ */ 7947 7948 /* NOTE: this is called with the __kmp_init_lock held */ 7949 void __kmp_cleanup(void) { 7950 int f; 7951 7952 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7953 7954 if (TCR_4(__kmp_init_parallel)) { 7955 #if KMP_HANDLE_SIGNALS 7956 __kmp_remove_signals(); 7957 #endif 7958 TCW_4(__kmp_init_parallel, FALSE); 7959 } 7960 7961 if (TCR_4(__kmp_init_middle)) { 7962 #if KMP_AFFINITY_SUPPORTED 7963 __kmp_affinity_uninitialize(); 7964 #endif /* KMP_AFFINITY_SUPPORTED */ 7965 __kmp_cleanup_hierarchy(); 7966 TCW_4(__kmp_init_middle, FALSE); 7967 } 7968 7969 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7970 7971 if (__kmp_init_serial) { 7972 __kmp_runtime_destroy(); 7973 __kmp_init_serial = FALSE; 7974 } 7975 7976 __kmp_cleanup_threadprivate_caches(); 7977 7978 for (f = 0; f < __kmp_threads_capacity; f++) { 7979 if (__kmp_root[f] != NULL) { 7980 __kmp_free(__kmp_root[f]); 7981 __kmp_root[f] = NULL; 7982 } 7983 } 7984 __kmp_free(__kmp_threads); 7985 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7986 // there is no need in freeing __kmp_root. 7987 __kmp_threads = NULL; 7988 __kmp_root = NULL; 7989 __kmp_threads_capacity = 0; 7990 7991 #if KMP_USE_DYNAMIC_LOCK 7992 __kmp_cleanup_indirect_user_locks(); 7993 #else 7994 __kmp_cleanup_user_locks(); 7995 #endif 7996 #if OMPD_SUPPORT 7997 if (ompd_state) { 7998 __kmp_free(ompd_env_block); 7999 ompd_env_block = NULL; 8000 ompd_env_block_size = 0; 8001 } 8002 #endif 8003 8004 #if KMP_AFFINITY_SUPPORTED 8005 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 8006 __kmp_cpuinfo_file = NULL; 8007 #endif /* KMP_AFFINITY_SUPPORTED */ 8008 8009 #if KMP_USE_ADAPTIVE_LOCKS 8010 #if KMP_DEBUG_ADAPTIVE_LOCKS 8011 __kmp_print_speculative_stats(); 8012 #endif 8013 #endif 8014 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 8015 __kmp_nested_nth.nth = NULL; 8016 __kmp_nested_nth.size = 0; 8017 __kmp_nested_nth.used = 0; 8018 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 8019 __kmp_nested_proc_bind.bind_types = NULL; 8020 __kmp_nested_proc_bind.size = 0; 8021 __kmp_nested_proc_bind.used = 0; 8022 if (__kmp_affinity_format) { 8023 KMP_INTERNAL_FREE(__kmp_affinity_format); 8024 __kmp_affinity_format = NULL; 8025 } 8026 8027 __kmp_i18n_catclose(); 8028 8029 #if KMP_USE_HIER_SCHED 8030 __kmp_hier_scheds.deallocate(); 8031 #endif 8032 8033 #if KMP_STATS_ENABLED 8034 __kmp_stats_fini(); 8035 #endif 8036 8037 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 8038 } 8039 8040 /* ------------------------------------------------------------------------ */ 8041 8042 int __kmp_ignore_mppbeg(void) { 8043 char *env; 8044 8045 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 8046 if (__kmp_str_match_false(env)) 8047 return FALSE; 8048 } 8049 // By default __kmpc_begin() is no-op. 
8050 return TRUE; 8051 } 8052 8053 int __kmp_ignore_mppend(void) { 8054 char *env; 8055 8056 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 8057 if (__kmp_str_match_false(env)) 8058 return FALSE; 8059 } 8060 // By default __kmpc_end() is no-op. 8061 return TRUE; 8062 } 8063 8064 void __kmp_internal_begin(void) { 8065 int gtid; 8066 kmp_root_t *root; 8067 8068 /* this is a very important step as it will register new sibling threads 8069 and assign these new uber threads a new gtid */ 8070 gtid = __kmp_entry_gtid(); 8071 root = __kmp_threads[gtid]->th.th_root; 8072 KMP_ASSERT(KMP_UBER_GTID(gtid)); 8073 8074 if (root->r.r_begin) 8075 return; 8076 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 8077 if (root->r.r_begin) { 8078 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8079 return; 8080 } 8081 8082 root->r.r_begin = TRUE; 8083 8084 __kmp_release_lock(&root->r.r_begin_lock, gtid); 8085 } 8086 8087 /* ------------------------------------------------------------------------ */ 8088 8089 void __kmp_user_set_library(enum library_type arg) { 8090 int gtid; 8091 kmp_root_t *root; 8092 kmp_info_t *thread; 8093 8094 /* first, make sure we are initialized so we can get our gtid */ 8095 8096 gtid = __kmp_entry_gtid(); 8097 thread = __kmp_threads[gtid]; 8098 8099 root = thread->th.th_root; 8100 8101 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 8102 library_serial)); 8103 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 8104 thread */ 8105 KMP_WARNING(SetLibraryIncorrectCall); 8106 return; 8107 } 8108 8109 switch (arg) { 8110 case library_serial: 8111 thread->th.th_set_nproc = 0; 8112 set__nproc(thread, 1); 8113 break; 8114 case library_turnaround: 8115 thread->th.th_set_nproc = 0; 8116 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8117 : __kmp_dflt_team_nth_ub); 8118 break; 8119 case library_throughput: 8120 thread->th.th_set_nproc = 0; 8121 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8122 : __kmp_dflt_team_nth_ub); 8123 break; 8124 default: 8125 KMP_FATAL(UnknownLibraryType, arg); 8126 } 8127 8128 __kmp_aux_set_library(arg); 8129 } 8130 8131 void __kmp_aux_set_stacksize(size_t arg) { 8132 if (!__kmp_init_serial) 8133 __kmp_serial_initialize(); 8134 8135 #if KMP_OS_DARWIN 8136 if (arg & (0x1000 - 1)) { 8137 arg &= ~(0x1000 - 1); 8138 if (arg + 0x1000) /* check for overflow if we round up */ 8139 arg += 0x1000; 8140 } 8141 #endif 8142 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8143 8144 /* only change the default stacksize before the first parallel region */ 8145 if (!TCR_4(__kmp_init_parallel)) { 8146 size_t value = arg; /* argument is in bytes */ 8147 8148 if (value < __kmp_sys_min_stksize) 8149 value = __kmp_sys_min_stksize; 8150 else if (value > KMP_MAX_STKSIZE) 8151 value = KMP_MAX_STKSIZE; 8152 8153 __kmp_stksize = value; 8154 8155 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8156 } 8157 8158 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8159 } 8160 8161 /* set the behaviour of the runtime library */ 8162 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 8163 void __kmp_aux_set_library(enum library_type arg) { 8164 __kmp_library = arg; 8165 8166 switch (__kmp_library) { 8167 case library_serial: { 8168 KMP_INFORM(LibraryIsSerial); 8169 } break; 8170 case library_turnaround: 8171 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 8172 __kmp_use_yield = 2; // only yield when oversubscribed 8173 break; 8174 case library_throughput: 8175 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 8176 __kmp_dflt_blocktime = 200; 8177 break; 8178 default: 8179 KMP_FATAL(UnknownLibraryType, arg); 8180 } 8181 } 8182 8183 /* Getting team information common for all team API */ 8184 // Returns NULL if not in teams construct 8185 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 8186 kmp_info_t *thr = __kmp_entry_thread(); 8187 teams_serialized = 0; 8188 if (thr->th.th_teams_microtask) { 8189 kmp_team_t *team = thr->th.th_team; 8190 int tlevel = thr->th.th_teams_level; // the level of the teams construct 8191 int ii = team->t.t_level; 8192 teams_serialized = team->t.t_serialized; 8193 int level = tlevel + 1; 8194 KMP_DEBUG_ASSERT(ii >= tlevel); 8195 while (ii > level) { 8196 for (teams_serialized = team->t.t_serialized; 8197 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 8198 } 8199 if (team->t.t_serialized && (!teams_serialized)) { 8200 team = team->t.t_parent; 8201 continue; 8202 } 8203 if (ii > level) { 8204 team = team->t.t_parent; 8205 ii--; 8206 } 8207 } 8208 return team; 8209 } 8210 return NULL; 8211 } 8212 8213 int __kmp_aux_get_team_num() { 8214 int serialized; 8215 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8216 if (team) { 8217 if (serialized > 1) { 8218 return 0; // teams region is serialized ( 1 team of 1 thread ). 8219 } else { 8220 return team->t.t_master_tid; 8221 } 8222 } 8223 return 0; 8224 } 8225 8226 int __kmp_aux_get_num_teams() { 8227 int serialized; 8228 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 8229 if (team) { 8230 if (serialized > 1) { 8231 return 1; 8232 } else { 8233 return team->t.t_parent->t.t_nproc; 8234 } 8235 } 8236 return 1; 8237 } 8238 8239 /* ------------------------------------------------------------------------ */ 8240 8241 /* 8242 * Affinity Format Parser 8243 * 8244 * Field is in form of: %[[[0].]size]type 8245 * % and type are required (%% means print a literal '%') 8246 * type is either single char or long name surrounded by {}, 8247 * e.g., N or {num_threads} 8248 * 0 => leading zeros 8249 * . => right justified when size is specified 8250 * by default output is left justified 8251 * size is the *minimum* field length 8252 * All other characters are printed as is 8253 * 8254 * Available field types: 8255 * L {thread_level} - omp_get_level() 8256 * n {thread_num} - omp_get_thread_num() 8257 * h {host} - name of host machine 8258 * P {process_id} - process id (integer) 8259 * T {thread_identifier} - native thread identifier (integer) 8260 * N {num_threads} - omp_get_num_threads() 8261 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 8262 * a {thread_affinity} - comma separated list of integers or integer ranges 8263 * (values of affinity mask) 8264 * 8265 * Implementation-specific field types can be added 8266 * If a type is unknown, print "undefined" 8267 */ 8268 8269 // Structure holding the short name, long name, and corresponding data type 8270 // for snprintf. A table of these will represent the entire valid keyword 8271 // field types. 
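// Editor's note (illustrative, not part of the upstream source): combining the
// grammar above with the table below, a format string such as
//
//   "host=%H pid=%P tid=%i thread %0.4n/%N affinity={%A}"
//
// could produce output along the lines of (hypothetical values):
//
//   host=node01 pid=12345 tid=67890 thread 0002/4 affinity={8,24}
//
// Such a string can be installed via the OMP_AFFINITY_FORMAT environment
// variable or omp_set_affinity_format(), and is consumed by
// omp_display_affinity()/omp_capture_affinity() (see the routines further
// below).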
8272 typedef struct kmp_affinity_format_field_t { 8273 char short_name; // from spec e.g., L -> thread level 8274 const char *long_name; // from spec thread_level -> thread level 8275 char field_format; // data type for snprintf (typically 'd' or 's' 8276 // for integer or string) 8277 } kmp_affinity_format_field_t; 8278 8279 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8280 #if KMP_AFFINITY_SUPPORTED 8281 {'A', "thread_affinity", 's'}, 8282 #endif 8283 {'t', "team_num", 'd'}, 8284 {'T', "num_teams", 'd'}, 8285 {'L', "nesting_level", 'd'}, 8286 {'n', "thread_num", 'd'}, 8287 {'N', "num_threads", 'd'}, 8288 {'a', "ancestor_tnum", 'd'}, 8289 {'H', "host", 's'}, 8290 {'P', "process_id", 'd'}, 8291 {'i', "native_thread_id", 'd'}}; 8292 8293 // Return the number of characters it takes to hold field 8294 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8295 const char **ptr, 8296 kmp_str_buf_t *field_buffer) { 8297 int rc, format_index, field_value; 8298 const char *width_left, *width_right; 8299 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8300 static const int FORMAT_SIZE = 20; 8301 char format[FORMAT_SIZE] = {0}; 8302 char absolute_short_name = 0; 8303 8304 KMP_DEBUG_ASSERT(gtid >= 0); 8305 KMP_DEBUG_ASSERT(th); 8306 KMP_DEBUG_ASSERT(**ptr == '%'); 8307 KMP_DEBUG_ASSERT(field_buffer); 8308 8309 __kmp_str_buf_clear(field_buffer); 8310 8311 // Skip the initial % 8312 (*ptr)++; 8313 8314 // Check for %% first 8315 if (**ptr == '%') { 8316 __kmp_str_buf_cat(field_buffer, "%", 1); 8317 (*ptr)++; // skip over the second % 8318 return 1; 8319 } 8320 8321 // Parse field modifiers if they are present 8322 pad_zeros = false; 8323 if (**ptr == '0') { 8324 pad_zeros = true; 8325 (*ptr)++; // skip over 0 8326 } 8327 right_justify = false; 8328 if (**ptr == '.') { 8329 right_justify = true; 8330 (*ptr)++; // skip over . 8331 } 8332 // Parse width of field: [width_left, width_right) 8333 width_left = width_right = NULL; 8334 if (**ptr >= '0' && **ptr <= '9') { 8335 width_left = *ptr; 8336 SKIP_DIGITS(*ptr); 8337 width_right = *ptr; 8338 } 8339 8340 // Create the format for KMP_SNPRINTF based on flags parsed above 8341 format_index = 0; 8342 format[format_index++] = '%'; 8343 if (!right_justify) 8344 format[format_index++] = '-'; 8345 if (pad_zeros) 8346 format[format_index++] = '0'; 8347 if (width_left && width_right) { 8348 int i = 0; 8349 // Only allow 8 digit number widths. 
8350 // This also prevents overflowing format variable 8351 while (i < 8 && width_left < width_right) { 8352 format[format_index++] = *width_left; 8353 width_left++; 8354 i++; 8355 } 8356 } 8357 8358 // Parse a name (long or short) 8359 // Canonicalize the name into absolute_short_name 8360 found_valid_name = false; 8361 parse_long_name = (**ptr == '{'); 8362 if (parse_long_name) 8363 (*ptr)++; // skip initial left brace 8364 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8365 sizeof(__kmp_affinity_format_table[0]); 8366 ++i) { 8367 char short_name = __kmp_affinity_format_table[i].short_name; 8368 const char *long_name = __kmp_affinity_format_table[i].long_name; 8369 char field_format = __kmp_affinity_format_table[i].field_format; 8370 if (parse_long_name) { 8371 size_t length = KMP_STRLEN(long_name); 8372 if (strncmp(*ptr, long_name, length) == 0) { 8373 found_valid_name = true; 8374 (*ptr) += length; // skip the long name 8375 } 8376 } else if (**ptr == short_name) { 8377 found_valid_name = true; 8378 (*ptr)++; // skip the short name 8379 } 8380 if (found_valid_name) { 8381 format[format_index++] = field_format; 8382 format[format_index++] = '\0'; 8383 absolute_short_name = short_name; 8384 break; 8385 } 8386 } 8387 if (parse_long_name) { 8388 if (**ptr != '}') { 8389 absolute_short_name = 0; 8390 } else { 8391 (*ptr)++; // skip over the right brace 8392 } 8393 } 8394 8395 // Attempt to fill the buffer with the requested 8396 // value using snprintf within __kmp_str_buf_print() 8397 switch (absolute_short_name) { 8398 case 't': 8399 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8400 break; 8401 case 'T': 8402 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8403 break; 8404 case 'L': 8405 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8406 break; 8407 case 'n': 8408 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8409 break; 8410 case 'H': { 8411 static const int BUFFER_SIZE = 256; 8412 char buf[BUFFER_SIZE]; 8413 __kmp_expand_host_name(buf, BUFFER_SIZE); 8414 rc = __kmp_str_buf_print(field_buffer, format, buf); 8415 } break; 8416 case 'P': 8417 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8418 break; 8419 case 'i': 8420 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8421 break; 8422 case 'N': 8423 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8424 break; 8425 case 'a': 8426 field_value = 8427 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8428 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8429 break; 8430 #if KMP_AFFINITY_SUPPORTED 8431 case 'A': { 8432 kmp_str_buf_t buf; 8433 __kmp_str_buf_init(&buf); 8434 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8435 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8436 __kmp_str_buf_free(&buf); 8437 } break; 8438 #endif 8439 default: 8440 // According to spec, If an implementation does not have info for field 8441 // type, then "undefined" is printed 8442 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8443 // Skip the field 8444 if (parse_long_name) { 8445 SKIP_TOKEN(*ptr); 8446 if (**ptr == '}') 8447 (*ptr)++; 8448 } else { 8449 (*ptr)++; 8450 } 8451 } 8452 8453 KMP_ASSERT(format_index <= FORMAT_SIZE); 8454 return rc; 8455 } 8456 8457 /* 8458 * Return number of characters needed to hold the affinity string 8459 * (not including null byte character) 8460 * The resultant string is printed to buffer, 
which the caller can then 8461 * handle afterwards 8462 */ 8463 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8464 kmp_str_buf_t *buffer) { 8465 const char *parse_ptr; 8466 size_t retval; 8467 const kmp_info_t *th; 8468 kmp_str_buf_t field; 8469 8470 KMP_DEBUG_ASSERT(buffer); 8471 KMP_DEBUG_ASSERT(gtid >= 0); 8472 8473 __kmp_str_buf_init(&field); 8474 __kmp_str_buf_clear(buffer); 8475 8476 th = __kmp_threads[gtid]; 8477 retval = 0; 8478 8479 // If format is NULL or zero-length string, then we use 8480 // affinity-format-var ICV 8481 parse_ptr = format; 8482 if (parse_ptr == NULL || *parse_ptr == '\0') { 8483 parse_ptr = __kmp_affinity_format; 8484 } 8485 KMP_DEBUG_ASSERT(parse_ptr); 8486 8487 while (*parse_ptr != '\0') { 8488 // Parse a field 8489 if (*parse_ptr == '%') { 8490 // Put field in the buffer 8491 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8492 __kmp_str_buf_catbuf(buffer, &field); 8493 retval += rc; 8494 } else { 8495 // Put literal character in buffer 8496 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8497 retval++; 8498 parse_ptr++; 8499 } 8500 } 8501 __kmp_str_buf_free(&field); 8502 return retval; 8503 } 8504 8505 // Displays the affinity string to stdout 8506 void __kmp_aux_display_affinity(int gtid, const char *format) { 8507 kmp_str_buf_t buf; 8508 __kmp_str_buf_init(&buf); 8509 __kmp_aux_capture_affinity(gtid, format, &buf); 8510 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8511 __kmp_str_buf_free(&buf); 8512 } 8513 8514 /* ------------------------------------------------------------------------ */ 8515 8516 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8517 int blocktime = arg; /* argument is in milliseconds */ 8518 #if KMP_USE_MONITOR 8519 int bt_intervals; 8520 #endif 8521 kmp_int8 bt_set; 8522 8523 __kmp_save_internal_controls(thread); 8524 8525 /* Normalize and set blocktime for the teams */ 8526 if (blocktime < KMP_MIN_BLOCKTIME) 8527 blocktime = KMP_MIN_BLOCKTIME; 8528 else if (blocktime > KMP_MAX_BLOCKTIME) 8529 blocktime = KMP_MAX_BLOCKTIME; 8530 8531 set__blocktime_team(thread->th.th_team, tid, blocktime); 8532 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8533 8534 #if KMP_USE_MONITOR 8535 /* Calculate and set blocktime intervals for the teams */ 8536 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8537 8538 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8539 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8540 #endif 8541 8542 /* Set whether blocktime has been set to "TRUE" */ 8543 bt_set = TRUE; 8544 8545 set__bt_set_team(thread->th.th_team, tid, bt_set); 8546 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8547 #if KMP_USE_MONITOR 8548 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8549 "bt_intervals=%d, monitor_updates=%d\n", 8550 __kmp_gtid_from_tid(tid, thread->th.th_team), 8551 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8552 __kmp_monitor_wakeups)); 8553 #else 8554 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8555 __kmp_gtid_from_tid(tid, thread->th.th_team), 8556 thread->th.th_team->t.t_id, tid, blocktime)); 8557 #endif 8558 } 8559 8560 void __kmp_aux_set_defaults(char const *str, size_t len) { 8561 if (!__kmp_init_serial) { 8562 __kmp_serial_initialize(); 8563 } 8564 __kmp_env_initialize(str); 8565 8566 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8567 __kmp_env_print(); 8568 } 8569 } // __kmp_aux_set_defaults 
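// Editor's note (illustrative, not part of the upstream source): the
// capture/display routines above (__kmp_aux_capture_affinity and
// __kmp_aux_display_affinity) back the OpenMP 5.0 affinity-display API. A
// minimal user-level sketch exercising them (declarations come from <omp.h>,
// not from this file):
//
//   #include <omp.h>
//   #include <stdio.h>
//
//   int main(void) {
//     omp_set_affinity_format("T#%0.3n of %N on %{host}");
//   #pragma omp parallel
//     {
//       char buf[128];
//       size_t needed = omp_capture_affinity(buf, sizeof(buf), NULL);
//       if (needed < sizeof(buf))    // buffer was large enough
//         printf("%s\n", buf);
//       omp_display_affinity(NULL);  // NULL => use affinity-format-var
//     }
//     return 0;
//   }
//
// Passing NULL (or an empty string) as the format falls back to the
// affinity-format-var ICV, exactly as handled in __kmp_aux_capture_affinity()
// above.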
8570 8571 /* ------------------------------------------------------------------------ */ 8572 /* internal fast reduction routines */ 8573 8574 PACKED_REDUCTION_METHOD_T 8575 __kmp_determine_reduction_method( 8576 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8577 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8578 kmp_critical_name *lck) { 8579 8580 // Default reduction method: critical construct ( lck != NULL, like in current 8581 // PAROPT ) 8582 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8583 // can be selected by RTL 8584 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8585 // can be selected by RTL 8586 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8587 // among those generated by PAROPT. 8588 8589 PACKED_REDUCTION_METHOD_T retval; 8590 8591 int team_size; 8592 8593 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8594 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8595 8596 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8597 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8598 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8599 8600 retval = critical_reduce_block; 8601 8602 // another choice of getting a team size (with 1 dynamic dereference) is slower 8603 team_size = __kmp_get_team_num_threads(global_tid); 8604 if (team_size == 1) { 8605 8606 retval = empty_reduce_block; 8607 8608 } else { 8609 8610 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8611 8612 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8613 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8614 8615 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8616 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8617 8618 int teamsize_cutoff = 4; 8619 8620 #if KMP_MIC_SUPPORTED 8621 if (__kmp_mic_type != non_mic) { 8622 teamsize_cutoff = 8; 8623 } 8624 #endif 8625 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8626 if (tree_available) { 8627 if (team_size <= teamsize_cutoff) { 8628 if (atomic_available) { 8629 retval = atomic_reduce_block; 8630 } 8631 } else { 8632 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8633 } 8634 } else if (atomic_available) { 8635 retval = atomic_reduce_block; 8636 } 8637 #else 8638 #error "Unknown or unsupported OS" 8639 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8640 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8641 8642 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8643 8644 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8645 8646 // basic tuning 8647 8648 if (atomic_available) { 8649 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
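// Editor's note (illustrative, not part of the upstream source): num_vars is
// the number of variables in the user's reduction clause, so on these 32-bit
// targets e.g. "reduction(+ : a, b)" (two variables) takes the atomic path
// below, while three or more variables keep the critical-section default.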
8650 retval = atomic_reduce_block; 8651 } 8652 } // otherwise: use critical section 8653 8654 #elif KMP_OS_DARWIN 8655 8656 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8657 if (atomic_available && (num_vars <= 3)) { 8658 retval = atomic_reduce_block; 8659 } else if (tree_available) { 8660 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8661 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8662 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8663 } 8664 } // otherwise: use critical section 8665 8666 #else 8667 #error "Unknown or unsupported OS" 8668 #endif 8669 8670 #else 8671 #error "Unknown or unsupported architecture" 8672 #endif 8673 } 8674 8675 // KMP_FORCE_REDUCTION 8676 8677 // If the team is serialized (team_size == 1), ignore the forced reduction 8678 // method and stay with the unsynchronized method (empty_reduce_block) 8679 if (__kmp_force_reduction_method != reduction_method_not_defined && 8680 team_size != 1) { 8681 8682 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8683 8684 int atomic_available, tree_available; 8685 8686 switch ((forced_retval = __kmp_force_reduction_method)) { 8687 case critical_reduce_block: 8688 KMP_ASSERT(lck); // lck should be != 0 8689 break; 8690 8691 case atomic_reduce_block: 8692 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8693 if (!atomic_available) { 8694 KMP_WARNING(RedMethodNotSupported, "atomic"); 8695 forced_retval = critical_reduce_block; 8696 } 8697 break; 8698 8699 case tree_reduce_block: 8700 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8701 if (!tree_available) { 8702 KMP_WARNING(RedMethodNotSupported, "tree"); 8703 forced_retval = critical_reduce_block; 8704 } else { 8705 #if KMP_FAST_REDUCTION_BARRIER 8706 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8707 #endif 8708 } 8709 break; 8710 8711 default: 8712 KMP_ASSERT(0); // "unsupported method specified" 8713 } 8714 8715 retval = forced_retval; 8716 } 8717 8718 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8719 8720 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8721 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8722 8723 return (retval); 8724 } 8725 // this function is for testing set/get/determine reduce method 8726 kmp_int32 __kmp_get_reduce_method(void) { 8727 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8728 } 8729 8730 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8731 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8732 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8733 8734 // Hard pause shuts down the runtime completely. Resume happens naturally when 8735 // OpenMP is used subsequently. 8736 void __kmp_hard_pause() { 8737 __kmp_pause_status = kmp_hard_paused; 8738 __kmp_internal_end_thread(-1); 8739 } 8740 8741 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
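// Editor's note (illustrative, not part of the upstream source): the soft/hard
// pause entry points above, together with __kmp_pause_resource() further
// below, back the OpenMP 5.0 omp_pause_resource()/omp_pause_resource_all()
// API. A minimal user-level sketch:
//
//   #include <omp.h>
//
//   int main(void) {
//   #pragma omp parallel
//     { }                                      // warm up the thread pool
//     omp_pause_resource_all(omp_pause_soft);  // workers ignore blocktime, sleep
//   #pragma omp parallel
//     { }                                      // runtime resumes automatically
//     return 0;
//   }
//
// __kmp_resume_if_soft_paused() below is what undoes the soft pause when the
// next parallel region is encountered.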
8742 void __kmp_resume_if_soft_paused() { 8743 if (__kmp_pause_status == kmp_soft_paused) { 8744 __kmp_pause_status = kmp_not_paused; 8745 8746 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8747 kmp_info_t *thread = __kmp_threads[gtid]; 8748 if (thread) { // Wake it if sleeping 8749 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8750 thread); 8751 if (fl.is_sleeping()) 8752 fl.resume(gtid); 8753 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8754 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8755 } else { // thread holds the lock and may sleep soon 8756 do { // until either the thread sleeps, or we can get the lock 8757 if (fl.is_sleeping()) { 8758 fl.resume(gtid); 8759 break; 8760 } else if (__kmp_try_suspend_mx(thread)) { 8761 __kmp_unlock_suspend_mx(thread); 8762 break; 8763 } 8764 } while (1); 8765 } 8766 } 8767 } 8768 } 8769 } 8770 8771 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8772 // TODO: add warning messages 8773 int __kmp_pause_resource(kmp_pause_status_t level) { 8774 if (level == kmp_not_paused) { // requesting resume 8775 if (__kmp_pause_status == kmp_not_paused) { 8776 // error message about runtime not being paused, so can't resume 8777 return 1; 8778 } else { 8779 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8780 __kmp_pause_status == kmp_hard_paused); 8781 __kmp_pause_status = kmp_not_paused; 8782 return 0; 8783 } 8784 } else if (level == kmp_soft_paused) { // requesting soft pause 8785 if (__kmp_pause_status != kmp_not_paused) { 8786 // error message about already being paused 8787 return 1; 8788 } else { 8789 __kmp_soft_pause(); 8790 return 0; 8791 } 8792 } else if (level == kmp_hard_paused) { // requesting hard pause 8793 if (__kmp_pause_status != kmp_not_paused) { 8794 // error message about already being paused 8795 return 1; 8796 } else { 8797 __kmp_hard_pause(); 8798 return 0; 8799 } 8800 } else { 8801 // error message about invalid level 8802 return 1; 8803 } 8804 } 8805 8806 void __kmp_omp_display_env(int verbose) { 8807 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8808 if (__kmp_init_serial == 0) 8809 __kmp_do_serial_initialize(); 8810 __kmp_display_env_impl(!verbose, verbose); 8811 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8812 } 8813 8814 // The team size is changing, so distributed barrier must be modified 8815 void __kmp_resize_dist_barrier(kmp_team_t *team, int old_nthreads, 8816 int new_nthreads) { 8817 KMP_DEBUG_ASSERT(__kmp_barrier_release_pattern[bs_forkjoin_barrier] == 8818 bp_dist_bar); 8819 kmp_info_t **other_threads = team->t.t_threads; 8820 8821 // We want all the workers to stop waiting on the barrier while we adjust the 8822 // size of the team. 
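// Editor's note (derived from this routine and __kmp_add_threads_to_team; not
// part of the upstream source): th_used_in_team acts as a small per-thread
// state machine while the distributed barrier is resized:
//   0 - thread is not participating in the team
//   1 - thread is active in the team
//   2 - thread has been told to leave the team (set in the loop below); the
//       worker moves itself from 2 to 0
//   3 - thread has been told to rejoin the team (set in
//       __kmp_add_threads_to_team); the worker moves itself from 3 to 1
// The loop below pushes every active worker to state 2; a later loop waits
// for them all to reach 0 before the barrier size is updated.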
8823 for (int f = 1; f < old_nthreads; ++f) { 8824 KMP_DEBUG_ASSERT(other_threads[f] != NULL); 8825 // Ignore threads that are already inactive or not present in the team 8826 if (team->t.t_threads[f]->th.th_used_in_team.load() == 0) { 8827 // teams construct causes thread_limit to get passed in, and some of 8828 // those could be inactive; just ignore them 8829 continue; 8830 } 8831 // If thread is transitioning still to in_use state, wait for it 8832 if (team->t.t_threads[f]->th.th_used_in_team.load() == 3) { 8833 while (team->t.t_threads[f]->th.th_used_in_team.load() == 3) 8834 KMP_CPU_PAUSE(); 8835 } 8836 // The thread should be in_use now 8837 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 1); 8838 // Transition to unused state 8839 team->t.t_threads[f]->th.th_used_in_team.store(2); 8840 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 2); 8841 } 8842 // Release all the workers 8843 kmp_uint64 new_value; // new value for go 8844 new_value = team->t.b->go_release(); 8845 8846 KMP_MFENCE(); 8847 8848 // Workers should see transition status 2 and move to 0; but may need to be 8849 // woken up first 8850 size_t my_go_index; 8851 int count = old_nthreads - 1; 8852 while (count > 0) { 8853 count = old_nthreads - 1; 8854 for (int f = 1; f < old_nthreads; ++f) { 8855 my_go_index = f / team->t.b->threads_per_go; 8856 if (other_threads[f]->th.th_used_in_team.load() != 0) { 8857 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up the workers 8858 kmp_atomic_flag_64<> *flag = (kmp_atomic_flag_64<> *)CCAST( 8859 void *, other_threads[f]->th.th_sleep_loc); 8860 __kmp_atomic_resume_64(other_threads[f]->th.th_info.ds.ds_gtid, flag); 8861 } 8862 } else { 8863 KMP_DEBUG_ASSERT(team->t.t_threads[f]->th.th_used_in_team.load() == 0); 8864 count--; 8865 } 8866 } 8867 } 8868 // Now update the barrier size 8869 team->t.b->update_num_threads(new_nthreads); 8870 team->t.b->go_reset(); 8871 } 8872 8873 void __kmp_add_threads_to_team(kmp_team_t *team, int new_nthreads) { 8874 // Add the threads back to the team 8875 KMP_DEBUG_ASSERT(team); 8876 // Threads were paused and pointed at th_used_in_team temporarily during a 8877 // resize of the team. We're going to set th_used_in_team to 3 to indicate to 8878 // the thread that it should transition itself back into the team. Then, if 8879 // blocktime isn't infinite, the thread could be sleeping, so we send a resume 8880 // to wake it up. 8881 for (int f = 1; f < new_nthreads; ++f) { 8882 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 8883 KMP_COMPARE_AND_STORE_ACQ32(&(team->t.t_threads[f]->th.th_used_in_team), 0, 8884 3); 8885 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { // Wake up sleeping threads 8886 __kmp_resume_32(team->t.t_threads[f]->th.th_info.ds.ds_gtid, 8887 (kmp_flag_32<false, false> *)NULL); 8888 } 8889 } 8890 // The threads should be transitioning to the team; when they are done, they 8891 // should have set th_used_in_team to 1. This loop forces master to wait until 8892 // all threads have moved into the team and are waiting in the barrier. 
8893 int count = new_nthreads - 1; 8894 while (count > 0) { 8895 count = new_nthreads - 1; 8896 for (int f = 1; f < new_nthreads; ++f) { 8897 if (team->t.t_threads[f]->th.th_used_in_team.load() == 1) { 8898 count--; 8899 } 8900 } 8901 } 8902 } 8903 8904 // Globals and functions for hidden helper task 8905 kmp_info_t **__kmp_hidden_helper_threads; 8906 kmp_info_t *__kmp_hidden_helper_main_thread; 8907 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 8908 #if KMP_OS_LINUX 8909 kmp_int32 __kmp_hidden_helper_threads_num = 8; 8910 kmp_int32 __kmp_enable_hidden_helper = TRUE; 8911 #else 8912 kmp_int32 __kmp_hidden_helper_threads_num = 0; 8913 kmp_int32 __kmp_enable_hidden_helper = FALSE; 8914 #endif 8915 8916 namespace { 8917 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 8918 8919 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 8920 // This is an explicit synchronization of all hidden helper threads; it covers 8921 // the case where a regular thread pushes a hidden helper task to a hidden 8922 // helper thread that has not been awakened even once since the helpers were 8923 // released by the main thread after creating the team. 8924 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 8925 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 8926 __kmp_hidden_helper_threads_num) 8927 ; 8928 8929 // If main thread, then wait for signal 8930 if (__kmpc_master(nullptr, *gtid)) { 8931 // First, unset the initial state and release the initial thread 8932 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 8933 __kmp_hidden_helper_initz_release(); 8934 __kmp_hidden_helper_main_thread_wait(); 8935 // Now wake up all worker threads 8936 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 8937 __kmp_hidden_helper_worker_thread_signal(); 8938 } 8939 } 8940 } 8941 } // namespace 8942 8943 void __kmp_hidden_helper_threads_initz_routine() { 8944 // Create a new root for hidden helper team/threads 8945 const int gtid = __kmp_register_root(TRUE); 8946 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 8947 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 8948 __kmp_hidden_helper_main_thread->th.th_set_nproc = 8949 __kmp_hidden_helper_threads_num; 8950 8951 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 8952 8953 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 8954 8955 // Set the initialization flag to FALSE 8956 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 8957 8958 __kmp_hidden_helper_threads_deinitz_release(); 8959 } 8960 8961 /* Nesting Mode: 8962 Set via KMP_NESTING_MODE, which takes an integer. 8963 Note: we skip duplicate topology levels, and skip levels with only 8964 one entity. 8965 KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode. 8966 KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels 8967 in the topology, and initializes the number of threads at each of those 8968 levels to the number of entities at each level, respectively, below the 8969 entity at the parent level. 8970 KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels, 8971 but starts with nesting OFF -- max-active-levels-var is 1 -- and requires 8972 the user to turn nesting on explicitly. This is an even more experimental 8973 option to this experimental feature, and may change or go away in the 8974 future.
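   Worked example (editor's illustration, hypothetical topology): on a machine
   detected as 2 sockets x 8 cores x 2 hardware threads, KMP_NESTING_MODE=1
   records three nesting levels with 2, 8, and 2 threads respectively and
   turns nesting on by raising max-active-levels to 3, so an outermost
   parallel region defaults to 2 threads, a region nested inside it to 8, and
   a further nested region to 2. With KMP_NESTING_MODE=2 on the same machine,
   only the first two levels (2 and 8) are set up, and nesting stays off until
   the user raises max-active-levels explicitly (e.g., via
   omp_set_max_active_levels() or OMP_MAX_ACTIVE_LEVELS).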
8975 */ 8976 8977 // Allocate space to store nesting levels 8978 void __kmp_init_nesting_mode() { 8979 int levels = KMP_HW_LAST; 8980 __kmp_nesting_mode_nlevels = levels; 8981 __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int)); 8982 for (int i = 0; i < levels; ++i) 8983 __kmp_nesting_nth_level[i] = 0; 8984 if (__kmp_nested_nth.size < levels) { 8985 __kmp_nested_nth.nth = 8986 (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int)); 8987 __kmp_nested_nth.size = levels; 8988 } 8989 } 8990 8991 // Set # threads for top levels of nesting; must be called after topology set 8992 void __kmp_set_nesting_mode_threads() { 8993 kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()]; 8994 8995 if (__kmp_nesting_mode == 1) 8996 __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 8997 else if (__kmp_nesting_mode > 1) 8998 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 8999 9000 if (__kmp_topology) { // use topology info 9001 int loc, hw_level; 9002 for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() && 9003 loc < __kmp_nesting_mode_nlevels; 9004 loc++, hw_level++) { 9005 __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level); 9006 if (__kmp_nesting_nth_level[loc] == 1) 9007 loc--; 9008 } 9009 // Make sure all cores are used 9010 if (__kmp_nesting_mode > 1 && loc > 1) { 9011 int core_level = __kmp_topology->get_level(KMP_HW_CORE); 9012 int num_cores = __kmp_topology->get_count(core_level); 9013 int upper_levels = 1; 9014 for (int level = 0; level < loc - 1; ++level) 9015 upper_levels *= __kmp_nesting_nth_level[level]; 9016 if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores) 9017 __kmp_nesting_nth_level[loc - 1] = 9018 num_cores / __kmp_nesting_nth_level[loc - 2]; 9019 } 9020 __kmp_nesting_mode_nlevels = loc; 9021 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9022 } else { // no topology info available; provide a reasonable guesstimation 9023 if (__kmp_avail_proc >= 4) { 9024 __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2; 9025 __kmp_nesting_nth_level[1] = 2; 9026 __kmp_nesting_mode_nlevels = 2; 9027 } else { 9028 __kmp_nesting_nth_level[0] = __kmp_avail_proc; 9029 __kmp_nesting_mode_nlevels = 1; 9030 } 9031 __kmp_nested_nth.used = __kmp_nesting_mode_nlevels; 9032 } 9033 for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) { 9034 __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i]; 9035 } 9036 set__nproc(thread, __kmp_nesting_nth_level[0]); 9037 if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode) 9038 __kmp_nesting_mode_nlevels = __kmp_nesting_mode; 9039 if (get__max_active_levels(thread) > 1) { 9040 // if max levels was set, set nesting mode levels to same 9041 __kmp_nesting_mode_nlevels = get__max_active_levels(thread); 9042 } 9043 if (__kmp_nesting_mode == 1) // turn on nesting for this case only 9044 set__max_active_levels(thread, __kmp_nesting_mode_nlevels); 9045 } 9046