/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif
#if OMPD_SUPPORT
#include "ompd-specific.h"
#endif

#if OMP_PROFILING_SUPPORT
#include "llvm/Support/TimeProfiler.h"
static char *ProfileTraceFile = nullptr;
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if KMP_OS_WINDOWS
// windows does not need include files as it doesn't use shared memory
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#define SHM_SIZE 1024
#endif

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
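/* Summary of the lookup strategy implemented below (see the __kmp_gtid_mode
   checks): mode >= 3 reads the gtid from the __kmp_gtid thread-local variable,
   mode >= 2 queries the keyed-TLS slot via __kmp_gtid_get_specific(), and
   lower modes fall back to scanning the registered threads' recorded stack
   bounds for the one that contains a local address. */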
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
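  /* The scan above missed us (e.g. an uber thread whose recorded stack window
     is stale), so fall back to keyed TLS and, if that yields a gtid, widen the
     stored stack bounds below so the next lookup succeeds via the fast path. */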
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */

  /* if we haven't been assigned a gtid, then return the code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
"initial" : "actual"); 291 } 292 } 293 294 /* No point in checking ubermaster threads since they use refinement and 295 * cannot overlap */ 296 gtid = __kmp_gtid_from_thread(th); 297 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 298 KA_TRACE(10, 299 ("__kmp_check_stack_overlap: performing extensive checking\n")); 300 if (stack_beg == NULL) { 301 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 302 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 303 } 304 305 for (f = 0; f < __kmp_threads_capacity; f++) { 306 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 307 308 if (f_th && f_th != th) { 309 char *other_stack_end = 310 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 311 char *other_stack_beg = 312 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 313 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 314 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 315 316 /* Print the other stack values before the abort */ 317 if (__kmp_storage_map) 318 __kmp_print_storage_map_gtid( 319 -1, other_stack_beg, other_stack_end, 320 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 321 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 322 323 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 324 __kmp_msg_null); 325 } 326 } 327 } 328 } 329 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 330 } 331 332 /* ------------------------------------------------------------------------ */ 333 334 void __kmp_infinite_loop(void) { 335 static int done = FALSE; 336 337 while (!done) { 338 KMP_YIELD(TRUE); 339 } 340 } 341 342 #define MAX_MESSAGE 512 343 344 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 345 char const *format, ...) { 346 char buffer[MAX_MESSAGE]; 347 va_list ap; 348 349 va_start(ap, format); 350 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 351 p2, (unsigned long)size, format); 352 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 353 __kmp_vprintf(kmp_err, buffer, ap); 354 #if KMP_PRINT_DATA_PLACEMENT 355 int node; 356 if (gtid >= 0) { 357 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 358 if (__kmp_storage_map_verbose) { 359 node = __kmp_get_host_node(p1); 360 if (node < 0) /* doesn't work, so don't try this next time */ 361 __kmp_storage_map_verbose = FALSE; 362 else { 363 char *last; 364 int lastNode; 365 int localProc = __kmp_get_cpu_from_gtid(gtid); 366 367 const int page_size = KMP_GET_PAGE_SIZE(); 368 369 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 370 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 371 if (localProc >= 0) 372 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 373 localProc >> 1); 374 else 375 __kmp_printf_no_lock(" GTID %d\n", gtid); 376 #if KMP_USE_PRCTL 377 /* The more elaborate format is disabled for now because of the prctl 378 * hanging bug. */ 379 do { 380 last = p1; 381 lastNode = node; 382 /* This loop collates adjacent pages with the same host node. 
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock("  %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock("  %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock("  %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock("  %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS by default abort() causes a pop-up error box, which stalls
       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for the DLL, but it is a problem
       for the static OpenMP RTL). SetErrorMode (and so, timelimit utility) does
       not help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid the pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    __kmp_unregister_library();
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);
}

static void __kmp_init_allocator() {
  __kmp_init_memkind();
  __kmp_init_target_mem();
}
static void __kmp_fini_allocator() { __kmp_fini_memkind(); }

/* ------------------------------------------------------------------------ */

#if KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    // According to Windows* documentation for DllMain entry point:
    // for DLL_PROCESS_DETACH, lpReserved is used for telling the difference:
    //   lpReserved == NULL when FreeLibrary() is called,
    //   lpReserved != NULL when the process is terminated.
    // When FreeLibrary() is called, worker threads remain alive. So the
    // runtime's state is consistent and executing proper shutdown is OK.
    // When the process is terminated, worker threads have exited or been
    // forcefully terminated by the OS and only the shutdown thread remains.
    // This can leave the runtime in an inconsistent state.
    // Hence, only attempt proper cleanup when FreeLibrary() is called.
    // Otherwise, rely on OS to reclaim resources.
    if (lpReserved == NULL)
      __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ,
             NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level == 1) {
      // Only report metadata by primary thread of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
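    // In other words: processors not already occupied by the runtime's
    // threads, plus the slots this root already owns (just the root itself
    // when it is active, otherwise its hot team), since those are reused.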
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  // If it is not for initializing the hidden helper team, we need to take
  // __kmp_hidden_helper_threads_num out of the capacity because it is included
  // in __kmp_threads_capacity.
  if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) {
    capacity -= __kmp_hidden_helper_threads_num;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier within critical section forkjoin */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the primary thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

  /* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the primary thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
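// KMP_CHECK_UPDATE performs the store only when the new value differs
// (roughly: if (dst != value) dst = value;), which is what keeps the cache
// line clean when nothing has changed.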
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get primary thread's values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // during the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single primary thread. */
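/* In outline, the code below either reuses this thread's cached th_serial_team
   or allocates a fresh one when the cached team is already serialized, bumps
   t_level/t_serialized for nested serialized regions, and pushes a dispatch
   buffer so worksharing constructs inside the serialized region still have
   private dispatch state. */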
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data = ompt_data_none;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size,
          ompt_parallel_invoker_program | ompt_parallel_team, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team =
          __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                              ompt_parallel_data,
#endif
                              proc_bind, &this_thr->th.th_current_task->td_icvs,
                              0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;

      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }

    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;
    serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);

  // Perform the display affinity functionality for
  // serialized parallel regions
  if (__kmp_display_affinity) {
    if (this_thr->th.th_prev_level != serial_team->t.t_level ||
        this_thr->th.th_prev_num_threads != 1) {
      // NULL means use the affinity-format-var ICV
      __kmp_aux_display_affinity(global_tid, NULL);
      this_thr->th.th_prev_level = serial_team->t.t_level;
      this_thr->th.th_prev_num_threads = 1;
    }
  }

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != ompt_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. Content was swapped.

    /* OMPT implicit task begin */
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();
    __kmp_resume_if_soft_paused();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data = ompt_data_none;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Assign affinity to root thread if it hasn't happened yet
    __kmp_assign_root_init_mask();

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
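    // The hot-teams bookkeeping below lazily allocates one slot per allowed
    // nesting level (KMP_HOT_TEAMS_MAX_LEVEL); slot 0 records the root's
    // existing hot team so it can be reused instead of re-forked.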
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        int flags = OMPT_INVOKER(call_context) |
                    ((microtask == (microtask_t)__kmp_teams_master)
                         ? ompt_parallel_league
                         : ompt_parallel_team);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags,
            return_address);
      }
      master_th->th.ompt_thread_info.state = ompt_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is start of parallel that is nested inside teams construct.
      // The team is actual (hot), all workers are ready at the fork barrier.
      // No lock needed to initialize the team a bit, then free workers.
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
        *argv++ = va_arg(kmp_va_deref(ap), void *);
      // Increment our nested depth level, but do not increase the serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);

        if (call_context == fork_context_gnu) {
          // AC: need to decrement t_serialized for enquiry functions to work
          // correctly, will restore at join time
          parent_team->t.t_serialized--;
          return TRUE;
        }

#if OMPD_SUPPORT
        parent_team->t.t_pkfn = microtask;
#endif

#if OMPT_SUPPORT
        void *dummy;
        void **exit_frame_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. Content was swapped.

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            OMPT_CUR_TASK_INFO(master_th)->thread_num =
                __kmp_tid_from_gtid(gtid);
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = ompt_state_work_parallel;
        } else {
          exit_frame_p = &dummy;
        }
#endif
        // AC: need to decrement t_serialized for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_frame_p
#endif
          );
        }

#if OMPT_SUPPORT
        if (ompt_enabled.enabled) {
          *exit_frame_p = NULL;
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit);
          }
          ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th);
          __ompt_lw_taskteam_unlink(master_th);
          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context) | ompt_parallel_team,
                return_address);
          }
          master_th->th.ompt_thread_info.state = ompt_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;
      parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save

#if OMPT_SUPPORT
      if (ompt_enabled.enabled) {
        ompt_lw_taskteam_t lw_taskteam;
        __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                &ompt_parallel_data, return_address);
        __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true);
      }
#endif

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: only can reduce number of threads dynamically, can't increase
          kmp_info_t **other_threads = parent_team->t.t_threads;
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
            other_threads[i]->th.th_team_nproc = master_set_numthreads;
          }
          // Keep extra threads hot in the team for possible next parallels
        }
        master_th->th.th_set_nproc = 0;
      }

#if USE_DEBUGGER
      if (__kmp_debugging) { // Let debugger override number of threads.
        int nth = __kmp_omp_num_threads(loc);
        if (nth > 0) { // 0 means debugger doesn't want to change num threads
          master_set_numthreads = nth;
        }
      }
#endif

#if USE_ITT_BUILD && USE_ITT_NOTIFY
      if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) ||
           KMP_ITT_DEBUG) &&
          __kmp_forkjoin_frames_mode == 3 &&
          parent_team->t.t_active_level == 1 // only report frames at level 1
          && master_th->th.th_teams_size.nteams == 1) {
        kmp_uint64 tmp_time = __itt_get_timestamp();
        master_th->th.th_frame_time = tmp_time;
        parent_team->t.t_region_time = tmp_time;
      }
      if (__itt_stack_caller_create_ptr) {
        KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL);
        // create new stack stitching id before entering fork barrier
        parent_team->t.t_stack_id = __kmp_itt_stack_caller_create();
      }
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */

      KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));
      __kmp_internal_fork(loc, gtid, parent_team);
      KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, "
                    "master_th=%p, gtid=%d\n",
                    root, parent_team, master_th, gtid));

      if (call_context == fork_context_gnu)
        return TRUE;

      /* Invoke microtask for PRIMARY thread */
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));

      if (!parent_team->t.t_invoke(gtid)) {
        KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread");
      }
      KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid,
                    parent_team->t.t_id, parent_team->t.t_pkfn));
      KMP_MB(); /* Flush all pending memory write invalidates. */

      KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid));

      return TRUE;
    } // Parallel closely nested in teams construct

#if KMP_DEBUG
    if (__kmp_tasking_mode != tskm_immediate_exec) {
      KMP_DEBUG_ASSERT(master_th->th.th_task_team ==
                       parent_team->t.t_task_team[master_th->th.th_task_state]);
    }
#endif

    int enter_teams = 0;
    if (parent_team->t.t_active_level >=
        master_th->th.th_current_task->td_icvs.max_active_levels) {
      nthreads = 1;
    } else {
      enter_teams = ((ap == NULL && active_level == 0) ||
                     (ap && teams_level > 0 && teams_level == level));
      nthreads =
          master_set_numthreads
              ? master_set_numthreads
              : get__nproc_2(
                    parent_team,
                    master_tid); // TODO: get nproc directly from current task

      // Check if we need to take the forkjoin lock (no need for a serialized
      // parallel outside of a teams construct). This code moved here from
      // __kmp_reserve_threads() to speed up nested serialized parallels.
      if (nthreads > 1) {
        if ((get__max_active_levels(master_th) == 1 &&
             (root->r.r_in_parallel && !enter_teams)) ||
            (__kmp_library == library_serial)) {
          KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d"
                        " threads\n",
                        gtid, nthreads));
          nthreads = 1;
        }
      }
      if (nthreads > 1) {
        /* determine how many new threads we can use */
        __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
        /* AC: If we execute teams from a parallel region (on host), then teams
           should be created but each can only have 1 thread if nesting is
           disabled. If teams is called from a serial region, then teams and
           their threads should be created regardless of the nesting setting. */
If teams called from serial region, then teams and their 1673 threads should be created regardless of the nesting setting. */ 1674 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1675 nthreads, enter_teams); 1676 if (nthreads == 1) { 1677 // Free lock for single thread execution here; for multi-thread 1678 // execution it will be freed later after team of threads created 1679 // and initialized 1680 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1681 } 1682 } 1683 } 1684 KMP_DEBUG_ASSERT(nthreads > 0); 1685 1686 // If we temporarily changed the set number of threads then restore it now 1687 master_th->th.th_set_nproc = 0; 1688 1689 /* create a serialized parallel region? */ 1690 if (nthreads == 1) { 1691 /* josh todo: hypothetical question: what do we do for OS X*? */ 1692 #if KMP_OS_LINUX && \ 1693 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1694 void *args[argc]; 1695 #else 1696 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1697 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1698 KMP_ARCH_AARCH64) */ 1699 1700 KA_TRACE(20, 1701 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1702 1703 __kmpc_serialized_parallel(loc, gtid); 1704 1705 #if OMPD_SUPPORT 1706 master_th->th.th_serial_team->t.t_pkfn = microtask; 1707 #endif 1708 1709 if (call_context == fork_context_intel) { 1710 /* TODO this sucks, use the compiler itself to pass args! :) */ 1711 master_th->th.th_serial_team->t.t_ident = loc; 1712 if (!ap) { 1713 // revert change made in __kmpc_serialized_parallel() 1714 master_th->th.th_serial_team->t.t_level--; 1715 // Get args from parent team for teams construct 1716 1717 #if OMPT_SUPPORT 1718 void *dummy; 1719 void **exit_frame_p; 1720 ompt_task_info_t *task_info; 1721 1722 ompt_lw_taskteam_t lw_taskteam; 1723 1724 if (ompt_enabled.enabled) { 1725 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1726 &ompt_parallel_data, return_address); 1727 1728 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1729 // don't use lw_taskteam after linking. 
content was swaped 1730 1731 task_info = OMPT_CUR_TASK_INFO(master_th); 1732 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1733 if (ompt_enabled.ompt_callback_implicit_task) { 1734 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1735 __kmp_tid_from_gtid(gtid); 1736 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1737 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1738 &(task_info->task_data), 1, 1739 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1740 ompt_task_implicit); 1741 } 1742 1743 /* OMPT state */ 1744 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1745 } else { 1746 exit_frame_p = &dummy; 1747 } 1748 #endif 1749 1750 { 1751 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1752 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1753 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1754 parent_team->t.t_argv 1755 #if OMPT_SUPPORT 1756 , 1757 exit_frame_p 1758 #endif 1759 ); 1760 } 1761 1762 #if OMPT_SUPPORT 1763 if (ompt_enabled.enabled) { 1764 *exit_frame_p = NULL; 1765 if (ompt_enabled.ompt_callback_implicit_task) { 1766 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1767 ompt_scope_end, NULL, &(task_info->task_data), 1, 1768 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1769 ompt_task_implicit); 1770 } 1771 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1772 __ompt_lw_taskteam_unlink(master_th); 1773 if (ompt_enabled.ompt_callback_parallel_end) { 1774 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1775 &ompt_parallel_data, parent_task_data, 1776 OMPT_INVOKER(call_context) | ompt_parallel_team, 1777 return_address); 1778 } 1779 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1780 } 1781 #endif 1782 } else if (microtask == (microtask_t)__kmp_teams_master) { 1783 KMP_DEBUG_ASSERT(master_th->th.th_team == 1784 master_th->th.th_serial_team); 1785 team = master_th->th.th_team; 1786 // team->t.t_pkfn = microtask; 1787 team->t.t_invoke = invoker; 1788 __kmp_alloc_argv_entries(argc, team, TRUE); 1789 team->t.t_argc = argc; 1790 argv = (void **)team->t.t_argv; 1791 if (ap) { 1792 for (i = argc - 1; i >= 0; --i) 1793 *argv++ = va_arg(kmp_va_deref(ap), void *); 1794 } else { 1795 for (i = 0; i < argc; ++i) 1796 // Get args from parent team for teams construct 1797 argv[i] = parent_team->t.t_argv[i]; 1798 } 1799 // AC: revert change made in __kmpc_serialized_parallel() 1800 // because initial code in teams should have level=0 1801 team->t.t_level--; 1802 // AC: call special invoker for outer "parallel" of teams construct 1803 invoker(gtid); 1804 #if OMPT_SUPPORT 1805 if (ompt_enabled.enabled) { 1806 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1807 if (ompt_enabled.ompt_callback_implicit_task) { 1808 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1809 ompt_scope_end, NULL, &(task_info->task_data), 0, 1810 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1811 } 1812 if (ompt_enabled.ompt_callback_parallel_end) { 1813 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1814 &ompt_parallel_data, parent_task_data, 1815 OMPT_INVOKER(call_context) | ompt_parallel_league, 1816 return_address); 1817 } 1818 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1819 } 1820 #endif 1821 } else { 1822 argv = args; 1823 for (i = argc - 1; i >= 0; --i) 1824 *argv++ = va_arg(kmp_va_deref(ap), void *); 1825 KMP_MB(); 1826 1827 #if OMPT_SUPPORT 1828 void *dummy; 1829 void **exit_frame_p; 1830 ompt_task_info_t *task_info; 1831 1832 ompt_lw_taskteam_t lw_taskteam; 1833 1834 if (ompt_enabled.enabled) { 1835 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1836 &ompt_parallel_data, return_address); 1837 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1838 // don't use lw_taskteam after linking. content was swaped 1839 task_info = OMPT_CUR_TASK_INFO(master_th); 1840 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1841 1842 /* OMPT implicit task begin */ 1843 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1844 if (ompt_enabled.ompt_callback_implicit_task) { 1845 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1846 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1847 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1848 ompt_task_implicit); 1849 OMPT_CUR_TASK_INFO(master_th)->thread_num = 1850 __kmp_tid_from_gtid(gtid); 1851 } 1852 1853 /* OMPT state */ 1854 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1855 } else { 1856 exit_frame_p = &dummy; 1857 } 1858 #endif 1859 1860 { 1861 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1862 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1863 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1864 #if OMPT_SUPPORT 1865 , 1866 exit_frame_p 1867 #endif 1868 ); 1869 } 1870 1871 #if OMPT_SUPPORT 1872 if (ompt_enabled.enabled) { 1873 *exit_frame_p = NULL; 1874 if (ompt_enabled.ompt_callback_implicit_task) { 1875 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1876 ompt_scope_end, NULL, &(task_info->task_data), 1, 1877 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1878 ompt_task_implicit); 1879 } 1880 1881 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1882 __ompt_lw_taskteam_unlink(master_th); 1883 if (ompt_enabled.ompt_callback_parallel_end) { 1884 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1885 &ompt_parallel_data, parent_task_data, 1886 OMPT_INVOKER(call_context) | ompt_parallel_team, 1887 return_address); 1888 } 1889 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1890 } 1891 #endif 1892 } 1893 } else if (call_context == fork_context_gnu) { 1894 #if OMPT_SUPPORT 1895 ompt_lw_taskteam_t lwt; 1896 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1897 return_address); 1898 1899 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1900 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1901 // don't use lw_taskteam after linking. 
content was swapped 1902 #endif 1903 1904 // we were called from GNU native code 1905 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1906 return FALSE; 1907 } else { 1908 KMP_ASSERT2(call_context < fork_context_last, 1909 "__kmp_fork_call: unknown fork_context parameter"); 1910 } 1911 1912 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1913 KMP_MB(); 1914 return FALSE; 1915 } // if (nthreads == 1) 1916 1917 // GEH: only modify the executing flag in the case when not serialized 1918 // serialized case is handled in kmpc_serialized_parallel 1919 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1920 "curtask=%p, curtask_max_aclevel=%d\n", 1921 parent_team->t.t_active_level, master_th, 1922 master_th->th.th_current_task, 1923 master_th->th.th_current_task->td_icvs.max_active_levels)); 1924 // TODO: GEH - cannot do this assertion because root thread not set up as 1925 // executing 1926 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1927 master_th->th.th_current_task->td_flags.executing = 0; 1928 1929 if (!master_th->th.th_teams_microtask || level > teams_level) { 1930 /* Increment our nested depth level */ 1931 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1932 } 1933 1934 // See if we need to make a copy of the ICVs. 1935 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1936 if ((level + 1 < __kmp_nested_nth.used) && 1937 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1938 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1939 } else { 1940 nthreads_icv = 0; // don't update 1941 } 1942 1943 // Figure out the proc_bind_policy for the new team. 1944 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1945 kmp_proc_bind_t proc_bind_icv = 1946 proc_bind_default; // proc_bind_default means don't update 1947 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1948 proc_bind = proc_bind_false; 1949 } else { 1950 if (proc_bind == proc_bind_default) { 1951 // No proc_bind clause specified; use current proc-bind-var for this 1952 // parallel region 1953 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1954 } 1955 /* else: The proc_bind policy was specified explicitly on parallel clause. 1956 This overrides proc-bind-var for this parallel region, but does not 1957 change proc-bind-var. */ 1958 // Figure the value of proc-bind-var for the child threads.
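// A user-visible view of the per-level lookups in __kmp_nested_nth (above)
// and __kmp_nested_proc_bind (just below): a hedged, standalone sketch, not
// compiled into the runtime. The environment values shown are only an example;
// the first list item applies to the outermost region, the second to regions
// nested one level deeper.
#if 0
// Build as a normal OpenMP program and run with per-level lists, e.g.
//   OMP_NUM_THREADS="4,2" OMP_PROC_BIND="spread,close" ./a.out
#include <cstdio>
#include <omp.h>

int main() {
  omp_set_max_active_levels(2); // allow one level of nested parallelism
#pragma omp parallel            // level 1: defaults from list element 0
  {
#pragma omp parallel            // level 2: defaults from list element 1
    {
      if (omp_get_thread_num() == 0)
        std::printf("level %d team size %d\n", omp_get_level(),
                    omp_get_num_threads());
    }
  }
  return 0;
}
#endif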
1959 if ((level + 1 < __kmp_nested_proc_bind.used) && 1960 (__kmp_nested_proc_bind.bind_types[level + 1] != 1961 master_th->th.th_current_task->td_icvs.proc_bind)) { 1962 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1963 } 1964 } 1965 1966 // Reset for next parallel region 1967 master_th->th.th_set_proc_bind = proc_bind_default; 1968 1969 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1970 kmp_internal_control_t new_icvs; 1971 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1972 new_icvs.next = NULL; 1973 if (nthreads_icv > 0) { 1974 new_icvs.nproc = nthreads_icv; 1975 } 1976 if (proc_bind_icv != proc_bind_default) { 1977 new_icvs.proc_bind = proc_bind_icv; 1978 } 1979 1980 /* allocate a new parallel team */ 1981 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1982 team = __kmp_allocate_team(root, nthreads, nthreads, 1983 #if OMPT_SUPPORT 1984 ompt_parallel_data, 1985 #endif 1986 proc_bind, &new_icvs, 1987 argc USE_NESTED_HOT_ARG(master_th)); 1988 } else { 1989 /* allocate a new parallel team */ 1990 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 1991 team = __kmp_allocate_team(root, nthreads, nthreads, 1992 #if OMPT_SUPPORT 1993 ompt_parallel_data, 1994 #endif 1995 proc_bind, 1996 &master_th->th.th_current_task->td_icvs, 1997 argc USE_NESTED_HOT_ARG(master_th)); 1998 } 1999 KF_TRACE( 2000 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2001 2002 /* setup the new team */ 2003 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2004 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2005 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2006 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2007 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2008 #if OMPT_SUPPORT 2009 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2010 return_address); 2011 #endif 2012 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2013 // TODO: parent_team->t.t_level == INT_MAX ??? 2014 if (!master_th->th.th_teams_microtask || level > teams_level) { 2015 int new_level = parent_team->t.t_level + 1; 2016 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2017 new_level = parent_team->t.t_active_level + 1; 2018 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2019 } else { 2020 // AC: Do not increase parallel level at start of the teams construct 2021 int new_level = parent_team->t.t_level; 2022 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2023 new_level = parent_team->t.t_active_level; 2024 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2025 } 2026 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2027 // set primary thread's schedule as new run-time schedule 2028 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2029 2030 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2031 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2032 2033 // Update the floating point rounding in the team if required. 2034 propagateFPControl(team); 2035 #if OMPD_SUPPORT 2036 if (ompd_state & OMPD_ENABLE_BP) 2037 ompd_bp_parallel_begin(); 2038 #endif 2039 2040 if (__kmp_tasking_mode != tskm_immediate_exec) { 2041 // Set primary thread's task team to team's task team. Unless this is hot 2042 // team, it should be NULL. 
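// The team setup above goes through KMP_CHECK_UPDATE / KMP_CHECK_UPDATE_SYNC
// rather than plain assignments. A minimal sketch of that check-before-write
// idea follows (assumed semantics for illustration; the real macros live in
// kmp.h). Not compiled into the runtime.
#if 0
template <typename T>
static inline void check_update(T &dst, const T &src) {
  if (dst != src) // skip the store when the field already holds this value,
    dst = src;    // so re-used (hot) team fields do not dirty shared cache lines
}
#endif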
2043 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2044 parent_team->t.t_task_team[master_th->th.th_task_state]); 2045 KA_TRACE(20, ("__kmp_fork_call: Primary T#%d pushing task_team %p / team " 2046 "%p, new task_team %p / team %p\n", 2047 __kmp_gtid_from_thread(master_th), 2048 master_th->th.th_task_team, parent_team, 2049 team->t.t_task_team[master_th->th.th_task_state], team)); 2050 2051 if (active_level || master_th->th.th_task_team) { 2052 // Take a memo of primary thread's task_state 2053 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2054 if (master_th->th.th_task_state_top >= 2055 master_th->th.th_task_state_stack_sz) { // increase size 2056 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2057 kmp_uint8 *old_stack, *new_stack; 2058 kmp_uint32 i; 2059 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2060 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2061 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2062 } 2063 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2064 ++i) { // zero-init rest of stack 2065 new_stack[i] = 0; 2066 } 2067 old_stack = master_th->th.th_task_state_memo_stack; 2068 master_th->th.th_task_state_memo_stack = new_stack; 2069 master_th->th.th_task_state_stack_sz = new_size; 2070 __kmp_free(old_stack); 2071 } 2072 // Store primary thread's task_state on stack 2073 master_th->th 2074 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2075 master_th->th.th_task_state; 2076 master_th->th.th_task_state_top++; 2077 #if KMP_NESTED_HOT_TEAMS 2078 if (master_th->th.th_hot_teams && 2079 active_level < __kmp_hot_teams_max_level && 2080 team == master_th->th.th_hot_teams[active_level].hot_team) { 2081 // Restore primary thread's nested state if nested hot team 2082 master_th->th.th_task_state = 2083 master_th->th 2084 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2085 } else { 2086 #endif 2087 master_th->th.th_task_state = 0; 2088 #if KMP_NESTED_HOT_TEAMS 2089 } 2090 #endif 2091 } 2092 #if !KMP_NESTED_HOT_TEAMS 2093 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2094 (team == root->r.r_hot_team)); 2095 #endif 2096 } 2097 2098 KA_TRACE( 2099 20, 2100 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2101 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2102 team->t.t_nproc)); 2103 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2104 (team->t.t_master_tid == 0 && 2105 (team->t.t_parent == root->r.r_root_team || 2106 team->t.t_parent->t.t_serialized))); 2107 KMP_MB(); 2108 2109 /* now, setup the arguments */ 2110 argv = (void **)team->t.t_argv; 2111 if (ap) { 2112 for (i = argc - 1; i >= 0; --i) { 2113 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2114 KMP_CHECK_UPDATE(*argv, new_argv); 2115 argv++; 2116 } 2117 } else { 2118 for (i = 0; i < argc; ++i) { 2119 // Get args from parent team for teams construct 2120 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2121 } 2122 } 2123 2124 /* now actually fork the threads */ 2125 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2126 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2127 root->r.r_active = TRUE; 2128 2129 __kmp_fork_team_threads(root, team, master_th, gtid); 2130 __kmp_setup_icv_copy(team, nthreads, 2131 &master_th->th.th_current_task->td_icvs, loc); 2132 2133 #if OMPT_SUPPORT 2134 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2135 #endif 2136 2137 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2138 2139 #if 
USE_ITT_BUILD 2140 if (team->t.t_active_level == 1 // only report frames at level 1 2141 && !master_th->th.th_teams_microtask) { // not in teams construct 2142 #if USE_ITT_NOTIFY 2143 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2144 (__kmp_forkjoin_frames_mode == 3 || 2145 __kmp_forkjoin_frames_mode == 1)) { 2146 kmp_uint64 tmp_time = 0; 2147 if (__itt_get_timestamp_ptr) 2148 tmp_time = __itt_get_timestamp(); 2149 // Internal fork - report frame begin 2150 master_th->th.th_frame_time = tmp_time; 2151 if (__kmp_forkjoin_frames_mode == 3) 2152 team->t.t_region_time = tmp_time; 2153 } else 2154 // only one notification scheme (either "submit" or "forking/joined", not both) 2155 #endif /* USE_ITT_NOTIFY */ 2156 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2157 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2158 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2159 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2160 } 2161 } 2162 #endif /* USE_ITT_BUILD */ 2163 2164 /* now go on and do the work */ 2165 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2166 KMP_MB(); 2167 KF_TRACE(10, 2168 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2169 root, team, master_th, gtid)); 2170 2171 #if USE_ITT_BUILD 2172 if (__itt_stack_caller_create_ptr) { 2173 // create new stack stitching id before entering fork barrier 2174 if (!enter_teams) { 2175 KMP_DEBUG_ASSERT(team->t.t_stack_id == NULL); 2176 team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2177 } else if (parent_team->t.t_serialized) { 2178 // keep stack stitching id in the serialized parent_team; 2179 // current team will be used for parallel inside the teams; 2180 // if parent_team is active, then it already keeps stack stitching id 2181 // for the league of teams 2182 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id == NULL); 2183 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 2184 } 2185 } 2186 #endif /* USE_ITT_BUILD */ 2187 2188 // AC: skip __kmp_internal_fork at teams construct, let only primary 2189 // threads execute 2190 if (ap) { 2191 __kmp_internal_fork(loc, gtid, team); 2192 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2193 "master_th=%p, gtid=%d\n", 2194 root, team, master_th, gtid)); 2195 } 2196 2197 if (call_context == fork_context_gnu) { 2198 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2199 return TRUE; 2200 } 2201 2202 /* Invoke microtask for PRIMARY thread */ 2203 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2204 team->t.t_id, team->t.t_pkfn)); 2205 } // END of timer KMP_fork_call block 2206 2207 #if KMP_STATS_ENABLED 2208 // If beginning a teams construct, then change thread state 2209 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2210 if (!ap) { 2211 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2212 } 2213 #endif 2214 2215 if (!team->t.t_invoke(gtid)) { 2216 KMP_ASSERT2(0, "cannot invoke microtask for PRIMARY thread"); 2217 } 2218 2219 #if KMP_STATS_ENABLED 2220 // If was beginning of a teams construct, then reset thread state 2221 if (!ap) { 2222 KMP_SET_THREAD_STATE(previous_state); 2223 } 2224 #endif 2225 2226 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2227 team->t.t_id, team->t.t_pkfn)); 2228 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 2229 2230 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2231 #if OMPT_SUPPORT 2232 if (ompt_enabled.enabled) { 2233 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2234 } 2235 #endif 2236 2237 return TRUE; 2238 } 2239 2240 #if OMPT_SUPPORT 2241 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2242 kmp_team_t *team) { 2243 // restore state outside the region 2244 thread->th.ompt_thread_info.state = 2245 ((team->t.t_serialized) ? ompt_state_work_serial 2246 : ompt_state_work_parallel); 2247 } 2248 2249 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2250 kmp_team_t *team, ompt_data_t *parallel_data, 2251 int flags, void *codeptr) { 2252 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2253 if (ompt_enabled.ompt_callback_parallel_end) { 2254 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2255 parallel_data, &(task_info->task_data), flags, codeptr); 2256 } 2257 2258 task_info->frame.enter_frame = ompt_data_none; 2259 __kmp_join_restore_state(thread, team); 2260 } 2261 #endif 2262 2263 void __kmp_join_call(ident_t *loc, int gtid 2264 #if OMPT_SUPPORT 2265 , 2266 enum fork_context_e fork_context 2267 #endif 2268 , 2269 int exit_teams) { 2270 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2271 kmp_team_t *team; 2272 kmp_team_t *parent_team; 2273 kmp_info_t *master_th; 2274 kmp_root_t *root; 2275 int master_active; 2276 2277 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2278 2279 /* setup current data */ 2280 master_th = __kmp_threads[gtid]; 2281 root = master_th->th.th_root; 2282 team = master_th->th.th_team; 2283 parent_team = team->t.t_parent; 2284 2285 master_th->th.th_ident = loc; 2286 2287 #if OMPT_SUPPORT 2288 void *team_microtask = (void *)team->t.t_pkfn; 2289 // For GOMP interface with serialized parallel, need the 2290 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2291 // and end-parallel events. 
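// The join path below ends the implicit task and the parallel region through
// the OMPT callbacks (see __kmp_join_ompt above). A standalone first-party
// tool sketch that would receive the parallel-end event is shown here; it is
// illustrative only, not compiled into the runtime, and assumes the standard
// omp-tools.h entry points.
#if 0
#include <cstdio>
#include <omp-tools.h>

static void on_parallel_end(ompt_data_t *parallel_data,
                            ompt_data_t *encountering_task_data, int flags,
                            const void *codeptr_ra) {
  // flags distinguishes a teams league from an ordinary parallel team
  std::printf("parallel end (league=%d)\n",
              (flags & ompt_parallel_league) != 0);
}

static int tool_initialize(ompt_function_lookup_t lookup,
                           int initial_device_num, ompt_data_t *tool_data) {
  ompt_set_callback_t set_cb =
      (ompt_set_callback_t)lookup("ompt_set_callback");
  set_cb(ompt_callback_parallel_end, (ompt_callback_t)&on_parallel_end);
  return 1; // nonzero keeps the tool active
}

static void tool_finalize(ompt_data_t *tool_data) {}

extern "C" ompt_start_tool_result_t *
ompt_start_tool(unsigned int omp_version, const char *runtime_version) {
  static ompt_start_tool_result_t result = {&tool_initialize, &tool_finalize,
                                            {0}};
  return &result;
}
#endif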
2292 if (ompt_enabled.enabled && 2293 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2294 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2295 } 2296 #endif 2297 2298 #if KMP_DEBUG 2299 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2300 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2301 "th_task_team = %p\n", 2302 __kmp_gtid_from_thread(master_th), team, 2303 team->t.t_task_team[master_th->th.th_task_state], 2304 master_th->th.th_task_team)); 2305 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2306 team->t.t_task_team[master_th->th.th_task_state]); 2307 } 2308 #endif 2309 2310 if (team->t.t_serialized) { 2311 if (master_th->th.th_teams_microtask) { 2312 // We are in teams construct 2313 int level = team->t.t_level; 2314 int tlevel = master_th->th.th_teams_level; 2315 if (level == tlevel) { 2316 // AC: we haven't incremented it earlier at start of teams construct, 2317 // so do it here - at the end of teams construct 2318 team->t.t_level++; 2319 } else if (level == tlevel + 1) { 2320 // AC: we are exiting parallel inside teams, need to increment 2321 // serialization in order to restore it in the next call to 2322 // __kmpc_end_serialized_parallel 2323 team->t.t_serialized++; 2324 } 2325 } 2326 __kmpc_end_serialized_parallel(loc, gtid); 2327 2328 #if OMPT_SUPPORT 2329 if (ompt_enabled.enabled) { 2330 __kmp_join_restore_state(master_th, parent_team); 2331 } 2332 #endif 2333 2334 return; 2335 } 2336 2337 master_active = team->t.t_master_active; 2338 2339 if (!exit_teams) { 2340 // AC: No barrier for internal teams at exit from teams construct. 2341 // But there is barrier for external team (league). 2342 __kmp_internal_join(loc, gtid, team); 2343 #if USE_ITT_BUILD 2344 if (__itt_stack_caller_create_ptr) { 2345 KMP_DEBUG_ASSERT(team->t.t_stack_id != NULL); 2346 // destroy the stack stitching id after join barrier 2347 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2348 team->t.t_stack_id = NULL; 2349 } 2350 #endif 2351 } else { 2352 master_th->th.th_task_state = 2353 0; // AC: no tasking in teams (out of any parallel) 2354 #if USE_ITT_BUILD 2355 if (__itt_stack_caller_create_ptr && parent_team->t.t_serialized) { 2356 KMP_DEBUG_ASSERT(parent_team->t.t_stack_id != NULL); 2357 // destroy the stack stitching id on exit from the teams construct 2358 // if parent_team is active, then the id will be destroyed later on 2359 // by master of the league of teams 2360 __kmp_itt_stack_caller_destroy((__itt_caller)parent_team->t.t_stack_id); 2361 parent_team->t.t_stack_id = NULL; 2362 } 2363 #endif 2364 } 2365 2366 KMP_MB(); 2367 2368 #if OMPT_SUPPORT 2369 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2370 void *codeptr = team->t.ompt_team_info.master_return_address; 2371 #endif 2372 2373 #if USE_ITT_BUILD 2374 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 
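// For reference, a hedged user-level sketch (not compiled into the runtime)
// of a region that takes the serialized paths handled above: with
// max-active-levels-var at 1, the inner parallel forks with nthreads == 1 and
// its join returns through the t_serialized branch of __kmp_join_call.
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  omp_set_max_active_levels(1); // regions nested beyond level 1 serialize
#pragma omp parallel num_threads(4)
  {
#pragma omp parallel num_threads(4) // serialized: runs with a team of 1
    {
      if (omp_get_ancestor_thread_num(1) == 0)
        std::printf("inner team size %d\n", omp_get_num_threads());
    }
  }
  return 0;
}
#endif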
2375 if (team->t.t_active_level == 1 && 2376 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2377 master_th->th.th_teams_size.nteams == 1)) { 2378 master_th->th.th_ident = loc; 2379 // only one notification scheme (either "submit" or "forking/joined", not 2380 // both) 2381 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2382 __kmp_forkjoin_frames_mode == 3) 2383 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2384 master_th->th.th_frame_time, 0, loc, 2385 master_th->th.th_team_nproc, 1); 2386 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2387 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2388 __kmp_itt_region_joined(gtid); 2389 } // active_level == 1 2390 #endif /* USE_ITT_BUILD */ 2391 2392 if (master_th->th.th_teams_microtask && !exit_teams && 2393 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2394 team->t.t_level == master_th->th.th_teams_level + 1) { 2395 // AC: We need to leave the team structure intact at the end of parallel 2396 // inside the teams construct, so that at the next parallel same (hot) team 2397 // works, only adjust nesting levels 2398 #if OMPT_SUPPORT 2399 ompt_data_t ompt_parallel_data = ompt_data_none; 2400 if (ompt_enabled.enabled) { 2401 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2402 if (ompt_enabled.ompt_callback_implicit_task) { 2403 int ompt_team_size = team->t.t_nproc; 2404 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2405 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2406 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2407 } 2408 task_info->frame.exit_frame = ompt_data_none; 2409 task_info->task_data = ompt_data_none; 2410 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2411 __ompt_lw_taskteam_unlink(master_th); 2412 } 2413 #endif 2414 /* Decrement our nested depth level */ 2415 team->t.t_level--; 2416 team->t.t_active_level--; 2417 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2418 2419 // Restore number of threads in the team if needed. This code relies on 2420 // the proper adjustment of th_teams_size.nth after the fork in 2421 // __kmp_teams_master on each teams primary thread in the case that 2422 // __kmp_reserve_threads reduced it. 2423 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2424 int old_num = master_th->th.th_team_nproc; 2425 int new_num = master_th->th.th_teams_size.nth; 2426 kmp_info_t **other_threads = team->t.t_threads; 2427 team->t.t_nproc = new_num; 2428 for (int i = 0; i < old_num; ++i) { 2429 other_threads[i]->th.th_team_nproc = new_num; 2430 } 2431 // Adjust states of non-used threads of the team 2432 for (int i = old_num; i < new_num; ++i) { 2433 // Re-initialize thread's barrier data. 
2434 KMP_DEBUG_ASSERT(other_threads[i]); 2435 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2436 for (int b = 0; b < bs_last_barrier; ++b) { 2437 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2438 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2439 #if USE_DEBUGGER 2440 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2441 #endif 2442 } 2443 if (__kmp_tasking_mode != tskm_immediate_exec) { 2444 // Synchronize thread's task state 2445 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2446 } 2447 } 2448 } 2449 2450 #if OMPT_SUPPORT 2451 if (ompt_enabled.enabled) { 2452 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2453 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2454 } 2455 #endif 2456 2457 return; 2458 } 2459 2460 /* do cleanup and restore the parent team */ 2461 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2462 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2463 2464 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2465 2466 /* jc: The following lock has instructions with REL and ACQ semantics, 2467 separating the parallel user code called in this parallel region 2468 from the serial user code called after this function returns. */ 2469 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2470 2471 if (!master_th->th.th_teams_microtask || 2472 team->t.t_level > master_th->th.th_teams_level) { 2473 /* Decrement our nested depth level */ 2474 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2475 } 2476 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2477 2478 #if OMPT_SUPPORT 2479 if (ompt_enabled.enabled) { 2480 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2481 if (ompt_enabled.ompt_callback_implicit_task) { 2482 int flags = (team_microtask == (void *)__kmp_teams_master) 2483 ? ompt_task_initial 2484 : ompt_task_implicit; 2485 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2486 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2487 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2488 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2489 } 2490 task_info->frame.exit_frame = ompt_data_none; 2491 task_info->task_data = ompt_data_none; 2492 } 2493 #endif 2494 2495 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2496 master_th, team)); 2497 __kmp_pop_current_task_from_thread(master_th); 2498 2499 #if KMP_AFFINITY_SUPPORTED 2500 // Restore master thread's partition. 2501 master_th->th.th_first_place = team->t.t_first_place; 2502 master_th->th.th_last_place = team->t.t_last_place; 2503 #endif // KMP_AFFINITY_SUPPORTED 2504 master_th->th.th_def_allocator = team->t.t_def_allocator; 2505 2506 #if OMPD_SUPPORT 2507 if (ompd_state & OMPD_ENABLE_BP) 2508 ompd_bp_parallel_end(); 2509 #endif 2510 updateHWFPControl(team); 2511 2512 if (root->r.r_active != master_active) 2513 root->r.r_active = master_active; 2514 2515 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2516 master_th)); // this will free worker threads 2517 2518 /* this race was fun to find. make sure the following is in the critical 2519 region otherwise assertions may fail occasionally since the old team may be 2520 reallocated and the hierarchy appears inconsistent. it is actually safe to 2521 run and won't cause any bugs, but will cause those assertion failures. 
it's 2522 only one deref&assign so might as well put this in the critical region */ 2523 master_th->th.th_team = parent_team; 2524 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2525 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2526 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2527 2528 /* restore serialized team, if need be */ 2529 if (parent_team->t.t_serialized && 2530 parent_team != master_th->th.th_serial_team && 2531 parent_team != root->r.r_root_team) { 2532 __kmp_free_team(root, 2533 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2534 master_th->th.th_serial_team = parent_team; 2535 } 2536 2537 if (__kmp_tasking_mode != tskm_immediate_exec) { 2538 if (master_th->th.th_task_state_top > 2539 0) { // Restore task state from memo stack 2540 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2541 // Remember primary thread's state if we re-use this nested hot team 2542 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2543 master_th->th.th_task_state; 2544 --master_th->th.th_task_state_top; // pop 2545 // Now restore state at this level 2546 master_th->th.th_task_state = 2547 master_th->th 2548 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2549 } 2550 // Copy the task team from the parent team to the primary thread 2551 master_th->th.th_task_team = 2552 parent_team->t.t_task_team[master_th->th.th_task_state]; 2553 KA_TRACE(20, 2554 ("__kmp_join_call: Primary T#%d restoring task_team %p, team %p\n", 2555 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2556 parent_team)); 2557 } 2558 2559 // TODO: GEH - cannot do this assertion because root thread not set up as 2560 // executing 2561 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2562 master_th->th.th_current_task->td_flags.executing = 1; 2563 2564 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2565 2566 #if OMPT_SUPPORT 2567 int flags = 2568 OMPT_INVOKER(fork_context) | 2569 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2570 : ompt_parallel_team); 2571 if (ompt_enabled.enabled) { 2572 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2573 codeptr); 2574 } 2575 #endif 2576 2577 KMP_MB(); 2578 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2579 } 2580 2581 /* Check whether we should push an internal control record onto the 2582 serial team stack. If so, do it. 
*/ 2583 void __kmp_save_internal_controls(kmp_info_t *thread) { 2584 2585 if (thread->th.th_team != thread->th.th_serial_team) { 2586 return; 2587 } 2588 if (thread->th.th_team->t.t_serialized > 1) { 2589 int push = 0; 2590 2591 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2592 push = 1; 2593 } else { 2594 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2595 thread->th.th_team->t.t_serialized) { 2596 push = 1; 2597 } 2598 } 2599 if (push) { /* push a record on the serial team's stack */ 2600 kmp_internal_control_t *control = 2601 (kmp_internal_control_t *)__kmp_allocate( 2602 sizeof(kmp_internal_control_t)); 2603 2604 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2605 2606 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2607 2608 control->next = thread->th.th_team->t.t_control_stack_top; 2609 thread->th.th_team->t.t_control_stack_top = control; 2610 } 2611 } 2612 } 2613 2614 /* Changes set_nproc */ 2615 void __kmp_set_num_threads(int new_nth, int gtid) { 2616 kmp_info_t *thread; 2617 kmp_root_t *root; 2618 2619 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2620 KMP_DEBUG_ASSERT(__kmp_init_serial); 2621 2622 if (new_nth < 1) 2623 new_nth = 1; 2624 else if (new_nth > __kmp_max_nth) 2625 new_nth = __kmp_max_nth; 2626 2627 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2628 thread = __kmp_threads[gtid]; 2629 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2630 return; // nothing to do 2631 2632 __kmp_save_internal_controls(thread); 2633 2634 set__nproc(thread, new_nth); 2635 2636 // If this omp_set_num_threads() call will cause the hot team size to be 2637 // reduced (in the absence of a num_threads clause), then reduce it now, 2638 // rather than waiting for the next parallel region. 2639 root = thread->th.th_root; 2640 if (__kmp_init_parallel && (!root->r.r_active) && 2641 (root->r.r_hot_team->t.t_nproc > new_nth) 2642 #if KMP_NESTED_HOT_TEAMS 2643 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2644 #endif 2645 ) { 2646 kmp_team_t *hot_team = root->r.r_hot_team; 2647 int f; 2648 2649 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2650 2651 // Release the extra threads we don't need any more. 2652 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2653 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2654 if (__kmp_tasking_mode != tskm_immediate_exec) { 2655 // When decreasing team size, threads no longer in the team should unref 2656 // task team. 2657 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2658 } 2659 __kmp_free_thread(hot_team->t.t_threads[f]); 2660 hot_team->t.t_threads[f] = NULL; 2661 } 2662 hot_team->t.t_nproc = new_nth; 2663 #if KMP_NESTED_HOT_TEAMS 2664 if (thread->th.th_hot_teams) { 2665 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2666 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2667 } 2668 #endif 2669 2670 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2671 2672 // Update the t_nproc field in the threads that are still active. 
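// The driver for this path is a plain omp_set_num_threads() call between
// regions; a hedged user-level sketch (not compiled into the runtime):
#if 0
#include <cstdio>
#include <omp.h>

int main() {
#pragma omp parallel num_threads(8) // hot team grows to 8 workers
  {}
  omp_set_num_threads(2); // extra hot-team threads can be released here,
                          // rather than at the next fork (see code above)
#pragma omp parallel
  {
    if (omp_get_thread_num() == 0)
      std::printf("running with %d threads\n", omp_get_num_threads());
  }
  return 0;
}
#endif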
2673 for (f = 0; f < new_nth; f++) { 2674 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2675 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2676 } 2677 // Special flag in case omp_set_num_threads() call 2678 hot_team->t.t_size_changed = -1; 2679 } 2680 } 2681 2682 /* Changes max_active_levels */ 2683 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2684 kmp_info_t *thread; 2685 2686 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2687 "%d = (%d)\n", 2688 gtid, max_active_levels)); 2689 KMP_DEBUG_ASSERT(__kmp_init_serial); 2690 2691 // validate max_active_levels 2692 if (max_active_levels < 0) { 2693 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2694 // We ignore this call if the user has specified a negative value. 2695 // The current setting won't be changed. The last valid setting will be 2696 // used. A warning will be issued (if warnings are allowed as controlled by 2697 // the KMP_WARNINGS env var). 2698 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2699 "max_active_levels for thread %d = (%d)\n", 2700 gtid, max_active_levels)); 2701 return; 2702 } 2703 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2704 // it's OK, the max_active_levels is within the valid range: [ 0; 2705 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2706 // We allow a zero value. (implementation defined behavior) 2707 } else { 2708 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2709 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2710 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2711 // Current upper limit is MAX_INT. (implementation defined behavior) 2712 // If the input exceeds the upper limit, we correct the input to be the 2713 // upper limit. (implementation defined behavior) 2714 // Actually, the flow should never get here until we use MAX_INT limit. 
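// Observable effect of the validation above, as a hedged user-level sketch
// (not compiled into the runtime): a negative request is ignored and the
// previous value is kept; an over-large request is clamped to the limit.
#if 0
#include <cstdio>
#include <omp.h>

int main() {
  omp_set_max_active_levels(4);
  omp_set_max_active_levels(-3); // ignored (with a warning); 4 is kept
  std::printf("max-active-levels = %d\n", omp_get_max_active_levels());
  return 0;
}
#endif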
2715 } 2716 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2717 "max_active_levels for thread %d = (%d)\n", 2718 gtid, max_active_levels)); 2719 2720 thread = __kmp_threads[gtid]; 2721 2722 __kmp_save_internal_controls(thread); 2723 2724 set__max_active_levels(thread, max_active_levels); 2725 } 2726 2727 /* Gets max_active_levels */ 2728 int __kmp_get_max_active_levels(int gtid) { 2729 kmp_info_t *thread; 2730 2731 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2732 KMP_DEBUG_ASSERT(__kmp_init_serial); 2733 2734 thread = __kmp_threads[gtid]; 2735 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2736 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2737 "curtask_maxaclevel=%d\n", 2738 gtid, thread->th.th_current_task, 2739 thread->th.th_current_task->td_icvs.max_active_levels)); 2740 return thread->th.th_current_task->td_icvs.max_active_levels; 2741 } 2742 2743 // nteams-var per-device ICV 2744 void __kmp_set_num_teams(int num_teams) { 2745 if (num_teams > 0) 2746 __kmp_nteams = num_teams; 2747 } 2748 int __kmp_get_max_teams(void) { return __kmp_nteams; } 2749 // teams-thread-limit-var per-device ICV 2750 void __kmp_set_teams_thread_limit(int limit) { 2751 if (limit > 0) 2752 __kmp_teams_thread_limit = limit; 2753 } 2754 int __kmp_get_teams_thread_limit(void) { return __kmp_teams_thread_limit; } 2755 2756 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2757 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2758 2759 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2760 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2761 kmp_info_t *thread; 2762 kmp_sched_t orig_kind; 2763 // kmp_team_t *team; 2764 2765 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2766 gtid, (int)kind, chunk)); 2767 KMP_DEBUG_ASSERT(__kmp_init_serial); 2768 2769 // Check if the kind parameter is valid, correct if needed. 2770 // Valid parameters should fit in one of two intervals - standard or extended: 2771 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2772 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2773 orig_kind = kind; 2774 kind = __kmp_sched_without_mods(kind); 2775 2776 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2777 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2778 // TODO: Hint needs attention in case we change the default schedule. 2779 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2780 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2781 __kmp_msg_null); 2782 kind = kmp_sched_default; 2783 chunk = 0; // ignore chunk value in case of bad kind 2784 } 2785 2786 thread = __kmp_threads[gtid]; 2787 2788 __kmp_save_internal_controls(thread); 2789 2790 if (kind < kmp_sched_upper_std) { 2791 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2792 // differ static chunked vs. 
unchunked: chunk should be invalid to 2793 // indicate unchunked schedule (which is the default) 2794 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2795 } else { 2796 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2797 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2798 } 2799 } else { 2800 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2801 // kmp_sched_lower - 2 ]; 2802 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2803 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2804 kmp_sched_lower - 2]; 2805 } 2806 __kmp_sched_apply_mods_intkind( 2807 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2808 if (kind == kmp_sched_auto || chunk < 1) { 2809 // ignore parameter chunk for schedule auto 2810 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2811 } else { 2812 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2813 } 2814 } 2815 2816 /* Gets def_sched_var ICV values */ 2817 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2818 kmp_info_t *thread; 2819 enum sched_type th_type; 2820 2821 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2822 KMP_DEBUG_ASSERT(__kmp_init_serial); 2823 2824 thread = __kmp_threads[gtid]; 2825 2826 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2827 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2828 case kmp_sch_static: 2829 case kmp_sch_static_greedy: 2830 case kmp_sch_static_balanced: 2831 *kind = kmp_sched_static; 2832 __kmp_sched_apply_mods_stdkind(kind, th_type); 2833 *chunk = 0; // chunk was not set, try to show this fact via zero value 2834 return; 2835 case kmp_sch_static_chunked: 2836 *kind = kmp_sched_static; 2837 break; 2838 case kmp_sch_dynamic_chunked: 2839 *kind = kmp_sched_dynamic; 2840 break; 2841 case kmp_sch_guided_chunked: 2842 case kmp_sch_guided_iterative_chunked: 2843 case kmp_sch_guided_analytical_chunked: 2844 *kind = kmp_sched_guided; 2845 break; 2846 case kmp_sch_auto: 2847 *kind = kmp_sched_auto; 2848 break; 2849 case kmp_sch_trapezoidal: 2850 *kind = kmp_sched_trapezoidal; 2851 break; 2852 #if KMP_STATIC_STEAL_ENABLED 2853 case kmp_sch_static_steal: 2854 *kind = kmp_sched_static_steal; 2855 break; 2856 #endif 2857 default: 2858 KMP_FATAL(UnknownSchedulingType, th_type); 2859 } 2860 2861 __kmp_sched_apply_mods_stdkind(kind, th_type); 2862 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2863 } 2864 2865 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2866 2867 int ii, dd; 2868 kmp_team_t *team; 2869 kmp_info_t *thr; 2870 2871 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2872 KMP_DEBUG_ASSERT(__kmp_init_serial); 2873 2874 // validate level 2875 if (level == 0) 2876 return 0; 2877 if (level < 0) 2878 return -1; 2879 thr = __kmp_threads[gtid]; 2880 team = thr->th.th_team; 2881 ii = team->t.t_level; 2882 if (level > ii) 2883 return -1; 2884 2885 if (thr->th.th_teams_microtask) { 2886 // AC: we are in teams region where multiple nested teams have same level 2887 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2888 if (level <= 2889 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2890 KMP_DEBUG_ASSERT(ii >= tlevel); 2891 // AC: As we need to pass by the teams league, we need to artificially 2892 // increase ii 2893 if (ii == tlevel) { 2894 ii += 2; // three teams have same level 2895 } else { 2896 ii++; // two teams have same level 2897 } 2898 } 2899 } 2900 2901 if (ii == 
level) 2902 return __kmp_tid_from_gtid(gtid); 2903 2904 dd = team->t.t_serialized; 2905 level++; 2906 while (ii > level) { 2907 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2908 } 2909 if ((team->t.t_serialized) && (!dd)) { 2910 team = team->t.t_parent; 2911 continue; 2912 } 2913 if (ii > level) { 2914 team = team->t.t_parent; 2915 dd = team->t.t_serialized; 2916 ii--; 2917 } 2918 } 2919 2920 return (dd > 1) ? (0) : (team->t.t_master_tid); 2921 } 2922 2923 int __kmp_get_team_size(int gtid, int level) { 2924 2925 int ii, dd; 2926 kmp_team_t *team; 2927 kmp_info_t *thr; 2928 2929 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2930 KMP_DEBUG_ASSERT(__kmp_init_serial); 2931 2932 // validate level 2933 if (level == 0) 2934 return 1; 2935 if (level < 0) 2936 return -1; 2937 thr = __kmp_threads[gtid]; 2938 team = thr->th.th_team; 2939 ii = team->t.t_level; 2940 if (level > ii) 2941 return -1; 2942 2943 if (thr->th.th_teams_microtask) { 2944 // AC: we are in teams region where multiple nested teams have same level 2945 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2946 if (level <= 2947 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2948 KMP_DEBUG_ASSERT(ii >= tlevel); 2949 // AC: As we need to pass by the teams league, we need to artificially 2950 // increase ii 2951 if (ii == tlevel) { 2952 ii += 2; // three teams have same level 2953 } else { 2954 ii++; // two teams have same level 2955 } 2956 } 2957 } 2958 2959 while (ii > level) { 2960 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2961 } 2962 if (team->t.t_serialized && (!dd)) { 2963 team = team->t.t_parent; 2964 continue; 2965 } 2966 if (ii > level) { 2967 team = team->t.t_parent; 2968 ii--; 2969 } 2970 } 2971 2972 return team->t.t_nproc; 2973 } 2974 2975 kmp_r_sched_t __kmp_get_schedule_global() { 2976 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2977 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2978 // independently. So one can get the updated schedule here. 2979 2980 kmp_r_sched_t r_sched; 2981 2982 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2983 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2984 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2985 // different roots (even in OMP 2.5) 2986 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2987 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2988 if (s == kmp_sch_static) { 2989 // replace STATIC with more detailed schedule (balanced or greedy) 2990 r_sched.r_sched_type = __kmp_static; 2991 } else if (s == kmp_sch_guided_chunked) { 2992 // replace GUIDED with more detailed schedule (iterative or analytical) 2993 r_sched.r_sched_type = __kmp_guided; 2994 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2995 r_sched.r_sched_type = __kmp_sched; 2996 } 2997 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2998 2999 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3000 // __kmp_chunk may be wrong here (if it was not ever set) 3001 r_sched.chunk = KMP_DEFAULT_CHUNK; 3002 } else { 3003 r_sched.chunk = __kmp_chunk; 3004 } 3005 3006 return r_sched; 3007 } 3008 3009 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3010 at least argc number of *t_argv entries for the requested team. 
*/ 3011 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3012 3013 KMP_DEBUG_ASSERT(team); 3014 if (!realloc || argc > team->t.t_max_argc) { 3015 3016 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3017 "current entries=%d\n", 3018 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3019 /* if previously allocated heap space for args, free them */ 3020 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3021 __kmp_free((void *)team->t.t_argv); 3022 3023 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3024 /* use unused space in the cache line for arguments */ 3025 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3026 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3027 "argv entries\n", 3028 team->t.t_id, team->t.t_max_argc)); 3029 team->t.t_argv = &team->t.t_inline_argv[0]; 3030 if (__kmp_storage_map) { 3031 __kmp_print_storage_map_gtid( 3032 -1, &team->t.t_inline_argv[0], 3033 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3034 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3035 team->t.t_id); 3036 } 3037 } else { 3038 /* allocate space for arguments in the heap */ 3039 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3040 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3041 : 2 * argc; 3042 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3043 "argv entries\n", 3044 team->t.t_id, team->t.t_max_argc)); 3045 team->t.t_argv = 3046 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3047 if (__kmp_storage_map) { 3048 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3049 &team->t.t_argv[team->t.t_max_argc], 3050 sizeof(void *) * team->t.t_max_argc, 3051 "team_%d.t_argv", team->t.t_id); 3052 } 3053 } 3054 } 3055 } 3056 3057 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3058 int i; 3059 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3060 team->t.t_threads = 3061 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3062 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3063 sizeof(dispatch_shared_info_t) * num_disp_buff); 3064 team->t.t_dispatch = 3065 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3066 team->t.t_implicit_task_taskdata = 3067 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3068 team->t.t_max_nproc = max_nth; 3069 3070 /* setup dispatch buffers */ 3071 for (i = 0; i < num_disp_buff; ++i) { 3072 team->t.t_disp_buffer[i].buffer_index = i; 3073 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3074 } 3075 } 3076 3077 static void __kmp_free_team_arrays(kmp_team_t *team) { 3078 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3079 int i; 3080 for (i = 0; i < team->t.t_max_nproc; ++i) { 3081 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3082 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3083 team->t.t_dispatch[i].th_disp_buffer = NULL; 3084 } 3085 } 3086 #if KMP_USE_HIER_SCHED 3087 __kmp_dispatch_free_hierarchies(team); 3088 #endif 3089 __kmp_free(team->t.t_threads); 3090 __kmp_free(team->t.t_disp_buffer); 3091 __kmp_free(team->t.t_dispatch); 3092 __kmp_free(team->t.t_implicit_task_taskdata); 3093 team->t.t_threads = NULL; 3094 team->t.t_disp_buffer = NULL; 3095 team->t.t_dispatch = NULL; 3096 team->t.t_implicit_task_taskdata = 0; 3097 } 3098 3099 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3100 kmp_info_t **oldThreads = team->t.t_threads; 3101 3102 __kmp_free(team->t.t_disp_buffer); 3103 __kmp_free(team->t.t_dispatch); 3104 __kmp_free(team->t.t_implicit_task_taskdata); 3105 __kmp_allocate_team_arrays(team, max_nth); 3106 3107 KMP_MEMCPY(team->t.t_threads, oldThreads, 3108 team->t.t_nproc * sizeof(kmp_info_t *)); 3109 3110 __kmp_free(oldThreads); 3111 } 3112 3113 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3114 3115 kmp_r_sched_t r_sched = 3116 __kmp_get_schedule_global(); // get current state of scheduling globals 3117 3118 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3119 3120 kmp_internal_control_t g_icvs = { 3121 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3122 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3123 // adjustment of threads (per thread) 3124 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3125 // whether blocktime is explicitly set 3126 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3127 #if KMP_USE_MONITOR 3128 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3129 // intervals 3130 #endif 3131 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3132 // next parallel region (per thread) 3133 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3134 __kmp_cg_max_nth, // int thread_limit; 3135 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3136 // for max_active_levels 3137 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3138 // {sched,chunk} pair 3139 __kmp_nested_proc_bind.bind_types[0], 3140 __kmp_default_device, 3141 NULL // struct kmp_internal_control *next; 3142 }; 3143 3144 return g_icvs; 3145 } 3146 3147 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3148 3149 kmp_internal_control_t gx_icvs; 3150 gx_icvs.serial_nesting_level = 3151 0; // probably =team->t.t_serial 
like in save_inter_controls 3152 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3153 gx_icvs.next = NULL; 3154 3155 return gx_icvs; 3156 } 3157 3158 static void __kmp_initialize_root(kmp_root_t *root) { 3159 int f; 3160 kmp_team_t *root_team; 3161 kmp_team_t *hot_team; 3162 int hot_team_max_nth; 3163 kmp_r_sched_t r_sched = 3164 __kmp_get_schedule_global(); // get current state of scheduling globals 3165 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3166 KMP_DEBUG_ASSERT(root); 3167 KMP_ASSERT(!root->r.r_begin); 3168 3169 /* setup the root state structure */ 3170 __kmp_init_lock(&root->r.r_begin_lock); 3171 root->r.r_begin = FALSE; 3172 root->r.r_active = FALSE; 3173 root->r.r_in_parallel = 0; 3174 root->r.r_blocktime = __kmp_dflt_blocktime; 3175 #if KMP_AFFINITY_SUPPORTED 3176 root->r.r_affinity_assigned = FALSE; 3177 #endif 3178 3179 /* setup the root team for this task */ 3180 /* allocate the root team structure */ 3181 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3182 3183 root_team = 3184 __kmp_allocate_team(root, 3185 1, // new_nproc 3186 1, // max_nproc 3187 #if OMPT_SUPPORT 3188 ompt_data_none, // root parallel id 3189 #endif 3190 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3191 0 // argc 3192 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3193 ); 3194 #if USE_DEBUGGER 3195 // Non-NULL value should be assigned to make the debugger display the root 3196 // team. 3197 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3198 #endif 3199 3200 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3201 3202 root->r.r_root_team = root_team; 3203 root_team->t.t_control_stack_top = NULL; 3204 3205 /* initialize root team */ 3206 root_team->t.t_threads[0] = NULL; 3207 root_team->t.t_nproc = 1; 3208 root_team->t.t_serialized = 1; 3209 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3210 root_team->t.t_sched.sched = r_sched.sched; 3211 KA_TRACE( 3212 20, 3213 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3214 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3215 3216 /* setup the hot team for this task */ 3217 /* allocate the hot team structure */ 3218 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3219 3220 hot_team = 3221 __kmp_allocate_team(root, 3222 1, // new_nproc 3223 __kmp_dflt_team_nth_ub * 2, // max_nproc 3224 #if OMPT_SUPPORT 3225 ompt_data_none, // root parallel id 3226 #endif 3227 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3228 0 // argc 3229 USE_NESTED_HOT_ARG(NULL) // primary thread is unknown 3230 ); 3231 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3232 3233 root->r.r_hot_team = hot_team; 3234 root_team->t.t_control_stack_top = NULL; 3235 3236 /* first-time initialization */ 3237 hot_team->t.t_parent = root_team; 3238 3239 /* initialize hot team */ 3240 hot_team_max_nth = hot_team->t.t_max_nproc; 3241 for (f = 0; f < hot_team_max_nth; ++f) { 3242 hot_team->t.t_threads[f] = NULL; 3243 } 3244 hot_team->t.t_nproc = 1; 3245 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3246 hot_team->t.t_sched.sched = r_sched.sched; 3247 hot_team->t.t_size_changed = 0; 3248 } 3249 3250 #ifdef KMP_DEBUG 3251 3252 typedef struct kmp_team_list_item { 3253 kmp_team_p const *entry; 3254 struct kmp_team_list_item *next; 3255 } kmp_team_list_item_t; 3256 typedef kmp_team_list_item_t *kmp_team_list_t; 3257 3258 static void __kmp_print_structure_team_accum( // Add team 
to list of teams. 3259 kmp_team_list_t list, // List of teams. 3260 kmp_team_p const *team // Team to add. 3261 ) { 3262 3263 // List must terminate with item where both entry and next are NULL. 3264 // Team is added to the list only once. 3265 // List is sorted in ascending order by team id. 3266 // Team id is *not* a key. 3267 3268 kmp_team_list_t l; 3269 3270 KMP_DEBUG_ASSERT(list != NULL); 3271 if (team == NULL) { 3272 return; 3273 } 3274 3275 __kmp_print_structure_team_accum(list, team->t.t_parent); 3276 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3277 3278 // Search list for the team. 3279 l = list; 3280 while (l->next != NULL && l->entry != team) { 3281 l = l->next; 3282 } 3283 if (l->next != NULL) { 3284 return; // Team has been added before, exit. 3285 } 3286 3287 // Team is not found. Search list again for insertion point. 3288 l = list; 3289 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3290 l = l->next; 3291 } 3292 3293 // Insert team. 3294 { 3295 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3296 sizeof(kmp_team_list_item_t)); 3297 *item = *l; 3298 l->entry = team; 3299 l->next = item; 3300 } 3301 } 3302 3303 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3304 3305 ) { 3306 __kmp_printf("%s", title); 3307 if (team != NULL) { 3308 __kmp_printf("%2x %p\n", team->t.t_id, team); 3309 } else { 3310 __kmp_printf(" - (nil)\n"); 3311 } 3312 } 3313 3314 static void __kmp_print_structure_thread(char const *title, 3315 kmp_info_p const *thread) { 3316 __kmp_printf("%s", title); 3317 if (thread != NULL) { 3318 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3319 } else { 3320 __kmp_printf(" - (nil)\n"); 3321 } 3322 } 3323 3324 void __kmp_print_structure(void) { 3325 3326 kmp_team_list_t list; 3327 3328 // Initialize list of teams. 3329 list = 3330 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3331 list->entry = NULL; 3332 list->next = NULL; 3333 3334 __kmp_printf("\n------------------------------\nGlobal Thread " 3335 "Table\n------------------------------\n"); 3336 { 3337 int gtid; 3338 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3339 __kmp_printf("%2d", gtid); 3340 if (__kmp_threads != NULL) { 3341 __kmp_printf(" %p", __kmp_threads[gtid]); 3342 } 3343 if (__kmp_root != NULL) { 3344 __kmp_printf(" %p", __kmp_root[gtid]); 3345 } 3346 __kmp_printf("\n"); 3347 } 3348 } 3349 3350 // Print out __kmp_threads array. 
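/* The per-thread dump below decodes each live kmp_info_t slot (its root, current and serial team, cached team size, and pool link); NULL slots are skipped. A typical way to reach this routine is by hand from a debugger while the process is stopped, e.g. (illustrative, gdb):

       (gdb) call __kmp_print_structure()

   No locks are taken here, so the output is only consistent if the target is quiescent. */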
3351 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3352 "----------\n"); 3353 if (__kmp_threads != NULL) { 3354 int gtid; 3355 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3356 kmp_info_t const *thread = __kmp_threads[gtid]; 3357 if (thread != NULL) { 3358 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3359 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3360 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3361 __kmp_print_structure_team(" Serial Team: ", 3362 thread->th.th_serial_team); 3363 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3364 __kmp_print_structure_thread(" Primary: ", 3365 thread->th.th_team_master); 3366 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3367 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3368 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3369 __kmp_print_structure_thread(" Next in pool: ", 3370 thread->th.th_next_pool); 3371 __kmp_printf("\n"); 3372 __kmp_print_structure_team_accum(list, thread->th.th_team); 3373 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3374 } 3375 } 3376 } else { 3377 __kmp_printf("Threads array is not allocated.\n"); 3378 } 3379 3380 // Print out __kmp_root array. 3381 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3382 "--------\n"); 3383 if (__kmp_root != NULL) { 3384 int gtid; 3385 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3386 kmp_root_t const *root = __kmp_root[gtid]; 3387 if (root != NULL) { 3388 __kmp_printf("GTID %2d %p:\n", gtid, root); 3389 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3390 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3391 __kmp_print_structure_thread(" Uber Thread: ", 3392 root->r.r_uber_thread); 3393 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3394 __kmp_printf(" In Parallel: %2d\n", 3395 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3396 __kmp_printf("\n"); 3397 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3398 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3399 } 3400 } 3401 } else { 3402 __kmp_printf("Ubers array is not allocated.\n"); 3403 } 3404 3405 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3406 "--------\n"); 3407 while (list->next != NULL) { 3408 kmp_team_p const *team = list->entry; 3409 int i; 3410 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3411 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3412 __kmp_printf(" Primary TID: %2d\n", team->t.t_master_tid); 3413 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3414 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3415 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3416 for (i = 0; i < team->t.t_nproc; ++i) { 3417 __kmp_printf(" Thread %2d: ", i); 3418 __kmp_print_structure_thread("", team->t.t_threads[i]); 3419 } 3420 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3421 __kmp_printf("\n"); 3422 list = list->next; 3423 } 3424 3425 // Print out __kmp_thread_pool and __kmp_team_pool. 3426 __kmp_printf("\n------------------------------\nPools\n----------------------" 3427 "--------\n"); 3428 __kmp_print_structure_thread("Thread pool: ", 3429 CCAST(kmp_info_t *, __kmp_thread_pool)); 3430 __kmp_print_structure_team("Team pool: ", 3431 CCAST(kmp_team_t *, __kmp_team_pool)); 3432 __kmp_printf("\n"); 3433 3434 // Free team list. 
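/* The accumulator list built above is sentinel-terminated: __kmp_print_structure_team_accum() keeps it sorted by team id and always leaves one trailing item whose entry and next are both NULL, e.g. for two teams:

       [team 0] -> [team 1] -> [entry=NULL, next=NULL]

   The loop below releases whatever items are still reachable from the list pointer with KMP_INTERNAL_FREE, sentinel included. */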
3435 while (list != NULL) { 3436 kmp_team_list_item_t *item = list; 3437 list = list->next; 3438 KMP_INTERNAL_FREE(item); 3439 } 3440 } 3441 3442 #endif 3443 3444 //--------------------------------------------------------------------------- 3445 // Stuff for per-thread fast random number generator 3446 // Table of primes 3447 static const unsigned __kmp_primes[] = { 3448 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3449 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3450 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3451 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3452 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3453 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3454 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3455 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3456 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3457 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3458 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3459 3460 //--------------------------------------------------------------------------- 3461 // __kmp_get_random: Get a random number using a linear congruential method. 3462 unsigned short __kmp_get_random(kmp_info_t *thread) { 3463 unsigned x = thread->th.th_x; 3464 unsigned short r = (unsigned short)(x >> 16); 3465 3466 thread->th.th_x = x * thread->th.th_a + 1; 3467 3468 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3469 thread->th.th_info.ds.ds_tid, r)); 3470 3471 return r; 3472 } 3473 //-------------------------------------------------------- 3474 // __kmp_init_random: Initialize a random number generator 3475 void __kmp_init_random(kmp_info_t *thread) { 3476 unsigned seed = thread->th.th_info.ds.ds_tid; 3477 3478 thread->th.th_a = 3479 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3480 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3481 KA_TRACE(30, 3482 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3483 } 3484 3485 #if KMP_OS_WINDOWS 3486 /* reclaim array entries for root threads that are already dead, returns number 3487 * reclaimed */ 3488 static int __kmp_reclaim_dead_roots(void) { 3489 int i, r = 0; 3490 3491 for (i = 0; i < __kmp_threads_capacity; ++i) { 3492 if (KMP_UBER_GTID(i) && 3493 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3494 !__kmp_root[i] 3495 ->r.r_active) { // AC: reclaim only roots died in non-active state 3496 r += __kmp_unregister_root_other_thread(i); 3497 } 3498 } 3499 return r; 3500 } 3501 #endif 3502 3503 /* This function attempts to create free entries in __kmp_threads and 3504 __kmp_root, and returns the number of free entries generated. 3505 3506 For Windows* OS static library, the first mechanism used is to reclaim array 3507 entries for root threads that are already dead. 3508 3509 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3510 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3511 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3512 threadprivate cache array has been created. Synchronization with 3513 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3514 3515 After any dead root reclamation, if the clipping value allows array expansion 3516 to result in the generation of a total of nNeed free slots, the function does 3517 that expansion. If not, nothing is done beyond the possible initial root 3518 thread reclamation. 3519 3520 If any argument is negative, the behavior is undefined. */ 3521 static int __kmp_expand_threads(int nNeed) { 3522 int added = 0; 3523 int minimumRequiredCapacity; 3524 int newCapacity; 3525 kmp_info_t **newThreads; 3526 kmp_root_t **newRoot; 3527 3528 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3529 // resizing __kmp_threads does not need additional protection if foreign 3530 // threads are present 3531 3532 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3533 /* only for Windows static library */ 3534 /* reclaim array entries for root threads that are already dead */ 3535 added = __kmp_reclaim_dead_roots(); 3536 3537 if (nNeed) { 3538 nNeed -= added; 3539 if (nNeed < 0) 3540 nNeed = 0; 3541 } 3542 #endif 3543 if (nNeed <= 0) 3544 return added; 3545 3546 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3547 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3548 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3549 // > __kmp_max_nth in one of two ways: 3550 // 3551 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3552 // may not be reused by another thread, so we may need to increase 3553 // __kmp_threads_capacity to __kmp_max_nth + 1. 3554 // 3555 // 2) New foreign root(s) are encountered. We always register new foreign 3556 // roots. This may cause a smaller # of threads to be allocated at 3557 // subsequent parallel regions, but the worker threads hang around (and 3558 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3559 // 3560 // Anyway, that is the reason for moving the check to see if 3561 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3562 // instead of having it performed here. -BB 3563 3564 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3565 3566 /* compute expansion headroom to check if we can expand */ 3567 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3568 /* possible expansion too small -- give up */ 3569 return added; 3570 } 3571 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3572 3573 newCapacity = __kmp_threads_capacity; 3574 do { 3575 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1)
3576 : __kmp_sys_max_nth;
3577 } while (newCapacity < minimumRequiredCapacity);
3578 newThreads = (kmp_info_t **)__kmp_allocate(
3579 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE);
3580 newRoot =
3581 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity);
3582 KMP_MEMCPY(newThreads, __kmp_threads,
3583 __kmp_threads_capacity * sizeof(kmp_info_t *));
3584 KMP_MEMCPY(newRoot, __kmp_root,
3585 __kmp_threads_capacity * sizeof(kmp_root_t *));
3586
3587 kmp_info_t **temp_threads = __kmp_threads;
3588 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads;
3589 *(kmp_root_t * *volatile *)&__kmp_root = newRoot;
3590 __kmp_free(temp_threads);
3591 added += newCapacity - __kmp_threads_capacity;
3592 *(volatile int *)&__kmp_threads_capacity = newCapacity;
3593
3594 if (newCapacity > __kmp_tp_capacity) {
3595 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock);
3596 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) {
3597 __kmp_threadprivate_resize_cache(newCapacity);
3598 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size
3599 *(volatile int *)&__kmp_tp_capacity = newCapacity;
3600 }
3601 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock);
3602 }
3603
3604 return added;
3605 }
3606
3607 /* Register the current thread as a root thread and obtain our gtid. We must
3608 have the __kmp_initz_lock held at this point. Argument TRUE only if we are
3609 the thread that calls from __kmp_do_serial_initialize() */
3610 int __kmp_register_root(int initial_thread) {
3611 kmp_info_t *root_thread;
3612 kmp_root_t *root;
3613 int gtid;
3614 int capacity;
3615 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);
3616 KA_TRACE(20, ("__kmp_register_root: entered\n"));
3617 KMP_MB();
3618
3619 /* 2007-03-02:
3620 If the initial thread did not invoke the OpenMP RTL yet, and this thread is
3621 not an initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition
3622 does not work as expected -- it may return false (meaning there is at least
3623 one empty slot in the __kmp_threads array), but it is possible that the only
3624 free slot is #0, which is reserved for the initial thread and so cannot be
3625 used for this one. The following code works around this bug.
3626
3627 However, the right solution seems to be not reserving slot #0 for the
3628 initial thread because:
3629 (1) there is no magic in slot #0,
3630 (2) we cannot detect the initial thread reliably (the first thread that does
3631 serial initialization may not be a real initial thread).
3632 */
3633 capacity = __kmp_threads_capacity;
3634 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) {
3635 --capacity;
3636 }
3637
3638 // If it is not for initializing the hidden helper team, we need to take
3639 // __kmp_hidden_helper_threads_num out of the capacity because it is included
3640 // in __kmp_threads_capacity.
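/* Worked example (illustrative numbers only): with __kmp_threads_capacity == 64, hidden helpers enabled with __kmp_hidden_helper_threads_num == 8, and a non-initial thread registering while slot 0 is still empty, the usable capacity computed here is 64 - 1 - 8 = 55. Only when __kmp_all_nth reaches that value does the __kmp_expand_threads(1) call below need to grow the arrays (doubling, clipped to __kmp_sys_max_nth). */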
3641 if (__kmp_enable_hidden_helper && !TCR_4(__kmp_init_hidden_helper_threads)) { 3642 capacity -= __kmp_hidden_helper_threads_num; 3643 } 3644 3645 /* see if there are too many threads */ 3646 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3647 if (__kmp_tp_cached) { 3648 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3649 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3650 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3651 } else { 3652 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3653 __kmp_msg_null); 3654 } 3655 } 3656 3657 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3658 // 0: initial thread, also a regular OpenMP thread. 3659 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 3660 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3661 // regular OpenMP threads. 3662 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3663 // Find an available thread slot for hidden helper thread. Slots for hidden 3664 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3665 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3666 gtid <= __kmp_hidden_helper_threads_num; 3667 gtid++) 3668 ; 3669 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3670 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3671 "hidden helper thread: T#%d\n", 3672 gtid)); 3673 } else { 3674 /* find an available thread slot */ 3675 // Don't reassign the zero slot since we need that to only be used by 3676 // initial thread. Slots for hidden helper threads should also be skipped. 3677 if (initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3678 gtid = 0; 3679 } else { 3680 for (gtid = __kmp_hidden_helper_threads_num + 1; 3681 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3682 ; 3683 } 3684 KA_TRACE( 3685 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3686 KMP_ASSERT(gtid < __kmp_threads_capacity); 3687 } 3688 3689 /* update global accounting */ 3690 __kmp_all_nth++; 3691 TCW_4(__kmp_nth, __kmp_nth + 1); 3692 3693 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3694 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3695 if (__kmp_adjust_gtid_mode) { 3696 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3697 if (TCR_4(__kmp_gtid_mode) != 2) { 3698 TCW_4(__kmp_gtid_mode, 2); 3699 } 3700 } else { 3701 if (TCR_4(__kmp_gtid_mode) != 1) { 3702 TCW_4(__kmp_gtid_mode, 1); 3703 } 3704 } 3705 } 3706 3707 #ifdef KMP_ADJUST_BLOCKTIME 3708 /* Adjust blocktime to zero if necessary */ 3709 /* Middle initialization might not have occurred yet */ 3710 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3711 if (__kmp_nth > __kmp_avail_proc) { 3712 __kmp_zero_bt = TRUE; 3713 } 3714 } 3715 #endif /* KMP_ADJUST_BLOCKTIME */ 3716 3717 /* setup this new hierarchy */ 3718 if (!(root = __kmp_root[gtid])) { 3719 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3720 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3721 } 3722 3723 #if KMP_STATS_ENABLED 3724 // Initialize stats as soon as possible (right after gtid assignment). 
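/* gtid is final for this root at this point (slot search and global accounting are done above), so the per-thread stats node can be bound to it before __kmp_initialize_root() below does any work that should be attributed to this thread. */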
3725 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3726 __kmp_stats_thread_ptr->startLife(); 3727 KMP_SET_THREAD_STATE(SERIAL_REGION); 3728 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3729 #endif 3730 __kmp_initialize_root(root); 3731 3732 /* setup new root thread structure */ 3733 if (root->r.r_uber_thread) { 3734 root_thread = root->r.r_uber_thread; 3735 } else { 3736 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3737 if (__kmp_storage_map) { 3738 __kmp_print_thread_storage_map(root_thread, gtid); 3739 } 3740 root_thread->th.th_info.ds.ds_gtid = gtid; 3741 #if OMPT_SUPPORT 3742 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3743 #endif 3744 root_thread->th.th_root = root; 3745 if (__kmp_env_consistency_check) { 3746 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3747 } 3748 #if USE_FAST_MEMORY 3749 __kmp_initialize_fast_memory(root_thread); 3750 #endif /* USE_FAST_MEMORY */ 3751 3752 #if KMP_USE_BGET 3753 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3754 __kmp_initialize_bget(root_thread); 3755 #endif 3756 __kmp_init_random(root_thread); // Initialize random number generator 3757 } 3758 3759 /* setup the serial team held in reserve by the root thread */ 3760 if (!root_thread->th.th_serial_team) { 3761 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3762 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3763 root_thread->th.th_serial_team = __kmp_allocate_team( 3764 root, 1, 1, 3765 #if OMPT_SUPPORT 3766 ompt_data_none, // root parallel id 3767 #endif 3768 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3769 } 3770 KMP_ASSERT(root_thread->th.th_serial_team); 3771 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3772 root_thread->th.th_serial_team)); 3773 3774 /* drop root_thread into place */ 3775 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3776 3777 root->r.r_root_team->t.t_threads[0] = root_thread; 3778 root->r.r_hot_team->t.t_threads[0] = root_thread; 3779 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3780 // AC: the team created in reserve, not for execution (it is unused for now). 3781 root_thread->th.th_serial_team->t.t_serialized = 0; 3782 root->r.r_uber_thread = root_thread; 3783 3784 /* initialize the thread, get it ready to go */ 3785 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3786 TCW_4(__kmp_init_gtid, TRUE); 3787 3788 /* prepare the primary thread for get_gtid() */ 3789 __kmp_gtid_set_specific(gtid); 3790 3791 #if USE_ITT_BUILD 3792 __kmp_itt_thread_name(gtid); 3793 #endif /* USE_ITT_BUILD */ 3794 3795 #ifdef KMP_TDATA_GTID 3796 __kmp_gtid = gtid; 3797 #endif 3798 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3799 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3800 3801 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3802 "plain=%u\n", 3803 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3804 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3805 KMP_INIT_BARRIER_STATE)); 3806 { // Initialize barrier data. 
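/* One balign slot exists per barrier kind (roughly: plain, fork/join and, when fast reduction barriers are compiled in, reduction; bs_last_barrier is the count), and each b_arrived counter starts from KMP_INIT_BARRIER_STATE so the first real barrier release is well defined. */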
int b;
3808 for (b = 0; b < bs_last_barrier; ++b) {
3809 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
3810 #if USE_DEBUGGER
3811 root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
3812 #endif
3813 }
3814 }
3815 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
3816 KMP_INIT_BARRIER_STATE);
3817
3818 #if KMP_AFFINITY_SUPPORTED
3819 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
3820 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
3821 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
3822 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
3823 #endif /* KMP_AFFINITY_SUPPORTED */
3824 root_thread->th.th_def_allocator = __kmp_def_allocator;
3825 root_thread->th.th_prev_level = 0;
3826 root_thread->th.th_prev_num_threads = 1;
3827
3828 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
3829 tmp->cg_root = root_thread;
3830 tmp->cg_thread_limit = __kmp_cg_max_nth;
3831 tmp->cg_nthreads = 1;
3832 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
3833 " cg_nthreads init to 1\n",
3834 root_thread, tmp));
3835 tmp->up = NULL;
3836 root_thread->th.th_cg_roots = tmp;
3837
3838 __kmp_root_counter++;
3839
3840 #if OMPT_SUPPORT
3841 if (!initial_thread && ompt_enabled.enabled) {
3842
3843 kmp_info_t *root_thread = ompt_get_thread();
3844
3845 ompt_set_thread_state(root_thread, ompt_state_overhead);
3846
3847 if (ompt_enabled.ompt_callback_thread_begin) {
3848 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
3849 ompt_thread_initial, __ompt_get_thread_data_internal());
3850 }
3851 ompt_data_t *task_data;
3852 ompt_data_t *parallel_data;
3853 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3854 NULL);
3855 if (ompt_enabled.ompt_callback_implicit_task) {
3856 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3857 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
3858 }
3859
3860 ompt_set_thread_state(root_thread, ompt_state_work_serial);
3861 }
3862 #endif
3863 #if OMPD_SUPPORT
3864 if (ompd_state & OMPD_ENABLE_BP)
3865 ompd_bp_thread_begin();
3866 #endif
3867
3868 KMP_MB();
3869 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
3870
3871 return gtid;
3872 }
3873
3874 #if KMP_NESTED_HOT_TEAMS
3875 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
3876 const int max_level) {
3877 int i, n, nth;
3878 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
3879 if (!hot_teams || !hot_teams[level].hot_team) {
3880 return 0;
3881 }
3882 KMP_DEBUG_ASSERT(level < max_level);
3883 kmp_team_t *team = hot_teams[level].hot_team;
3884 nth = hot_teams[level].hot_team_nth;
3885 n = nth - 1; // primary thread is not freed
3886 if (level < max_level - 1) {
3887 for (i = 0; i < nth; ++i) {
3888 kmp_info_t *th = team->t.t_threads[i];
3889 n += __kmp_free_hot_teams(root, th, level + 1, max_level);
3890 if (i > 0 && th->th.th_hot_teams) {
3891 __kmp_free(th->th.th_hot_teams);
3892 th->th.th_hot_teams = NULL;
3893 }
3894 }
3895 }
3896 __kmp_free_team(root, team, NULL);
3897 return n;
3898 }
3899 #endif
3900
3901 // Resets a root thread and clears its root and hot teams.
3902 // Returns the number of __kmp_threads entries directly and indirectly freed.
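// Expected to run with __kmp_forkjoin_lock held; reached from
// __kmp_unregister_root_current_thread() and, on Windows, from
// __kmp_unregister_root_other_thread().
/* Illustrative user-level scenario (sketch only, not part of the runtime): a
   foreign thread that merely touches OpenMP is registered as a root via
   __kmp_register_root() and is eventually torn down through this reset path
   when it unregisters, e.g.

     // hypothetical standalone example, built with: cc -fopenmp -pthread foreign.c
     #include <omp.h>
     #include <pthread.h>
     #include <stdio.h>
     static void *foreign(void *arg) {
       (void)arg;
     #pragma omp parallel num_threads(2)
       printf("hello from %d\n", omp_get_thread_num()); // first use registers a root
       return NULL; // the root is later reset/reaped on teardown
     }
     int main(void) {
       pthread_t t;
       pthread_create(&t, NULL, foreign, NULL);
       pthread_join(t, NULL);
       return 0;
     }
*/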
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
3904 kmp_team_t *root_team = root->r.r_root_team;
3905 kmp_team_t *hot_team = root->r.r_hot_team;
3906 int n = hot_team->t.t_nproc;
3907 int i;
3908
3909 KMP_DEBUG_ASSERT(!root->r.r_active);
3910
3911 root->r.r_root_team = NULL;
3912 root->r.r_hot_team = NULL;
3913 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
3914 // before call to __kmp_free_team().
3915 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
3916 #if KMP_NESTED_HOT_TEAMS
3917 if (__kmp_hot_teams_max_level >
3918 0) { // need to free nested hot teams and their threads if any
3919 for (i = 0; i < hot_team->t.t_nproc; ++i) {
3920 kmp_info_t *th = hot_team->t.t_threads[i];
3921 if (__kmp_hot_teams_max_level > 1) {
3922 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
3923 }
3924 if (th->th.th_hot_teams) {
3925 __kmp_free(th->th.th_hot_teams);
3926 th->th.th_hot_teams = NULL;
3927 }
3928 }
3929 }
3930 #endif
3931 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));
3932
3933 // Before we can reap the thread, we need to make certain that all other
3934 // threads in the teams that had this root as ancestor have stopped trying to
3935 // steal tasks.
3936 if (__kmp_tasking_mode != tskm_immediate_exec) {
3937 __kmp_wait_to_unref_task_teams();
3938 }
3939
3940 #if KMP_OS_WINDOWS
3941 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
3942 KA_TRACE(
3943 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
3944 "\n",
3945 (LPVOID) & (root->r.r_uber_thread->th),
3946 root->r.r_uber_thread->th.th_info.ds.ds_thread));
3947 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
3948 #endif /* KMP_OS_WINDOWS */
3949
3950 #if OMPD_SUPPORT
3951 if (ompd_state & OMPD_ENABLE_BP)
3952 ompd_bp_thread_end();
3953 #endif
3954
3955 #if OMPT_SUPPORT
3956 ompt_data_t *task_data;
3957 ompt_data_t *parallel_data;
3958 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
3959 NULL);
3960 if (ompt_enabled.ompt_callback_implicit_task) {
3961 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
3962 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
3963 }
3964 if (ompt_enabled.ompt_callback_thread_end) {
3965 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
3966 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
3967 }
3968 #endif
3969
3970 TCW_4(__kmp_nth,
3971 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
3972 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--;
3973 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p"
3974 " to %d\n",
3975 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots,
3976 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads));
3977 if (i == 1) {
3978 // need to free contention group structure
3979 KMP_DEBUG_ASSERT(root->r.r_uber_thread ==
3980 root->r.r_uber_thread->th.th_cg_roots->cg_root);
3981 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL);
3982 __kmp_free(root->r.r_uber_thread->th.th_cg_roots);
3983 root->r.r_uber_thread->th.th_cg_roots = NULL;
3984 }
3985 __kmp_reap_thread(root->r.r_uber_thread, 1);
3986
3987 // We cannot put the root thread into __kmp_thread_pool, so we have to reap
3988 // it instead of freeing it.
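/* Note on the return value: n starts as the hot team's thread count and is increased by any nested hot-team threads released above; the Windows dead-root reclaim path (__kmp_reclaim_dead_roots) sums these values to learn how many __kmp_threads slots became usable again without expanding the arrays. */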
3989 root->r.r_uber_thread = NULL; 3990 /* mark root as no longer in use */ 3991 root->r.r_begin = FALSE; 3992 3993 return n; 3994 } 3995 3996 void __kmp_unregister_root_current_thread(int gtid) { 3997 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3998 /* this lock should be ok, since unregister_root_current_thread is never 3999 called during an abort, only during a normal close. furthermore, if you 4000 have the forkjoin lock, you should never try to get the initz lock */ 4001 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 4002 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4003 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4004 "exiting T#%d\n", 4005 gtid)); 4006 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4007 return; 4008 } 4009 kmp_root_t *root = __kmp_root[gtid]; 4010 4011 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4012 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4013 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4014 KMP_ASSERT(root->r.r_active == FALSE); 4015 4016 KMP_MB(); 4017 4018 kmp_info_t *thread = __kmp_threads[gtid]; 4019 kmp_team_t *team = thread->th.th_team; 4020 kmp_task_team_t *task_team = thread->th.th_task_team; 4021 4022 // we need to wait for the proxy tasks before finishing the thread 4023 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4024 #if OMPT_SUPPORT 4025 // the runtime is shutting down so we won't report any events 4026 thread->th.ompt_thread_info.state = ompt_state_undefined; 4027 #endif 4028 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4029 } 4030 4031 __kmp_reset_root(gtid, root); 4032 4033 KMP_MB(); 4034 KC_TRACE(10, 4035 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4036 4037 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4038 } 4039 4040 #if KMP_OS_WINDOWS 4041 /* __kmp_forkjoin_lock must be already held 4042 Unregisters a root thread that is not the current thread. Returns the number 4043 of __kmp_threads entries freed as a result. 
*/ 4044 static int __kmp_unregister_root_other_thread(int gtid) { 4045 kmp_root_t *root = __kmp_root[gtid]; 4046 int r; 4047 4048 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4049 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4050 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4051 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4052 KMP_ASSERT(root->r.r_active == FALSE); 4053 4054 r = __kmp_reset_root(gtid, root); 4055 KC_TRACE(10, 4056 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4057 return r; 4058 } 4059 #endif 4060 4061 #if KMP_DEBUG 4062 void __kmp_task_info() { 4063 4064 kmp_int32 gtid = __kmp_entry_gtid(); 4065 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4066 kmp_info_t *this_thr = __kmp_threads[gtid]; 4067 kmp_team_t *steam = this_thr->th.th_serial_team; 4068 kmp_team_t *team = this_thr->th.th_team; 4069 4070 __kmp_printf( 4071 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4072 "ptask=%p\n", 4073 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4074 team->t.t_implicit_task_taskdata[tid].td_parent); 4075 } 4076 #endif // KMP_DEBUG 4077 4078 /* TODO optimize with one big memclr, take out what isn't needed, split 4079 responsibility to workers as much as possible, and delay initialization of 4080 features as much as possible */ 4081 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4082 int tid, int gtid) { 4083 /* this_thr->th.th_info.ds.ds_gtid is setup in 4084 kmp_allocate_thread/create_worker. 4085 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4086 KMP_DEBUG_ASSERT(this_thr != NULL); 4087 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4088 KMP_DEBUG_ASSERT(team); 4089 KMP_DEBUG_ASSERT(team->t.t_threads); 4090 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4091 kmp_info_t *master = team->t.t_threads[0]; 4092 KMP_DEBUG_ASSERT(master); 4093 KMP_DEBUG_ASSERT(master->th.th_root); 4094 4095 KMP_MB(); 4096 4097 TCW_SYNC_PTR(this_thr->th.th_team, team); 4098 4099 this_thr->th.th_info.ds.ds_tid = tid; 4100 this_thr->th.th_set_nproc = 0; 4101 if (__kmp_tasking_mode != tskm_immediate_exec) 4102 // When tasking is possible, threads are not safe to reap until they are 4103 // done tasking; this will be set when tasking code is exited in wait 4104 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4105 else // no tasking --> always safe to reap 4106 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4107 this_thr->th.th_set_proc_bind = proc_bind_default; 4108 #if KMP_AFFINITY_SUPPORTED 4109 this_thr->th.th_new_place = this_thr->th.th_current_place; 4110 #endif 4111 this_thr->th.th_root = master->th.th_root; 4112 4113 /* setup the thread's cache of the team structure */ 4114 this_thr->th.th_team_nproc = team->t.t_nproc; 4115 this_thr->th.th_team_master = master; 4116 this_thr->th.th_team_serialized = team->t.t_serialized; 4117 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4118 4119 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4120 4121 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4122 tid, gtid, this_thr, this_thr->th.th_current_task)); 4123 4124 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4125 team, tid, TRUE); 4126 4127 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4128 tid, gtid, this_thr, this_thr->th.th_current_task)); 4129 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4130 // __kmp_initialize_team()? 
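/* th_dispatch (set just below) is a pointer into the team's own t_dispatch[] array, one kmp_disp_t per slot indexed by tid, so per-thread loop-dispatch bookkeeping lives with the team and is simply re-pointed when this thread is placed into a different slot or team. */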
4131 4132 /* TODO no worksharing in speculative threads */ 4133 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4134 4135 this_thr->th.th_local.this_construct = 0; 4136 4137 if (!this_thr->th.th_pri_common) { 4138 this_thr->th.th_pri_common = 4139 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4140 if (__kmp_storage_map) { 4141 __kmp_print_storage_map_gtid( 4142 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4143 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4144 } 4145 this_thr->th.th_pri_head = NULL; 4146 } 4147 4148 if (this_thr != master && // Primary thread's CG root is initialized elsewhere 4149 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4150 // Make new thread's CG root same as primary thread's 4151 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4152 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4153 if (tmp) { 4154 // worker changes CG, need to check if old CG should be freed 4155 int i = tmp->cg_nthreads--; 4156 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4157 " on node %p of thread %p to %d\n", 4158 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4159 if (i == 1) { 4160 __kmp_free(tmp); // last thread left CG --> free it 4161 } 4162 } 4163 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4164 // Increment new thread's CG root's counter to add the new thread 4165 this_thr->th.th_cg_roots->cg_nthreads++; 4166 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4167 " node %p of thread %p to %d\n", 4168 this_thr, this_thr->th.th_cg_roots, 4169 this_thr->th.th_cg_roots->cg_root, 4170 this_thr->th.th_cg_roots->cg_nthreads)); 4171 this_thr->th.th_current_task->td_icvs.thread_limit = 4172 this_thr->th.th_cg_roots->cg_thread_limit; 4173 } 4174 4175 /* Initialize dynamic dispatch */ 4176 { 4177 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4178 // Use team max_nproc since this will never change for the team. 4179 size_t disp_size = 4180 sizeof(dispatch_private_info_t) * 4181 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4182 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4183 team->t.t_max_nproc)); 4184 KMP_ASSERT(dispatch); 4185 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4186 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4187 4188 dispatch->th_disp_index = 0; 4189 dispatch->th_doacross_buf_idx = 0; 4190 if (!dispatch->th_disp_buffer) { 4191 dispatch->th_disp_buffer = 4192 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4193 4194 if (__kmp_storage_map) { 4195 __kmp_print_storage_map_gtid( 4196 gtid, &dispatch->th_disp_buffer[0], 4197 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4198 ? 
1 4199 : __kmp_dispatch_num_buffers], 4200 disp_size, 4201 "th_%d.th_dispatch.th_disp_buffer " 4202 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4203 gtid, team->t.t_id, gtid); 4204 } 4205 } else { 4206 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4207 } 4208 4209 dispatch->th_dispatch_pr_current = 0; 4210 dispatch->th_dispatch_sh_current = 0; 4211 4212 dispatch->th_deo_fcn = 0; /* ORDERED */ 4213 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4214 } 4215 4216 this_thr->th.th_next_pool = NULL; 4217 4218 if (!this_thr->th.th_task_state_memo_stack) { 4219 size_t i; 4220 this_thr->th.th_task_state_memo_stack = 4221 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4222 this_thr->th.th_task_state_top = 0; 4223 this_thr->th.th_task_state_stack_sz = 4; 4224 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4225 ++i) // zero init the stack 4226 this_thr->th.th_task_state_memo_stack[i] = 0; 4227 } 4228 4229 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4230 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4231 4232 KMP_MB(); 4233 } 4234 4235 /* allocate a new thread for the requesting team. this is only called from 4236 within a forkjoin critical section. we will first try to get an available 4237 thread from the thread pool. if none is available, we will fork a new one 4238 assuming we are able to create a new one. this should be assured, as the 4239 caller should check on this first. */ 4240 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4241 int new_tid) { 4242 kmp_team_t *serial_team; 4243 kmp_info_t *new_thr; 4244 int new_gtid; 4245 4246 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4247 KMP_DEBUG_ASSERT(root && team); 4248 #if !KMP_NESTED_HOT_TEAMS 4249 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4250 #endif 4251 KMP_MB(); 4252 4253 /* first, try to get one from the thread pool */ 4254 if (__kmp_thread_pool) { 4255 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4256 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4257 if (new_thr == __kmp_thread_pool_insert_pt) { 4258 __kmp_thread_pool_insert_pt = NULL; 4259 } 4260 TCW_4(new_thr->th.th_in_pool, FALSE); 4261 __kmp_suspend_initialize_thread(new_thr); 4262 __kmp_lock_suspend_mx(new_thr); 4263 if (new_thr->th.th_active_in_pool == TRUE) { 4264 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4265 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4266 new_thr->th.th_active_in_pool = FALSE; 4267 } 4268 __kmp_unlock_suspend_mx(new_thr); 4269 4270 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4271 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4272 KMP_ASSERT(!new_thr->th.th_team); 4273 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4274 4275 /* setup the thread structure */ 4276 __kmp_initialize_info(new_thr, team, new_tid, 4277 new_thr->th.th_info.ds.ds_gtid); 4278 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4279 4280 TCW_4(__kmp_nth, __kmp_nth + 1); 4281 4282 new_thr->th.th_task_state = 0; 4283 new_thr->th.th_task_state_top = 0; 4284 new_thr->th.th_task_state_stack_sz = 4; 4285 4286 #ifdef KMP_ADJUST_BLOCKTIME 4287 /* Adjust blocktime back to zero if necessary */ 4288 /* Middle initialization might not have occurred yet */ 4289 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4290 if (__kmp_nth > __kmp_avail_proc) { 4291 __kmp_zero_bt = TRUE; 4292 } 4293 } 4294 #endif /* KMP_ADJUST_BLOCKTIME */ 4295 4296 #if KMP_DEBUG 4297 // If thread entered pool via __kmp_free_thread, wait_flag should != 4298 // KMP_BARRIER_PARENT_FLAG. 
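/* Pool-reuse fast path: the OS thread and its barrier structures already exist, so no __kmp_create_worker() call is needed; only the team-binding fields were refreshed by __kmp_initialize_info() above. The debug loop below merely checks that __kmp_free_thread() left no barrier flagged as still waiting on its parent. */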
4299 int b; 4300 kmp_balign_t *balign = new_thr->th.th_bar; 4301 for (b = 0; b < bs_last_barrier; ++b) 4302 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4303 #endif 4304 4305 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4306 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4307 4308 KMP_MB(); 4309 return new_thr; 4310 } 4311 4312 /* no, well fork a new one */ 4313 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4314 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4315 4316 #if KMP_USE_MONITOR 4317 // If this is the first worker thread the RTL is creating, then also 4318 // launch the monitor thread. We try to do this as early as possible. 4319 if (!TCR_4(__kmp_init_monitor)) { 4320 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4321 if (!TCR_4(__kmp_init_monitor)) { 4322 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4323 TCW_4(__kmp_init_monitor, 1); 4324 __kmp_create_monitor(&__kmp_monitor); 4325 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4326 #if KMP_OS_WINDOWS 4327 // AC: wait until monitor has started. This is a fix for CQ232808. 4328 // The reason is that if the library is loaded/unloaded in a loop with 4329 // small (parallel) work in between, then there is high probability that 4330 // monitor thread started after the library shutdown. At shutdown it is 4331 // too late to cope with the problem, because when the primary thread is 4332 // in DllMain (process detach) the monitor has no chances to start (it is 4333 // blocked), and primary thread has no means to inform the monitor that 4334 // the library has gone, because all the memory which the monitor can 4335 // access is going to be released/reset. 4336 while (TCR_4(__kmp_init_monitor) < 2) { 4337 KMP_YIELD(TRUE); 4338 } 4339 KF_TRACE(10, ("after monitor thread has started\n")); 4340 #endif 4341 } 4342 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4343 } 4344 #endif 4345 4346 KMP_MB(); 4347 4348 { 4349 int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads) 4350 ? 1 4351 : __kmp_hidden_helper_threads_num + 1; 4352 4353 for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL; 4354 ++new_gtid) { 4355 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4356 } 4357 4358 if (TCR_4(__kmp_init_hidden_helper_threads)) { 4359 KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num); 4360 } 4361 } 4362 4363 /* allocate space for it. 
*/ 4364 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4365 4366 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4367 4368 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4369 // suppress race conditions detection on synchronization flags in debug mode 4370 // this helps to analyze library internals eliminating false positives 4371 __itt_suppress_mark_range( 4372 __itt_suppress_range, __itt_suppress_threading_errors, 4373 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4374 __itt_suppress_mark_range( 4375 __itt_suppress_range, __itt_suppress_threading_errors, 4376 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4377 #if KMP_OS_WINDOWS 4378 __itt_suppress_mark_range( 4379 __itt_suppress_range, __itt_suppress_threading_errors, 4380 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4381 #else 4382 __itt_suppress_mark_range(__itt_suppress_range, 4383 __itt_suppress_threading_errors, 4384 &new_thr->th.th_suspend_init_count, 4385 sizeof(new_thr->th.th_suspend_init_count)); 4386 #endif 4387 // TODO: check if we need to also suppress b_arrived flags 4388 __itt_suppress_mark_range(__itt_suppress_range, 4389 __itt_suppress_threading_errors, 4390 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4391 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4392 __itt_suppress_mark_range(__itt_suppress_range, 4393 __itt_suppress_threading_errors, 4394 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4395 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4396 __itt_suppress_mark_range(__itt_suppress_range, 4397 __itt_suppress_threading_errors, 4398 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4399 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4400 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4401 if (__kmp_storage_map) { 4402 __kmp_print_thread_storage_map(new_thr, new_gtid); 4403 } 4404 4405 // add the reserve serialized team, initialized from the team's primary thread 4406 { 4407 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4408 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4409 new_thr->th.th_serial_team = serial_team = 4410 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4411 #if OMPT_SUPPORT 4412 ompt_data_none, // root parallel id 4413 #endif 4414 proc_bind_default, &r_icvs, 4415 0 USE_NESTED_HOT_ARG(NULL)); 4416 } 4417 KMP_ASSERT(serial_team); 4418 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4419 // execution (it is unused for now). 
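/* Every kmp_info_t keeps this reserve serial team so that a serialized parallel region encountered later can be entered without having to allocate a team on that path; it is only wired up here and stays unused until then. */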
4420 serial_team->t.t_threads[0] = new_thr; 4421 KF_TRACE(10, 4422 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4423 new_thr)); 4424 4425 /* setup the thread structures */ 4426 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4427 4428 #if USE_FAST_MEMORY 4429 __kmp_initialize_fast_memory(new_thr); 4430 #endif /* USE_FAST_MEMORY */ 4431 4432 #if KMP_USE_BGET 4433 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4434 __kmp_initialize_bget(new_thr); 4435 #endif 4436 4437 __kmp_init_random(new_thr); // Initialize random number generator 4438 4439 /* Initialize these only once when thread is grabbed for a team allocation */ 4440 KA_TRACE(20, 4441 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4442 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4443 4444 int b; 4445 kmp_balign_t *balign = new_thr->th.th_bar; 4446 for (b = 0; b < bs_last_barrier; ++b) { 4447 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4448 balign[b].bb.team = NULL; 4449 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4450 balign[b].bb.use_oncore_barrier = 0; 4451 } 4452 4453 new_thr->th.th_spin_here = FALSE; 4454 new_thr->th.th_next_waiting = 0; 4455 #if KMP_OS_UNIX 4456 new_thr->th.th_blocking = false; 4457 #endif 4458 4459 #if KMP_AFFINITY_SUPPORTED 4460 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4461 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4462 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4463 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4464 #endif 4465 new_thr->th.th_def_allocator = __kmp_def_allocator; 4466 new_thr->th.th_prev_level = 0; 4467 new_thr->th.th_prev_num_threads = 1; 4468 4469 TCW_4(new_thr->th.th_in_pool, FALSE); 4470 new_thr->th.th_active_in_pool = FALSE; 4471 TCW_4(new_thr->th.th_active, TRUE); 4472 4473 /* adjust the global counters */ 4474 __kmp_all_nth++; 4475 __kmp_nth++; 4476 4477 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4478 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4479 if (__kmp_adjust_gtid_mode) { 4480 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4481 if (TCR_4(__kmp_gtid_mode) != 2) { 4482 TCW_4(__kmp_gtid_mode, 2); 4483 } 4484 } else { 4485 if (TCR_4(__kmp_gtid_mode) != 1) { 4486 TCW_4(__kmp_gtid_mode, 1); 4487 } 4488 } 4489 } 4490 4491 #ifdef KMP_ADJUST_BLOCKTIME 4492 /* Adjust blocktime back to zero if necessary */ 4493 /* Middle initialization might not have occurred yet */ 4494 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4495 if (__kmp_nth > __kmp_avail_proc) { 4496 __kmp_zero_bt = TRUE; 4497 } 4498 } 4499 #endif /* KMP_ADJUST_BLOCKTIME */ 4500 4501 /* actually fork it and create the new worker thread */ 4502 KF_TRACE( 4503 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4504 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4505 KF_TRACE(10, 4506 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4507 4508 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4509 new_gtid)); 4510 KMP_MB(); 4511 return new_thr; 4512 } 4513 4514 /* Reinitialize team for reuse. 4515 The hot team code calls this case at every fork barrier, so EPCC barrier 4516 test are extremely sensitive to changes in it, esp. writes to the team 4517 struct, which cause a cache invalidation in all threads. 4518 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ 4519 static void __kmp_reinitialize_team(kmp_team_t *team, 4520 kmp_internal_control_t *new_icvs, 4521 ident_t *loc) { 4522 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4523 team->t.t_threads[0], team)); 4524 KMP_DEBUG_ASSERT(team && new_icvs); 4525 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4526 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4527 4528 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4529 // Copy ICVs to the primary thread's implicit taskdata 4530 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4531 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4532 4533 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4534 team->t.t_threads[0], team)); 4535 } 4536 4537 /* Initialize the team data structure. 4538 This assumes the t_threads and t_max_nproc are already set. 4539 Also, we don't touch the arguments */ 4540 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4541 kmp_internal_control_t *new_icvs, 4542 ident_t *loc) { 4543 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4544 4545 /* verify */ 4546 KMP_DEBUG_ASSERT(team); 4547 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4548 KMP_DEBUG_ASSERT(team->t.t_threads); 4549 KMP_MB(); 4550 4551 team->t.t_master_tid = 0; /* not needed */ 4552 /* team->t.t_master_bar; not needed */ 4553 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4554 team->t.t_nproc = new_nproc; 4555 4556 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4557 team->t.t_next_pool = NULL; 4558 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4559 * up hot team */ 4560 4561 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4562 team->t.t_invoke = NULL; /* not needed */ 4563 4564 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4565 team->t.t_sched.sched = new_icvs->sched.sched; 4566 4567 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4568 team->t.t_fp_control_saved = FALSE; /* not needed */ 4569 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4570 team->t.t_mxcsr = 0; /* not needed */ 4571 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4572 4573 team->t.t_construct = 0; 4574 4575 team->t.t_ordered.dt.t_value = 0; 4576 team->t.t_master_active = FALSE; 4577 4578 #ifdef KMP_DEBUG 4579 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4580 #endif 4581 #if KMP_OS_WINDOWS 4582 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4583 #endif 4584 4585 team->t.t_control_stack_top = NULL; 4586 4587 __kmp_reinitialize_team(team, new_icvs, loc); 4588 4589 KMP_MB(); 4590 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4591 } 4592 4593 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4594 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4595 static void 4596 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4597 if (KMP_AFFINITY_CAPABLE()) { 4598 int status; 4599 if (old_mask != NULL) { 4600 status = __kmp_get_system_affinity(old_mask, TRUE); 4601 int error = errno; 4602 if (status != 0) { 4603 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4604 __kmp_msg_null); 4605 } 4606 } 4607 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4608 } 4609 } 4610 #endif 4611 4612 #if KMP_AFFINITY_SUPPORTED 4613 4614 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4615 // It calculates the worker + primary thread's partition based upon the parent 4616 // thread's partition, and binds each worker to a thread in their partition. 4617 // The primary thread's partition should already include its current binding. 4618 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4619 // Do not partition places for the hidden helper team 4620 if (KMP_HIDDEN_HELPER_TEAM(team)) 4621 return; 4622 // Copy the primary thread's place partition to the team struct 4623 kmp_info_t *master_th = team->t.t_threads[0]; 4624 KMP_DEBUG_ASSERT(master_th != NULL); 4625 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4626 int first_place = master_th->th.th_first_place; 4627 int last_place = master_th->th.th_last_place; 4628 int masters_place = master_th->th.th_current_place; 4629 team->t.t_first_place = first_place; 4630 team->t.t_last_place = last_place; 4631 4632 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4633 "bound to place %d partition = [%d,%d]\n", 4634 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4635 team->t.t_id, masters_place, first_place, last_place)); 4636 4637 switch (proc_bind) { 4638 4639 case proc_bind_default: 4640 // Serial teams might have the proc_bind policy set to proc_bind_default. 4641 // Not an issue -- we don't rebind primary thread for any proc_bind policy. 4642 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4643 break; 4644 4645 case proc_bind_primary: { 4646 int f; 4647 int n_th = team->t.t_nproc; 4648 for (f = 1; f < n_th; f++) { 4649 kmp_info_t *th = team->t.t_threads[f]; 4650 KMP_DEBUG_ASSERT(th != NULL); 4651 th->th.th_first_place = first_place; 4652 th->th.th_last_place = last_place; 4653 th->th.th_new_place = masters_place; 4654 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4655 team->t.t_display_affinity != 1) { 4656 team->t.t_display_affinity = 1; 4657 } 4658 4659 KA_TRACE(100, ("__kmp_partition_places: primary: T#%d(%d:%d) place %d " 4660 "partition = [%d,%d]\n", 4661 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4662 f, masters_place, first_place, last_place)); 4663 } 4664 } break; 4665 4666 case proc_bind_close: { 4667 int f; 4668 int n_th = team->t.t_nproc; 4669 int n_places; 4670 if (first_place <= last_place) { 4671 n_places = last_place - first_place + 1; 4672 } else { 4673 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4674 } 4675 if (n_th <= n_places) { 4676 int place = masters_place; 4677 for (f = 1; f < n_th; f++) { 4678 kmp_info_t *th = team->t.t_threads[f]; 4679 KMP_DEBUG_ASSERT(th != NULL); 4680 4681 if (place == last_place) { 4682 place = first_place; 4683 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4684 place = 0; 4685 } else { 4686 place++; 4687 } 4688 th->th.th_first_place = first_place; 4689 th->th.th_last_place = last_place; 4690 th->th.th_new_place = place; 4691 if (__kmp_display_affinity && place != th->th.th_current_place && 4692 team->t.t_display_affinity != 1) { 4693 team->t.t_display_affinity = 1; 4694 } 4695 4696 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4697 "partition = [%d,%d]\n", 4698 __kmp_gtid_from_thread(team->t.t_threads[f]), 4699 team->t.t_id, f, place, first_place, last_place)); 4700 } 4701 } else { 4702 int S, rem, gap, s_count; 4703 S = n_th / n_places; 4704 s_count = 0; 4705 rem = n_th - (S * n_places); 4706 gap = rem > 0 ? 
n_places / rem : n_places; 4707 int place = masters_place; 4708 int gap_ct = gap; 4709 for (f = 0; f < n_th; f++) { 4710 kmp_info_t *th = team->t.t_threads[f]; 4711 KMP_DEBUG_ASSERT(th != NULL); 4712 4713 th->th.th_first_place = first_place; 4714 th->th.th_last_place = last_place; 4715 th->th.th_new_place = place; 4716 if (__kmp_display_affinity && place != th->th.th_current_place && 4717 team->t.t_display_affinity != 1) { 4718 team->t.t_display_affinity = 1; 4719 } 4720 s_count++; 4721 4722 if ((s_count == S) && rem && (gap_ct == gap)) { 4723 // do nothing, add an extra thread to place on next iteration 4724 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4725 // we added an extra thread to this place; move to next place 4726 if (place == last_place) { 4727 place = first_place; 4728 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4729 place = 0; 4730 } else { 4731 place++; 4732 } 4733 s_count = 0; 4734 gap_ct = 1; 4735 rem--; 4736 } else if (s_count == S) { // place full; don't add extra 4737 if (place == last_place) { 4738 place = first_place; 4739 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4740 place = 0; 4741 } else { 4742 place++; 4743 } 4744 gap_ct++; 4745 s_count = 0; 4746 } 4747 4748 KA_TRACE(100, 4749 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4750 "partition = [%d,%d]\n", 4751 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4752 th->th.th_new_place, first_place, last_place)); 4753 } 4754 KMP_DEBUG_ASSERT(place == masters_place); 4755 } 4756 } break; 4757 4758 case proc_bind_spread: { 4759 int f; 4760 int n_th = team->t.t_nproc; 4761 int n_places; 4762 int thidx; 4763 if (first_place <= last_place) { 4764 n_places = last_place - first_place + 1; 4765 } else { 4766 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4767 } 4768 if (n_th <= n_places) { 4769 int place = -1; 4770 4771 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4772 int S = n_places / n_th; 4773 int s_count, rem, gap, gap_ct; 4774 4775 place = masters_place; 4776 rem = n_places - n_th * S; 4777 gap = rem ? 
n_th / rem : 1; 4778 gap_ct = gap; 4779 thidx = n_th; 4780 if (update_master_only == 1) 4781 thidx = 1; 4782 for (f = 0; f < thidx; f++) { 4783 kmp_info_t *th = team->t.t_threads[f]; 4784 KMP_DEBUG_ASSERT(th != NULL); 4785 4786 th->th.th_first_place = place; 4787 th->th.th_new_place = place; 4788 if (__kmp_display_affinity && place != th->th.th_current_place && 4789 team->t.t_display_affinity != 1) { 4790 team->t.t_display_affinity = 1; 4791 } 4792 s_count = 1; 4793 while (s_count < S) { 4794 if (place == last_place) { 4795 place = first_place; 4796 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4797 place = 0; 4798 } else { 4799 place++; 4800 } 4801 s_count++; 4802 } 4803 if (rem && (gap_ct == gap)) { 4804 if (place == last_place) { 4805 place = first_place; 4806 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4807 place = 0; 4808 } else { 4809 place++; 4810 } 4811 rem--; 4812 gap_ct = 0; 4813 } 4814 th->th.th_last_place = place; 4815 gap_ct++; 4816 4817 if (place == last_place) { 4818 place = first_place; 4819 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4820 place = 0; 4821 } else { 4822 place++; 4823 } 4824 4825 KA_TRACE(100, 4826 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4827 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4828 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4829 f, th->th.th_new_place, th->th.th_first_place, 4830 th->th.th_last_place, __kmp_affinity_num_masks)); 4831 } 4832 } else { 4833 /* Having uniform space of available computation places I can create 4834 T partitions of round(P/T) size and put threads into the first 4835 place of each partition. */ 4836 double current = static_cast<double>(masters_place); 4837 double spacing = 4838 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4839 int first, last; 4840 kmp_info_t *th; 4841 4842 thidx = n_th + 1; 4843 if (update_master_only == 1) 4844 thidx = 1; 4845 for (f = 0; f < thidx; f++) { 4846 first = static_cast<int>(current); 4847 last = static_cast<int>(current + spacing) - 1; 4848 KMP_DEBUG_ASSERT(last >= first); 4849 if (first >= n_places) { 4850 if (masters_place) { 4851 first -= n_places; 4852 last -= n_places; 4853 if (first == (masters_place + 1)) { 4854 KMP_DEBUG_ASSERT(f == n_th); 4855 first--; 4856 } 4857 if (last == masters_place) { 4858 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4859 last--; 4860 } 4861 } else { 4862 KMP_DEBUG_ASSERT(f == n_th); 4863 first = 0; 4864 last = 0; 4865 } 4866 } 4867 if (last >= n_places) { 4868 last = (n_places - 1); 4869 } 4870 place = first; 4871 current += spacing; 4872 if (f < n_th) { 4873 KMP_DEBUG_ASSERT(0 <= first); 4874 KMP_DEBUG_ASSERT(n_places > first); 4875 KMP_DEBUG_ASSERT(0 <= last); 4876 KMP_DEBUG_ASSERT(n_places > last); 4877 KMP_DEBUG_ASSERT(last_place >= first_place); 4878 th = team->t.t_threads[f]; 4879 KMP_DEBUG_ASSERT(th); 4880 th->th.th_first_place = first; 4881 th->th.th_new_place = place; 4882 th->th.th_last_place = last; 4883 if (__kmp_display_affinity && place != th->th.th_current_place && 4884 team->t.t_display_affinity != 1) { 4885 team->t.t_display_affinity = 1; 4886 } 4887 KA_TRACE(100, 4888 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4889 "partition = [%d,%d], spacing = %.4f\n", 4890 __kmp_gtid_from_thread(team->t.t_threads[f]), 4891 team->t.t_id, f, th->th.th_new_place, 4892 th->th.th_first_place, th->th.th_last_place, spacing)); 4893 } 4894 } 4895 } 4896 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4897 } else { 4898 int S, rem, gap, 
s_count; 4899 S = n_th / n_places; 4900 s_count = 0; 4901 rem = n_th - (S * n_places); 4902 gap = rem > 0 ? n_places / rem : n_places; 4903 int place = masters_place; 4904 int gap_ct = gap; 4905 thidx = n_th; 4906 if (update_master_only == 1) 4907 thidx = 1; 4908 for (f = 0; f < thidx; f++) { 4909 kmp_info_t *th = team->t.t_threads[f]; 4910 KMP_DEBUG_ASSERT(th != NULL); 4911 4912 th->th.th_first_place = place; 4913 th->th.th_last_place = place; 4914 th->th.th_new_place = place; 4915 if (__kmp_display_affinity && place != th->th.th_current_place && 4916 team->t.t_display_affinity != 1) { 4917 team->t.t_display_affinity = 1; 4918 } 4919 s_count++; 4920 4921 if ((s_count == S) && rem && (gap_ct == gap)) { 4922 // do nothing, add an extra thread to place on next iteration 4923 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4924 // we added an extra thread to this place; move on to next place 4925 if (place == last_place) { 4926 place = first_place; 4927 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4928 place = 0; 4929 } else { 4930 place++; 4931 } 4932 s_count = 0; 4933 gap_ct = 1; 4934 rem--; 4935 } else if (s_count == S) { // place is full; don't add extra thread 4936 if (place == last_place) { 4937 place = first_place; 4938 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4939 place = 0; 4940 } else { 4941 place++; 4942 } 4943 gap_ct++; 4944 s_count = 0; 4945 } 4946 4947 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4948 "partition = [%d,%d]\n", 4949 __kmp_gtid_from_thread(team->t.t_threads[f]), 4950 team->t.t_id, f, th->th.th_new_place, 4951 th->th.th_first_place, th->th.th_last_place)); 4952 } 4953 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4954 } 4955 } break; 4956 4957 default: 4958 break; 4959 } 4960 4961 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4962 } 4963 4964 #endif // KMP_AFFINITY_SUPPORTED 4965 4966 /* allocate a new team data structure to use. take one off of the free pool if 4967 available */ 4968 kmp_team_t * 4969 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4970 #if OMPT_SUPPORT 4971 ompt_data_t ompt_parallel_data, 4972 #endif 4973 kmp_proc_bind_t new_proc_bind, 4974 kmp_internal_control_t *new_icvs, 4975 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4976 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4977 int f; 4978 kmp_team_t *team; 4979 int use_hot_team = !root->r.r_active; 4980 int level = 0; 4981 4982 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4983 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4984 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4985 KMP_MB(); 4986 4987 #if KMP_NESTED_HOT_TEAMS 4988 kmp_hot_team_ptr_t *hot_teams; 4989 if (master) { 4990 team = master->th.th_team; 4991 level = team->t.t_active_level; 4992 if (master->th.th_teams_microtask) { // in teams construct? 
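      // For a teams construct, decide below whether the nesting level used to
      // index hot_teams[] must be bumped; it is left alone for the outer fork
      // of the teams or when #teams == 1 (see the condition that follows).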
4993 if (master->th.th_teams_size.nteams > 1 && 4994 ( // #teams > 1 4995 team->t.t_pkfn == 4996 (microtask_t)__kmp_teams_master || // inner fork of the teams 4997 master->th.th_teams_level < 4998 team->t.t_level)) { // or nested parallel inside the teams 4999 ++level; // not increment if #teams==1, or for outer fork of the teams; 5000 // increment otherwise 5001 } 5002 } 5003 hot_teams = master->th.th_hot_teams; 5004 if (level < __kmp_hot_teams_max_level && hot_teams && 5005 hot_teams[level].hot_team) { 5006 // hot team has already been allocated for given level 5007 use_hot_team = 1; 5008 } else { 5009 use_hot_team = 0; 5010 } 5011 } else { 5012 // check we won't access uninitialized hot_teams, just in case 5013 KMP_DEBUG_ASSERT(new_nproc == 1); 5014 } 5015 #endif 5016 // Optimization to use a "hot" team 5017 if (use_hot_team && new_nproc > 1) { 5018 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 5019 #if KMP_NESTED_HOT_TEAMS 5020 team = hot_teams[level].hot_team; 5021 #else 5022 team = root->r.r_hot_team; 5023 #endif 5024 #if KMP_DEBUG 5025 if (__kmp_tasking_mode != tskm_immediate_exec) { 5026 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5027 "task_team[1] = %p before reinit\n", 5028 team->t.t_task_team[0], team->t.t_task_team[1])); 5029 } 5030 #endif 5031 5032 // Has the number of threads changed? 5033 /* Let's assume the most common case is that the number of threads is 5034 unchanged, and put that case first. */ 5035 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5036 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5037 // This case can mean that omp_set_num_threads() was called and the hot 5038 // team size was already reduced, so we check the special flag 5039 if (team->t.t_size_changed == -1) { 5040 team->t.t_size_changed = 1; 5041 } else { 5042 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5043 } 5044 5045 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5046 kmp_r_sched_t new_sched = new_icvs->sched; 5047 // set primary thread's schedule as new run-time schedule 5048 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5049 5050 __kmp_reinitialize_team(team, new_icvs, 5051 root->r.r_uber_thread->th.th_ident); 5052 5053 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5054 team->t.t_threads[0], team)); 5055 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5056 5057 #if KMP_AFFINITY_SUPPORTED 5058 if ((team->t.t_size_changed == 0) && 5059 (team->t.t_proc_bind == new_proc_bind)) { 5060 if (new_proc_bind == proc_bind_spread) { 5061 __kmp_partition_places( 5062 team, 1); // add flag to update only master for spread 5063 } 5064 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5065 "proc_bind = %d, partition = [%d,%d]\n", 5066 team->t.t_id, new_proc_bind, team->t.t_first_place, 5067 team->t.t_last_place)); 5068 } else { 5069 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5070 __kmp_partition_places(team); 5071 } 5072 #else 5073 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5074 #endif /* KMP_AFFINITY_SUPPORTED */ 5075 } else if (team->t.t_nproc > new_nproc) { 5076 KA_TRACE(20, 5077 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5078 new_nproc)); 5079 5080 team->t.t_size_changed = 1; 5081 #if KMP_NESTED_HOT_TEAMS 5082 if (__kmp_hot_teams_mode == 0) { 5083 // AC: saved number of threads should correspond to team's value in this 5084 // mode, can be bigger in mode 1, when hot team has threads in reserve 5085 
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5086 hot_teams[level].hot_team_nth = new_nproc; 5087 #endif // KMP_NESTED_HOT_TEAMS 5088 /* release the extra threads we don't need any more */ 5089 for (f = new_nproc; f < team->t.t_nproc; f++) { 5090 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5091 if (__kmp_tasking_mode != tskm_immediate_exec) { 5092 // When decreasing team size, threads no longer in the team should 5093 // unref task team. 5094 team->t.t_threads[f]->th.th_task_team = NULL; 5095 } 5096 __kmp_free_thread(team->t.t_threads[f]); 5097 team->t.t_threads[f] = NULL; 5098 } 5099 #if KMP_NESTED_HOT_TEAMS 5100 } // (__kmp_hot_teams_mode == 0) 5101 else { 5102 // When keeping extra threads in team, switch threads to wait on own 5103 // b_go flag 5104 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5105 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5106 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5107 for (int b = 0; b < bs_last_barrier; ++b) { 5108 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5109 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5110 } 5111 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5112 } 5113 } 5114 } 5115 #endif // KMP_NESTED_HOT_TEAMS 5116 team->t.t_nproc = new_nproc; 5117 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5118 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5119 __kmp_reinitialize_team(team, new_icvs, 5120 root->r.r_uber_thread->th.th_ident); 5121 5122 // Update remaining threads 5123 for (f = 0; f < new_nproc; ++f) { 5124 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5125 } 5126 5127 // restore the current task state of the primary thread: should be the 5128 // implicit task 5129 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5130 team->t.t_threads[0], team)); 5131 5132 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5133 5134 #ifdef KMP_DEBUG 5135 for (f = 0; f < team->t.t_nproc; f++) { 5136 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5137 team->t.t_threads[f]->th.th_team_nproc == 5138 team->t.t_nproc); 5139 } 5140 #endif 5141 5142 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5143 #if KMP_AFFINITY_SUPPORTED 5144 __kmp_partition_places(team); 5145 #endif 5146 } else { // team->t.t_nproc < new_nproc 5147 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5148 kmp_affin_mask_t *old_mask; 5149 if (KMP_AFFINITY_CAPABLE()) { 5150 KMP_CPU_ALLOC(old_mask); 5151 } 5152 #endif 5153 5154 KA_TRACE(20, 5155 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5156 new_nproc)); 5157 5158 team->t.t_size_changed = 1; 5159 5160 #if KMP_NESTED_HOT_TEAMS 5161 int avail_threads = hot_teams[level].hot_team_nth; 5162 if (new_nproc < avail_threads) 5163 avail_threads = new_nproc; 5164 kmp_info_t **other_threads = team->t.t_threads; 5165 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5166 // Adjust barrier data of reserved threads (if any) of the team 5167 // Other data will be set in __kmp_initialize_info() below. 
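        // Each reserved thread's b_arrived is synced to the team's current
        // value, presumably so it rejoins the team's barriers consistently.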
5168 int b; 5169 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5170 for (b = 0; b < bs_last_barrier; ++b) { 5171 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5172 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5173 #if USE_DEBUGGER 5174 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5175 #endif 5176 } 5177 } 5178 if (hot_teams[level].hot_team_nth >= new_nproc) { 5179 // we have all needed threads in reserve, no need to allocate any 5180 // this only possible in mode 1, cannot have reserved threads in mode 0 5181 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5182 team->t.t_nproc = new_nproc; // just get reserved threads involved 5183 } else { 5184 // we may have some threads in reserve, but not enough 5185 team->t.t_nproc = 5186 hot_teams[level] 5187 .hot_team_nth; // get reserved threads involved if any 5188 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5189 #endif // KMP_NESTED_HOT_TEAMS 5190 if (team->t.t_max_nproc < new_nproc) { 5191 /* reallocate larger arrays */ 5192 __kmp_reallocate_team_arrays(team, new_nproc); 5193 __kmp_reinitialize_team(team, new_icvs, NULL); 5194 } 5195 5196 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5197 /* Temporarily set full mask for primary thread before creation of 5198 workers. The reason is that workers inherit the affinity from the 5199 primary thread, so if a lot of workers are created on the single 5200 core quickly, they don't get a chance to set their own affinity for 5201 a long time. */ 5202 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5203 #endif 5204 5205 /* allocate new threads for the hot team */ 5206 for (f = team->t.t_nproc; f < new_nproc; f++) { 5207 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5208 KMP_DEBUG_ASSERT(new_worker); 5209 team->t.t_threads[f] = new_worker; 5210 5211 KA_TRACE(20, 5212 ("__kmp_allocate_team: team %d init T#%d arrived: " 5213 "join=%llu, plain=%llu\n", 5214 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5215 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5216 team->t.t_bar[bs_plain_barrier].b_arrived)); 5217 5218 { // Initialize barrier data for new threads. 5219 int b; 5220 kmp_balign_t *balign = new_worker->th.th_bar; 5221 for (b = 0; b < bs_last_barrier; ++b) { 5222 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5223 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5224 KMP_BARRIER_PARENT_FLAG); 5225 #if USE_DEBUGGER 5226 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5227 #endif 5228 } 5229 } 5230 } 5231 5232 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5233 if (KMP_AFFINITY_CAPABLE()) { 5234 /* Restore initial primary thread's affinity mask */ 5235 __kmp_set_system_affinity(old_mask, TRUE); 5236 KMP_CPU_FREE(old_mask); 5237 } 5238 #endif 5239 #if KMP_NESTED_HOT_TEAMS 5240 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5241 #endif // KMP_NESTED_HOT_TEAMS 5242 /* make sure everyone is syncronized */ 5243 int old_nproc = team->t.t_nproc; // save old value and use to update only 5244 // new threads below 5245 __kmp_initialize_team(team, new_nproc, new_icvs, 5246 root->r.r_uber_thread->th.th_ident); 5247 5248 /* reinitialize the threads */ 5249 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5250 for (f = 0; f < team->t.t_nproc; ++f) 5251 __kmp_initialize_info(team->t.t_threads[f], team, f, 5252 __kmp_gtid_from_tid(f, team)); 5253 5254 if (level) { // set th_task_state for new threads in nested hot team 5255 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5256 // only need to set the th_task_state for the new threads. th_task_state 5257 // for primary thread will not be accurate until after this in 5258 // __kmp_fork_call(), so we look to the primary thread's memo_stack to 5259 // get the correct value. 5260 for (f = old_nproc; f < team->t.t_nproc; ++f) 5261 team->t.t_threads[f]->th.th_task_state = 5262 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5263 } else { // set th_task_state for new threads in non-nested hot team 5264 // copy primary thread's state 5265 kmp_uint8 old_state = team->t.t_threads[0]->th.th_task_state; 5266 for (f = old_nproc; f < team->t.t_nproc; ++f) 5267 team->t.t_threads[f]->th.th_task_state = old_state; 5268 } 5269 5270 #ifdef KMP_DEBUG 5271 for (f = 0; f < team->t.t_nproc; ++f) { 5272 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5273 team->t.t_threads[f]->th.th_team_nproc == 5274 team->t.t_nproc); 5275 } 5276 #endif 5277 5278 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5279 #if KMP_AFFINITY_SUPPORTED 5280 __kmp_partition_places(team); 5281 #endif 5282 } // Check changes in number of threads 5283 5284 kmp_info_t *master = team->t.t_threads[0]; 5285 if (master->th.th_teams_microtask) { 5286 for (f = 1; f < new_nproc; ++f) { 5287 // propagate teams construct specific info to workers 5288 kmp_info_t *thr = team->t.t_threads[f]; 5289 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5290 thr->th.th_teams_level = master->th.th_teams_level; 5291 thr->th.th_teams_size = master->th.th_teams_size; 5292 } 5293 } 5294 #if KMP_NESTED_HOT_TEAMS 5295 if (level) { 5296 // Sync barrier state for nested hot teams, not needed for outermost hot 5297 // team. 5298 for (f = 1; f < new_nproc; ++f) { 5299 kmp_info_t *thr = team->t.t_threads[f]; 5300 int b; 5301 kmp_balign_t *balign = thr->th.th_bar; 5302 for (b = 0; b < bs_last_barrier; ++b) { 5303 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5304 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5305 #if USE_DEBUGGER 5306 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5307 #endif 5308 } 5309 } 5310 } 5311 #endif // KMP_NESTED_HOT_TEAMS 5312 5313 /* reallocate space for arguments if necessary */ 5314 __kmp_alloc_argv_entries(argc, team, TRUE); 5315 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5316 // The hot team re-uses the previous task team, 5317 // if untouched during the previous release->gather phase. 
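    // Note: unlike the team-pool and fresh-allocation paths below, this
    // hot-team path leaves t_task_team[0]/[1] in place for reuse.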
5318 5319 KF_TRACE(10, (" hot_team = %p\n", team)); 5320 5321 #if KMP_DEBUG 5322 if (__kmp_tasking_mode != tskm_immediate_exec) { 5323 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5324 "task_team[1] = %p after reinit\n", 5325 team->t.t_task_team[0], team->t.t_task_team[1])); 5326 } 5327 #endif 5328 5329 #if OMPT_SUPPORT 5330 __ompt_team_assign_id(team, ompt_parallel_data); 5331 #endif 5332 5333 KMP_MB(); 5334 5335 return team; 5336 } 5337 5338 /* next, let's try to take one from the team pool */ 5339 KMP_MB(); 5340 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5341 /* TODO: consider resizing undersized teams instead of reaping them, now 5342 that we have a resizing mechanism */ 5343 if (team->t.t_max_nproc >= max_nproc) { 5344 /* take this team from the team pool */ 5345 __kmp_team_pool = team->t.t_next_pool; 5346 5347 /* setup the team for fresh use */ 5348 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5349 5350 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5351 "task_team[1] %p to NULL\n", 5352 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5353 team->t.t_task_team[0] = NULL; 5354 team->t.t_task_team[1] = NULL; 5355 5356 /* reallocate space for arguments if necessary */ 5357 __kmp_alloc_argv_entries(argc, team, TRUE); 5358 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5359 5360 KA_TRACE( 5361 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5362 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5363 { // Initialize barrier data. 5364 int b; 5365 for (b = 0; b < bs_last_barrier; ++b) { 5366 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5367 #if USE_DEBUGGER 5368 team->t.t_bar[b].b_master_arrived = 0; 5369 team->t.t_bar[b].b_team_arrived = 0; 5370 #endif 5371 } 5372 } 5373 5374 team->t.t_proc_bind = new_proc_bind; 5375 5376 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5377 team->t.t_id)); 5378 5379 #if OMPT_SUPPORT 5380 __ompt_team_assign_id(team, ompt_parallel_data); 5381 #endif 5382 5383 KMP_MB(); 5384 5385 return team; 5386 } 5387 5388 /* reap team if it is too small, then loop back and check the next one */ 5389 // not sure if this is wise, but, will be redone during the hot-teams 5390 // rewrite. 5391 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5392 team = __kmp_reap_team(team); 5393 __kmp_team_pool = team; 5394 } 5395 5396 /* nothing available in the pool, no matter, make a new team! 
*/ 5397 KMP_MB(); 5398 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5399 5400 /* and set it up */ 5401 team->t.t_max_nproc = max_nproc; 5402 /* NOTE well, for some reason allocating one big buffer and dividing it up 5403 seems to really hurt performance a lot on the P4, so, let's not use this */ 5404 __kmp_allocate_team_arrays(team, max_nproc); 5405 5406 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5407 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5408 5409 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5410 "%p to NULL\n", 5411 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5412 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5413 // memory, no need to duplicate 5414 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5415 // memory, no need to duplicate 5416 5417 if (__kmp_storage_map) { 5418 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5419 } 5420 5421 /* allocate space for arguments */ 5422 __kmp_alloc_argv_entries(argc, team, FALSE); 5423 team->t.t_argc = argc; 5424 5425 KA_TRACE(20, 5426 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5427 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5428 { // Initialize barrier data. 5429 int b; 5430 for (b = 0; b < bs_last_barrier; ++b) { 5431 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5432 #if USE_DEBUGGER 5433 team->t.t_bar[b].b_master_arrived = 0; 5434 team->t.t_bar[b].b_team_arrived = 0; 5435 #endif 5436 } 5437 } 5438 5439 team->t.t_proc_bind = new_proc_bind; 5440 5441 #if OMPT_SUPPORT 5442 __ompt_team_assign_id(team, ompt_parallel_data); 5443 team->t.ompt_serialized_team_info = NULL; 5444 #endif 5445 5446 KMP_MB(); 5447 5448 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5449 team->t.t_id)); 5450 5451 return team; 5452 } 5453 5454 /* TODO implement hot-teams at all levels */ 5455 /* TODO implement lazy thread release on demand (disband request) */ 5456 5457 /* free the team. return it to the team pool. release all the threads 5458 * associated with it */ 5459 void __kmp_free_team(kmp_root_t *root, 5460 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5461 int f; 5462 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5463 team->t.t_id)); 5464 5465 /* verify state */ 5466 KMP_DEBUG_ASSERT(root); 5467 KMP_DEBUG_ASSERT(team); 5468 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5469 KMP_DEBUG_ASSERT(team->t.t_threads); 5470 5471 int use_hot_team = team == root->r.r_hot_team; 5472 #if KMP_NESTED_HOT_TEAMS 5473 int level; 5474 kmp_hot_team_ptr_t *hot_teams; 5475 if (master) { 5476 level = team->t.t_active_level - 1; 5477 if (master->th.th_teams_microtask) { // in teams construct? 
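      // Adjust the nesting level for the teams construct so that the
      // hot_teams[level] lookup below lands on the entry this team was
      // allocated from (the KMP_DEBUG_ASSERT below checks this).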
5478 if (master->th.th_teams_size.nteams > 1) { 5479 ++level; // level was not increased in teams construct for 5480 // team_of_masters 5481 } 5482 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5483 master->th.th_teams_level == team->t.t_level) { 5484 ++level; // level was not increased in teams construct for 5485 // team_of_workers before the parallel 5486 } // team->t.t_level will be increased inside parallel 5487 } 5488 hot_teams = master->th.th_hot_teams; 5489 if (level < __kmp_hot_teams_max_level) { 5490 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5491 use_hot_team = 1; 5492 } 5493 } 5494 #endif // KMP_NESTED_HOT_TEAMS 5495 5496 /* team is done working */ 5497 TCW_SYNC_PTR(team->t.t_pkfn, 5498 NULL); // Important for Debugging Support Library. 5499 #if KMP_OS_WINDOWS 5500 team->t.t_copyin_counter = 0; // init counter for possible reuse 5501 #endif 5502 // Do not reset pointer to parent team to NULL for hot teams. 5503 5504 /* if we are non-hot team, release our threads */ 5505 if (!use_hot_team) { 5506 if (__kmp_tasking_mode != tskm_immediate_exec) { 5507 // Wait for threads to reach reapable state 5508 for (f = 1; f < team->t.t_nproc; ++f) { 5509 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5510 kmp_info_t *th = team->t.t_threads[f]; 5511 volatile kmp_uint32 *state = &th->th.th_reap_state; 5512 while (*state != KMP_SAFE_TO_REAP) { 5513 #if KMP_OS_WINDOWS 5514 // On Windows a thread can be killed at any time, check this 5515 DWORD ecode; 5516 if (!__kmp_is_thread_alive(th, &ecode)) { 5517 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5518 break; 5519 } 5520 #endif 5521 // first check if thread is sleeping 5522 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5523 if (fl.is_sleeping()) 5524 fl.resume(__kmp_gtid_from_thread(th)); 5525 KMP_CPU_PAUSE(); 5526 } 5527 } 5528 5529 // Delete task teams 5530 int tt_idx; 5531 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5532 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5533 if (task_team != NULL) { 5534 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5535 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5536 team->t.t_threads[f]->th.th_task_team = NULL; 5537 } 5538 KA_TRACE( 5539 20, 5540 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5541 __kmp_get_gtid(), task_team, team->t.t_id)); 5542 #if KMP_NESTED_HOT_TEAMS 5543 __kmp_free_task_team(master, task_team); 5544 #endif 5545 team->t.t_task_team[tt_idx] = NULL; 5546 } 5547 } 5548 } 5549 5550 // Reset pointer to parent team only for non-hot teams. 
5551 team->t.t_parent = NULL; 5552 team->t.t_level = 0; 5553 team->t.t_active_level = 0; 5554 5555 /* free the worker threads */ 5556 for (f = 1; f < team->t.t_nproc; ++f) { 5557 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5558 __kmp_free_thread(team->t.t_threads[f]); 5559 team->t.t_threads[f] = NULL; 5560 } 5561 5562 /* put the team back in the team pool */ 5563 /* TODO limit size of team pool, call reap_team if pool too large */ 5564 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5565 __kmp_team_pool = (volatile kmp_team_t *)team; 5566 } else { // Check if team was created for primary threads in teams construct 5567 // See if first worker is a CG root 5568 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5569 team->t.t_threads[1]->th.th_cg_roots); 5570 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5571 // Clean up the CG root nodes on workers so that this team can be re-used 5572 for (f = 1; f < team->t.t_nproc; ++f) { 5573 kmp_info_t *thr = team->t.t_threads[f]; 5574 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5575 thr->th.th_cg_roots->cg_root == thr); 5576 // Pop current CG root off list 5577 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5578 thr->th.th_cg_roots = tmp->up; 5579 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5580 " up to node %p. cg_nthreads was %d\n", 5581 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5582 int i = tmp->cg_nthreads--; 5583 if (i == 1) { 5584 __kmp_free(tmp); // free CG if we are the last thread in it 5585 } 5586 // Restore current task's thread_limit from CG root 5587 if (thr->th.th_cg_roots) 5588 thr->th.th_current_task->td_icvs.thread_limit = 5589 thr->th.th_cg_roots->cg_thread_limit; 5590 } 5591 } 5592 } 5593 5594 KMP_MB(); 5595 } 5596 5597 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5598 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5599 kmp_team_t *next_pool = team->t.t_next_pool; 5600 5601 KMP_DEBUG_ASSERT(team); 5602 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5603 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5604 KMP_DEBUG_ASSERT(team->t.t_threads); 5605 KMP_DEBUG_ASSERT(team->t.t_argv); 5606 5607 /* TODO clean the threads that are a part of this? */ 5608 5609 /* free stuff */ 5610 __kmp_free_team_arrays(team); 5611 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5612 __kmp_free((void *)team->t.t_argv); 5613 __kmp_free(team); 5614 5615 KMP_MB(); 5616 return next_pool; 5617 } 5618 5619 // Free the thread. Don't reap it, just place it on the pool of available 5620 // threads. 5621 // 5622 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5623 // binding for the affinity mechanism to be useful. 5624 // 5625 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5626 // However, we want to avoid a potential performance problem by always 5627 // scanning through the list to find the correct point at which to insert 5628 // the thread (potential N**2 behavior). To do this we keep track of the 5629 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5630 // With single-level parallelism, threads will always be added to the tail 5631 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5632 // parallelism, all bets are off and we may need to scan through the entire 5633 // free list. 5634 // 5635 // This change also has a potentially large performance benefit, for some 5636 // applications. 
// Previously, as threads were freed from the hot team, they would be placed
// back on the free list in inverse order. If the hot team grew back to its
// original size, then the freed thread would be placed back on the hot team
// in reverse order. This could cause bad cache locality problems on programs
// where the size of the hot team regularly grew and shrunk.
//
// Now, for single-level parallelism, the OMP tid is always == gtid.
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan;

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0;
  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;

  /* put thread back on the free pool */
  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */

  while (this_th->th.th_cg_roots) {
    this_th->th.th_cg_roots->cg_nthreads--;
    KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node"
                   " %p of thread %p to %d\n",
                   this_th, this_th->th.th_cg_roots,
                   this_th->th.th_cg_roots->cg_root,
                   this_th->th.th_cg_roots->cg_nthreads));
    kmp_cg_root_t *tmp = this_th->th.th_cg_roots;
    if (tmp->cg_root == this_th) { // Thread is a cg_root
      KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0);
      KA_TRACE(
          5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp));
      this_th->th.th_cg_roots = tmp->up;
      __kmp_free(tmp);
    } else { // Worker thread
      if (tmp->cg_nthreads == 0) { // last thread leaves contention group
        __kmp_free(tmp);
      }
      this_th->th.th_cg_roots = NULL;
      break;
    }
  }

  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit. This duplicate use of the task data can happen
   * with higher probability when the hot team is disabled but can occur even
   * when the hot team is enabled. */
  __kmp_free_implicit_task(this_th);
  this_th->th.th_current_task = NULL;

  // If the __kmp_thread_pool_insert_pt is already past the new insert
  // point, then we need to re-scan the entire list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
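  // Example: with a pool sorted by gtid as 2 -> 3 -> 5 and the insert point
  // cached at gtid 5, freeing gtid 6 starts the scan just past 5 and links 6
  // after it; freeing gtid 4 resets the cached insert point (since 5 > 4) and
  // the scan restarts from the head of __kmp_thread_pool.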
5717 if (__kmp_thread_pool_insert_pt != NULL) { 5718 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5719 } else { 5720 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5721 } 5722 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5723 scan = &((*scan)->th.th_next_pool)) 5724 ; 5725 5726 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5727 // to its address. 5728 TCW_PTR(this_th->th.th_next_pool, *scan); 5729 __kmp_thread_pool_insert_pt = *scan = this_th; 5730 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5731 (this_th->th.th_info.ds.ds_gtid < 5732 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5733 TCW_4(this_th->th.th_in_pool, TRUE); 5734 __kmp_suspend_initialize_thread(this_th); 5735 __kmp_lock_suspend_mx(this_th); 5736 if (this_th->th.th_active == TRUE) { 5737 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5738 this_th->th.th_active_in_pool = TRUE; 5739 } 5740 #if KMP_DEBUG 5741 else { 5742 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5743 } 5744 #endif 5745 __kmp_unlock_suspend_mx(this_th); 5746 5747 TCW_4(__kmp_nth, __kmp_nth - 1); 5748 5749 #ifdef KMP_ADJUST_BLOCKTIME 5750 /* Adjust blocktime back to user setting or default if necessary */ 5751 /* Middle initialization might never have occurred */ 5752 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5753 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5754 if (__kmp_nth <= __kmp_avail_proc) { 5755 __kmp_zero_bt = FALSE; 5756 } 5757 } 5758 #endif /* KMP_ADJUST_BLOCKTIME */ 5759 5760 KMP_MB(); 5761 } 5762 5763 /* ------------------------------------------------------------------------ */ 5764 5765 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5766 #if OMP_PROFILING_SUPPORT 5767 ProfileTraceFile = getenv("LIBOMPTARGET_PROFILE"); 5768 // TODO: add a configuration option for time granularity 5769 if (ProfileTraceFile) 5770 llvm::timeTraceProfilerInitialize(500 /* us */, "libomptarget"); 5771 #endif 5772 5773 int gtid = this_thr->th.th_info.ds.ds_gtid; 5774 /* void *stack_data;*/ 5775 kmp_team_t **volatile pteam; 5776 5777 KMP_MB(); 5778 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5779 5780 if (__kmp_env_consistency_check) { 5781 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5782 } 5783 5784 #if OMPD_SUPPORT 5785 if (ompd_state & OMPD_ENABLE_BP) 5786 ompd_bp_thread_begin(); 5787 #endif 5788 5789 #if OMPT_SUPPORT 5790 ompt_data_t *thread_data = nullptr; 5791 if (ompt_enabled.enabled) { 5792 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5793 *thread_data = ompt_data_none; 5794 5795 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5796 this_thr->th.ompt_thread_info.wait_id = 0; 5797 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5798 this_thr->th.ompt_thread_info.parallel_flags = 0; 5799 if (ompt_enabled.ompt_callback_thread_begin) { 5800 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5801 ompt_thread_worker, thread_data); 5802 } 5803 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5804 } 5805 #endif 5806 5807 /* This is the place where threads wait for work */ 5808 while (!TCR_4(__kmp_global.g.g_done)) { 5809 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5810 KMP_MB(); 5811 5812 /* wait for work to do */ 5813 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5814 5815 /* No tid yet since not part of a team */ 5816 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5817 5818 #if OMPT_SUPPORT 5819 if (ompt_enabled.enabled) { 5820 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5821 } 5822 #endif 5823 5824 pteam = &this_thr->th.th_team; 5825 5826 /* have we been allocated? */ 5827 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5828 /* we were just woken up, so run our new task */ 5829 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5830 int rc; 5831 KA_TRACE(20, 5832 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5833 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5834 (*pteam)->t.t_pkfn)); 5835 5836 updateHWFPControl(*pteam); 5837 5838 #if OMPT_SUPPORT 5839 if (ompt_enabled.enabled) { 5840 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5841 } 5842 #endif 5843 5844 rc = (*pteam)->t.t_invoke(gtid); 5845 KMP_ASSERT(rc); 5846 5847 KMP_MB(); 5848 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5849 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5850 (*pteam)->t.t_pkfn)); 5851 } 5852 #if OMPT_SUPPORT 5853 if (ompt_enabled.enabled) { 5854 /* no frame set while outside task */ 5855 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5856 5857 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5858 } 5859 #endif 5860 /* join barrier after parallel region */ 5861 __kmp_join_barrier(gtid); 5862 } 5863 } 5864 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5865 5866 #if OMPD_SUPPORT 5867 if (ompd_state & OMPD_ENABLE_BP) 5868 ompd_bp_thread_end(); 5869 #endif 5870 5871 #if OMPT_SUPPORT 5872 if (ompt_enabled.ompt_callback_thread_end) { 5873 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5874 } 5875 #endif 5876 5877 this_thr->th.th_task_team = NULL; 5878 /* run the destructors for the threadprivate data for this thread */ 5879 __kmp_common_destroy_gtid(gtid); 5880 5881 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5882 KMP_MB(); 5883 5884 #if OMP_PROFILING_SUPPORT 5885 llvm::timeTraceProfilerFinishThread(); 5886 #endif 5887 return this_thr; 5888 } 5889 5890 /* ------------------------------------------------------------------------ */ 5891 5892 void __kmp_internal_end_dest(void *specific_gtid) { 5893 // Make sure no significant bits are lost 5894 int gtid; 5895 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5896 5897 KA_TRACE(30, 
           ("__kmp_internal_end_dest: T#%d\n", gtid));
  /* NOTE: the gtid is stored as gtid+1 in the thread-local-storage
   * this is because 0 is reserved for the nothing-stored case */

  __kmp_internal_end_thread(gtid);
}

#if KMP_OS_UNIX && KMP_DYNAMIC_LIB

__attribute__((destructor)) void __kmp_internal_end_dtor(void) {
  __kmp_internal_end_atexit();
}

#endif

/* [Windows] josh: when the atexit handler is called, there may still be more
   than one thread alive */
void __kmp_internal_end_atexit(void) {
  KA_TRACE(30, ("__kmp_internal_end_atexit\n"));
  /* [Windows]
     josh: ideally, we want to completely shut down the library in this atexit
     handler, but stat code that depends on thread specific data for gtid fails
     because that data becomes unavailable at some point during the shutdown,
     so we call __kmp_internal_end_thread instead. We should eventually remove
     the dependency on __kmp_get_specific_gtid in the stat code and use
     __kmp_internal_end_library to cleanly shut down the library.

     // TODO: Can some of this comment about GVS be removed?
     I suspect that the offending stat code is executed when the calling thread
     tries to clean up a dead root thread's data structures, resulting in GVS
     code trying to close the GVS structures for that thread, but since the stat
     code uses __kmp_get_specific_gtid to get the gtid with the assumption that
     the calling thread is cleaning up itself instead of another thread, it gets
     confused. This happens because allowing a thread to unregister and clean up
     another thread is a recent modification for addressing an issue.
     Based on the current design (20050722), a thread may end up
     trying to unregister another thread only if thread death does not trigger
     the calling of __kmp_internal_end_thread. For Linux* OS, there is the
     thread specific data destructor function to detect thread death. For
     Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there
     is nothing. Thus, the workaround is applicable only for the Windows static
     stat library. */
  __kmp_internal_end_library(-1);
#if KMP_OS_WINDOWS
  __kmp_close_console();
#endif
}

static void __kmp_reap_thread(kmp_info_t *thread, int is_root) {
  // It is assumed __kmp_forkjoin_lock is acquired.

  int gtid;

  KMP_DEBUG_ASSERT(thread != NULL);

  gtid = thread->th.th_info.ds.ds_gtid;

  if (!is_root) {
    if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) {
      /* Assume the threads are at the fork barrier here */
      KA_TRACE(
          20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n",
               gtid));
      /* Need release fence here to prevent seg faults for tree forkjoin barrier
       * (GEH) */
      ANNOTATE_HAPPENS_BEFORE(thread);
      kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
      __kmp_release_64(&flag);
    }

    // Terminate OS thread.
    __kmp_reap_worker(thread);

    // The thread was killed asynchronously. If it was actively
    // spinning in the thread pool, decrement the global count.
    //
    // There is a small timing hole here - if the worker thread was just waking
    // up after sleeping in the pool, had reset its th_active_in_pool flag but
    // not decremented the global counter __kmp_thread_pool_active_nth yet, then
    // the global counter might not get updated.
5978 // 5979 // Currently, this can only happen as the library is unloaded, 5980 // so there are no harmful side effects. 5981 if (thread->th.th_active_in_pool) { 5982 thread->th.th_active_in_pool = FALSE; 5983 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5984 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5985 } 5986 } 5987 5988 __kmp_free_implicit_task(thread); 5989 5990 // Free the fast memory for tasking 5991 #if USE_FAST_MEMORY 5992 __kmp_free_fast_memory(thread); 5993 #endif /* USE_FAST_MEMORY */ 5994 5995 __kmp_suspend_uninitialize_thread(thread); 5996 5997 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5998 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5999 6000 --__kmp_all_nth; 6001 // __kmp_nth was decremented when thread is added to the pool. 6002 6003 #ifdef KMP_ADJUST_BLOCKTIME 6004 /* Adjust blocktime back to user setting or default if necessary */ 6005 /* Middle initialization might never have occurred */ 6006 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6007 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6008 if (__kmp_nth <= __kmp_avail_proc) { 6009 __kmp_zero_bt = FALSE; 6010 } 6011 } 6012 #endif /* KMP_ADJUST_BLOCKTIME */ 6013 6014 /* free the memory being used */ 6015 if (__kmp_env_consistency_check) { 6016 if (thread->th.th_cons) { 6017 __kmp_free_cons_stack(thread->th.th_cons); 6018 thread->th.th_cons = NULL; 6019 } 6020 } 6021 6022 if (thread->th.th_pri_common != NULL) { 6023 __kmp_free(thread->th.th_pri_common); 6024 thread->th.th_pri_common = NULL; 6025 } 6026 6027 if (thread->th.th_task_state_memo_stack != NULL) { 6028 __kmp_free(thread->th.th_task_state_memo_stack); 6029 thread->th.th_task_state_memo_stack = NULL; 6030 } 6031 6032 #if KMP_USE_BGET 6033 if (thread->th.th_local.bget_data != NULL) { 6034 __kmp_finalize_bget(thread); 6035 } 6036 #endif 6037 6038 #if KMP_AFFINITY_SUPPORTED 6039 if (thread->th.th_affin_mask != NULL) { 6040 KMP_CPU_FREE(thread->th.th_affin_mask); 6041 thread->th.th_affin_mask = NULL; 6042 } 6043 #endif /* KMP_AFFINITY_SUPPORTED */ 6044 6045 #if KMP_USE_HIER_SCHED 6046 if (thread->th.th_hier_bar_data != NULL) { 6047 __kmp_free(thread->th.th_hier_bar_data); 6048 thread->th.th_hier_bar_data = NULL; 6049 } 6050 #endif 6051 6052 __kmp_reap_team(thread->th.th_serial_team); 6053 thread->th.th_serial_team = NULL; 6054 __kmp_free(thread); 6055 6056 KMP_MB(); 6057 6058 } // __kmp_reap_thread 6059 6060 static void __kmp_internal_end(void) { 6061 int i; 6062 6063 /* First, unregister the library */ 6064 __kmp_unregister_library(); 6065 6066 #if KMP_OS_WINDOWS 6067 /* In Win static library, we can't tell when a root actually dies, so we 6068 reclaim the data structures for any root threads that have died but not 6069 unregistered themselves, in order to shut down cleanly. 6070 In Win dynamic library we also can't tell when a thread dies. */ 6071 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6072 // dead roots 6073 #endif 6074 6075 for (i = 0; i < __kmp_threads_capacity; i++) 6076 if (__kmp_root[i]) 6077 if (__kmp_root[i]->r.r_active) 6078 break; 6079 KMP_MB(); /* Flush all pending memory write invalidates. */ 6080 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6081 6082 if (i < __kmp_threads_capacity) { 6083 #if KMP_USE_MONITOR 6084 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6085 KMP_MB(); /* Flush all pending memory write invalidates. */ 6086 6087 // Need to check that monitor was initialized before reaping it. 
    // If we are called from __kmp_atfork_child (which sets
    // __kmp_init_parallel = 0), then __kmp_monitor will appear to contain
    // valid data, but it is only valid in the parent process, not the child.
    // New behavior (201008): instead of keying off of the flag
    // __kmp_init_parallel, the monitor thread creation is keyed off
    // of the new flag __kmp_init_monitor.
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (TCR_4(__kmp_init_monitor)) {
      __kmp_reap_monitor(&__kmp_monitor);
      TCW_4(__kmp_init_monitor, 0);
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
    KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n"));
#endif // KMP_USE_MONITOR
  } else {
    /* TODO move this to cleanup code */
#ifdef KMP_DEBUG
    /* make sure that everything has properly ended */
    for (i = 0; i < __kmp_threads_capacity; i++) {
      if (__kmp_root[i]) {
        // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC:
        // there can be uber threads alive here
        KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active?
      }
    }
#endif

    KMP_MB();

    // Reap the worker threads.
    // This is valid for now, but be careful if threads are reaped sooner.
    while (__kmp_thread_pool != NULL) { // Loop through all threads in the pool.
      // Get the next thread from the pool.
      kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool);
      __kmp_thread_pool = thread->th.th_next_pool;
      // Reap it.
      KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP);
      thread->th.th_next_pool = NULL;
      thread->th.th_in_pool = FALSE;
      __kmp_reap_thread(thread, 0);
    }
    __kmp_thread_pool_insert_pt = NULL;

    // Reap teams.
    while (__kmp_team_pool != NULL) { // Loop through all teams in the pool.
      // Get the next team from the pool.
      kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool);
      __kmp_team_pool = team->t.t_next_pool;
      // Reap it.
      team->t.t_next_pool = NULL;
      __kmp_reap_team(team);
    }

    __kmp_reap_task_teams();

#if KMP_OS_UNIX
    // Threads that are not reaped should not access any resources since they
    // are going to be deallocated soon, so the shutdown sequence should wait
    // until all threads either exit the final spin-waiting loop or begin
    // sleeping after the given blocktime.
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thr = __kmp_threads[i];
      while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking))
        KMP_CPU_PAUSE();
    }
#endif

    for (i = 0; i < __kmp_threads_capacity; ++i) {
      // TBD: Add some checking...
      // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL );
    }

    /* Make sure all threadprivate destructors get run by joining with all
       worker threads before resetting this flag */
    TCW_SYNC_4(__kmp_init_common, FALSE);

    KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n"));
    KMP_MB();

#if KMP_USE_MONITOR
    // See note above: One of the possible fixes for CQ138434 / CQ140126
    //
    // FIXME: push both code fragments down and CSE them?
    // push them into __kmp_cleanup() ?
6172 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6173 if (TCR_4(__kmp_init_monitor)) { 6174 __kmp_reap_monitor(&__kmp_monitor); 6175 TCW_4(__kmp_init_monitor, 0); 6176 } 6177 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6178 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6179 #endif 6180 } /* else !__kmp_global.t_active */ 6181 TCW_4(__kmp_init_gtid, FALSE); 6182 KMP_MB(); /* Flush all pending memory write invalidates. */ 6183 6184 __kmp_cleanup(); 6185 #if OMPT_SUPPORT 6186 ompt_fini(); 6187 #endif 6188 } 6189 6190 void __kmp_internal_end_library(int gtid_req) { 6191 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6192 /* this shouldn't be a race condition because __kmp_internal_end() is the 6193 only place to clear __kmp_serial_init */ 6194 /* we'll check this later too, after we get the lock */ 6195 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6196 // redundant, because the next check will work in any case. 6197 if (__kmp_global.g.g_abort) { 6198 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6199 /* TODO abort? */ 6200 return; 6201 } 6202 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6203 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6204 return; 6205 } 6206 6207 // If hidden helper team has been initialized, we need to deinit it 6208 if (TCR_4(__kmp_init_hidden_helper) && 6209 !TCR_4(__kmp_hidden_helper_team_done)) { 6210 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6211 // First release the main thread to let it continue its work 6212 __kmp_hidden_helper_main_thread_release(); 6213 // Wait until the hidden helper team has been destroyed 6214 __kmp_hidden_helper_threads_deinitz_wait(); 6215 } 6216 6217 KMP_MB(); /* Flush all pending memory write invalidates. */ 6218 /* find out who we are and what we should do */ 6219 { 6220 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6221 KA_TRACE( 6222 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6223 if (gtid == KMP_GTID_SHUTDOWN) { 6224 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6225 "already shutdown\n")); 6226 return; 6227 } else if (gtid == KMP_GTID_MONITOR) { 6228 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6229 "registered, or system shutdown\n")); 6230 return; 6231 } else if (gtid == KMP_GTID_DNE) { 6232 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6233 "shutdown\n")); 6234 /* we don't know who we are, but we may still shutdown the library */ 6235 } else if (KMP_UBER_GTID(gtid)) { 6236 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6237 if (__kmp_root[gtid]->r.r_active) { 6238 __kmp_global.g.g_abort = -1; 6239 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6240 __kmp_unregister_library(); 6241 KA_TRACE(10, 6242 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6243 gtid)); 6244 return; 6245 } else { 6246 KA_TRACE( 6247 10, 6248 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6249 __kmp_unregister_root_current_thread(gtid); 6250 } 6251 } else { 6252 /* worker threads may call this function through the atexit handler, if they 6253 * call exit() */ 6254 /* For now, skip the usual subsequent processing and just dump the debug buffer. 
6255 TODO: do a thorough shutdown instead */ 6256 #ifdef DUMP_DEBUG_ON_EXIT 6257 if (__kmp_debug_buf) 6258 __kmp_dump_debug_buffer(); 6259 #endif 6260 // added unregister library call here when we switch to shm linux 6261 // if we don't, it will leave lots of files in /dev/shm 6262 // cleanup shared memory file before exiting. 6263 __kmp_unregister_library(); 6264 return; 6265 } 6266 } 6267 /* synchronize the termination process */ 6268 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6269 6270 /* have we already finished */ 6271 if (__kmp_global.g.g_abort) { 6272 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6273 /* TODO abort? */ 6274 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6275 return; 6276 } 6277 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6278 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6279 return; 6280 } 6281 6282 /* We need this lock to enforce mutex between this reading of 6283 __kmp_threads_capacity and the writing by __kmp_register_root. 6284 Alternatively, we can use a counter of roots that is atomically updated by 6285 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6286 __kmp_internal_end_*. */ 6287 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6288 6289 /* now we can safely conduct the actual termination */ 6290 __kmp_internal_end(); 6291 6292 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6293 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6294 6295 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6296 6297 #ifdef DUMP_DEBUG_ON_EXIT 6298 if (__kmp_debug_buf) 6299 __kmp_dump_debug_buffer(); 6300 #endif 6301 6302 #if KMP_OS_WINDOWS 6303 __kmp_close_console(); 6304 #endif 6305 6306 __kmp_fini_allocator(); 6307 6308 } // __kmp_internal_end_library 6309 6310 void __kmp_internal_end_thread(int gtid_req) { 6311 int i; 6312 6313 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6314 /* this shouldn't be a race condition because __kmp_internal_end() is the 6315 * only place to clear __kmp_serial_init */ 6316 /* we'll check this later too, after we get the lock */ 6317 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6318 // redundant, because the next check will work in any case. 6319 if (__kmp_global.g.g_abort) { 6320 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6321 /* TODO abort? */ 6322 return; 6323 } 6324 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6325 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6326 return; 6327 } 6328 6329 // If hidden helper team has been initialized, we need to deinit it 6330 if (TCR_4(__kmp_init_hidden_helper) && 6331 !TCR_4(__kmp_hidden_helper_team_done)) { 6332 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6333 // First release the main thread to let it continue its work 6334 __kmp_hidden_helper_main_thread_release(); 6335 // Wait until the hidden helper team has been destroyed 6336 __kmp_hidden_helper_threads_deinitz_wait(); 6337 } 6338 6339 KMP_MB(); /* Flush all pending memory write invalidates. */ 6340 6341 /* find out who we are and what we should do */ 6342 { 6343 int gtid = (gtid_req >= 0) ? 
gtid_req : __kmp_gtid_get_specific(); 6344 KA_TRACE(10, 6345 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6346 if (gtid == KMP_GTID_SHUTDOWN) { 6347 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6348 "already shutdown\n")); 6349 return; 6350 } else if (gtid == KMP_GTID_MONITOR) { 6351 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6352 "registered, or system shutdown\n")); 6353 return; 6354 } else if (gtid == KMP_GTID_DNE) { 6355 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6356 "shutdown\n")); 6357 return; 6358 /* we don't know who we are */ 6359 } else if (KMP_UBER_GTID(gtid)) { 6360 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6361 if (__kmp_root[gtid]->r.r_active) { 6362 __kmp_global.g.g_abort = -1; 6363 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6364 KA_TRACE(10, 6365 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6366 gtid)); 6367 return; 6368 } else { 6369 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6370 gtid)); 6371 __kmp_unregister_root_current_thread(gtid); 6372 } 6373 } else { 6374 /* just a worker thread, let's leave */ 6375 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6376 6377 if (gtid >= 0) { 6378 __kmp_threads[gtid]->th.th_task_team = NULL; 6379 } 6380 6381 KA_TRACE(10, 6382 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6383 gtid)); 6384 return; 6385 } 6386 } 6387 #if KMP_DYNAMIC_LIB 6388 if (__kmp_pause_status != kmp_hard_paused) 6389 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6390 // because we will better shutdown later in the library destructor. 6391 { 6392 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6393 return; 6394 } 6395 #endif 6396 /* synchronize the termination process */ 6397 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6398 6399 /* have we already finished */ 6400 if (__kmp_global.g.g_abort) { 6401 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6402 /* TODO abort? */ 6403 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6404 return; 6405 } 6406 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6407 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6408 return; 6409 } 6410 6411 /* We need this lock to enforce mutex between this reading of 6412 __kmp_threads_capacity and the writing by __kmp_register_root. 6413 Alternatively, we can use a counter of roots that is atomically updated by 6414 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6415 __kmp_internal_end_*. */ 6416 6417 /* should we finish the run-time? are all siblings done? 
*/ 6418 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6419 6420 for (i = 0; i < __kmp_threads_capacity; ++i) { 6421 if (KMP_UBER_GTID(i)) { 6422 KA_TRACE( 6423 10, 6424 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6425 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6426 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6427 return; 6428 } 6429 } 6430 6431 /* now we can safely conduct the actual termination */ 6432 6433 __kmp_internal_end(); 6434 6435 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6436 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6437 6438 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6439 6440 #ifdef DUMP_DEBUG_ON_EXIT 6441 if (__kmp_debug_buf) 6442 __kmp_dump_debug_buffer(); 6443 #endif 6444 } // __kmp_internal_end_thread 6445 6446 // ----------------------------------------------------------------------------- 6447 // Library registration stuff. 6448 6449 static long __kmp_registration_flag = 0; 6450 // Random value used to indicate library initialization. 6451 static char *__kmp_registration_str = NULL; 6452 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6453 6454 static inline char *__kmp_reg_status_name() { 6455 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6456 each thread. If registration and unregistration go in different threads 6457 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6458 env var can not be found, because the name will contain different pid. */ 6459 // macOS* complains about name being too long with additional getuid() 6460 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6461 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6462 (int)getuid()); 6463 #else 6464 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6465 #endif 6466 } // __kmp_reg_status_get 6467 6468 void __kmp_register_library_startup(void) { 6469 6470 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6471 int done = 0; 6472 union { 6473 double dtime; 6474 long ltime; 6475 } time; 6476 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6477 __kmp_initialize_system_tick(); 6478 #endif 6479 __kmp_read_system_time(&time.dtime); 6480 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6481 __kmp_registration_str = 6482 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6483 __kmp_registration_flag, KMP_LIBRARY_FILE); 6484 6485 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6486 __kmp_registration_str)); 6487 6488 while (!done) { 6489 6490 char *value = NULL; // Actual value of the environment variable. 6491 6492 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6493 char *shm_name = __kmp_str_format("/%s", name); 6494 int shm_preexist = 0; 6495 char *data1; 6496 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6497 if ((fd1 == -1) && (errno == EEXIST)) { 6498 // file didn't open because it already exists. 6499 // try opening existing file 6500 fd1 = shm_open(shm_name, O_RDWR, 0666); 6501 if (fd1 == -1) { // file didn't open 6502 // error out here 6503 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6504 __kmp_msg_null); 6505 } else { 6506 // able to open existing file 6507 shm_preexist = 1; 6508 } 6509 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6510 // already exists. 6511 // error out here. 
6512 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6513 __kmp_msg_null); 6514 } 6515 if (shm_preexist == 0) { 6516 // we created SHM, now set its size 6517 if (ftruncate(fd1, SHM_SIZE) == -1) { 6518 // error occurred setting size; 6519 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6520 KMP_ERR(errno), __kmp_msg_null); 6521 } 6522 } 6523 data1 = 6524 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6525 if (data1 == MAP_FAILED) { 6526 // failed to map shared memory 6527 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6528 __kmp_msg_null); 6529 } 6530 if (shm_preexist == 0) { // set data to SHM, set value 6531 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6532 } 6533 // Read value from either what we just wrote or existing file. 6534 value = __kmp_str_format("%s", data1); // read value from SHM 6535 munmap(data1, SHM_SIZE); 6536 close(fd1); 6537 #else // Windows and unix with static library 6538 // Set the environment variable, but do not overwrite it if it already exists. 6539 __kmp_env_set(name, __kmp_registration_str, 0); 6540 // read value to see if it got set 6541 value = __kmp_env_get(name); 6542 #endif 6543 6544 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6545 done = 1; // Ok, environment variable set successfully, exit the loop. 6546 } else { 6547 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6548 // Check whether it is alive or dead. 6549 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6550 char *tail = value; 6551 char *flag_addr_str = NULL; 6552 char *flag_val_str = NULL; 6553 char const *file_name = NULL; 6554 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6555 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6556 file_name = tail; 6557 if (tail != NULL) { 6558 long *flag_addr = 0; 6559 unsigned long flag_val = 0; 6560 KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr)); 6561 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6562 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6563 // First, check whether environment-encoded address is mapped into 6564 // addr space. 6565 // If so, dereference it to see if it still has the right value. 6566 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6567 neighbor = 1; 6568 } else { 6569 // If not, then we know the other copy of the library is no longer 6570 // running. 6571 neighbor = 2; 6572 } 6573 } 6574 } 6575 switch (neighbor) { 6576 case 0: // Cannot parse environment variable -- neighbor status unknown. 6577 // Assume it is the incompatible format of a future version of the 6578 // library. Assume the other library is alive. 6579 // WARN( ... ); // TODO: Issue a warning. 6580 file_name = "unknown library"; 6581 KMP_FALLTHROUGH(); 6582 // Attention! Falling to the next case. That's intentional. 6583 case 1: { // Neighbor is alive. 6584 // Check it is allowed. 6585 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6586 if (!__kmp_str_match_true(duplicate_ok)) { 6587 // That's not allowed. Issue fatal error. 6588 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6589 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6590 } 6591 KMP_INTERNAL_FREE(duplicate_ok); 6592 __kmp_duplicate_library_ok = 1; 6593 done = 1; // Exit the loop. 6594 } break; 6595 case 2: { // Neighbor is dead. 6596 6597 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6598 // close shared memory.
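/* Illustrative sketch (hypothetical values, not the runtime's own code): the
   registration value written above is produced by
   __kmp_str_format("%p-%lx-%s", ...), so it looks like
   "0x7f3a2c001040-cafe1a2b-libomp.so".  A neighbor is judged alive only if
   the embedded flag address is still mapped in this process and still holds
   the embedded flag value; a value left behind by a dead process fails that
   test and ends up in this branch. */
#if 0 // example kept out of the build; 'sample' and 'file' are made up
  const char *sample = "0x7f3a2c001040-cafe1a2b-libomp.so";
  void *addr = NULL;
  unsigned long flag = 0;
  char file[64] = {0};
  // Split the three '-'-separated fields, much as __kmp_str_split() does.
  if (sscanf(sample, "%p-%lx-%63s", &addr, &flag, file) == 3) {
    // alive  <=>  addr is mapped here and *(long *)addr == (long)flag
    // dead   <=>  otherwise (this case): unlink the stale /dev/shm file.
  }
#endif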
6599 shm_unlink(shm_name); // this removes file in /dev/shm 6600 #else 6601 // Clear the variable and try to register library again. 6602 __kmp_env_unset(name); 6603 #endif 6604 } break; 6605 default: { 6606 KMP_DEBUG_ASSERT(0); 6607 } break; 6608 } 6609 } 6610 KMP_INTERNAL_FREE((void *)value); 6611 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6612 KMP_INTERNAL_FREE((void *)shm_name); 6613 #endif 6614 } // while 6615 KMP_INTERNAL_FREE((void *)name); 6616 6617 } // func __kmp_register_library_startup 6618 6619 void __kmp_unregister_library(void) { 6620 6621 char *name = __kmp_reg_status_name(); 6622 char *value = NULL; 6623 6624 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6625 char *shm_name = __kmp_str_format("/%s", name); 6626 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6627 if (fd1 == -1) { 6628 // file did not open. return. 6629 return; 6630 } 6631 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6632 if (data1 != MAP_FAILED) { 6633 value = __kmp_str_format("%s", data1); // read value from SHM 6634 munmap(data1, SHM_SIZE); 6635 } 6636 close(fd1); 6637 #else 6638 value = __kmp_env_get(name); 6639 #endif 6640 6641 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6642 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6643 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6644 // Ok, this is our variable. Delete it. 6645 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6646 shm_unlink(shm_name); // this removes file in /dev/shm 6647 #else 6648 __kmp_env_unset(name); 6649 #endif 6650 } 6651 6652 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6653 KMP_INTERNAL_FREE(shm_name); 6654 #endif 6655 6656 KMP_INTERNAL_FREE(__kmp_registration_str); 6657 KMP_INTERNAL_FREE(value); 6658 KMP_INTERNAL_FREE(name); 6659 6660 __kmp_registration_flag = 0; 6661 __kmp_registration_str = NULL; 6662 6663 } // __kmp_unregister_library 6664 6665 // End of Library registration stuff. 6666 // ----------------------------------------------------------------------------- 6667 6668 #if KMP_MIC_SUPPORTED 6669 6670 static void __kmp_check_mic_type() { 6671 kmp_cpuid_t cpuid_state = {0}; 6672 kmp_cpuid_t *cs_p = &cpuid_state; 6673 __kmp_x86_cpuid(1, 0, cs_p); 6674 // We don't support mic1 at the moment 6675 if ((cs_p->eax & 0xff0) == 0xB10) { 6676 __kmp_mic_type = mic2; 6677 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6678 __kmp_mic_type = mic3; 6679 } else { 6680 __kmp_mic_type = non_mic; 6681 } 6682 } 6683 6684 #endif /* KMP_MIC_SUPPORTED */ 6685 6686 #if KMP_HAVE_UMWAIT 6687 static void __kmp_user_level_mwait_init() { 6688 struct kmp_cpuid buf; 6689 __kmp_x86_cpuid(7, 0, &buf); 6690 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6691 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6692 __kmp_umwait_enabled)); 6693 } 6694 #elif KMP_HAVE_MWAIT 6695 #ifndef AT_INTELPHIUSERMWAIT 6696 // Spurious, non-existent value that should always fail to return anything. 6697 // Will be replaced with the correct value when we know that. 6698 #define AT_INTELPHIUSERMWAIT 10000 6699 #endif 6700 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6701 // earlier OS is used to build the RTL, we'll use the following internal 6702 // function when the entry is not found. 
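/* The weak stub defined below is only a fallback: when libc supplies a real
   (strong) getauxval, the linker binds callers to it and the stub is never
   used; on older systems the stub makes the lookup harmlessly return 0, so
   the decision falls back to the KMP_USER_LEVEL_MWAIT setting.  Sketch of
   the resulting check (hypothetical variable names): */
#if 0 // illustrative only
  unsigned long v = getauxval(AT_INTELPHIUSERMWAIT); // 0 if only the stub exists
  int enable = (v & 0x1) || user_requested_mwait;    // user_requested_mwait is made up
#endif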
6703 unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL; 6704 unsigned long getauxval(unsigned long) { return 0; } 6705 6706 static void __kmp_user_level_mwait_init() { 6707 // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available 6708 // use them to find if the user-level mwait is enabled. Otherwise, forcibly 6709 // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable 6710 // KMP_USER_LEVEL_MWAIT was set to TRUE. 6711 if (__kmp_mic_type == mic3) { 6712 unsigned long res = getauxval(AT_INTELPHIUSERMWAIT); 6713 if ((res & 0x1) || __kmp_user_level_mwait) { 6714 __kmp_mwait_enabled = TRUE; 6715 if (__kmp_user_level_mwait) { 6716 KMP_INFORM(EnvMwaitWarn); 6717 } 6718 } else { 6719 __kmp_mwait_enabled = FALSE; 6720 } 6721 } 6722 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, " 6723 "__kmp_mwait_enabled = %d\n", 6724 __kmp_mic_type, __kmp_mwait_enabled)); 6725 } 6726 #endif /* KMP_HAVE_UMWAIT */ 6727 6728 static void __kmp_do_serial_initialize(void) { 6729 int i, gtid; 6730 size_t size; 6731 6732 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6733 6734 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6735 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6736 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6737 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6738 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6739 6740 #if OMPT_SUPPORT 6741 ompt_pre_init(); 6742 #endif 6743 #if OMPD_SUPPORT 6744 __kmp_env_dump(); 6745 ompd_init(); 6746 #endif 6747 6748 __kmp_validate_locks(); 6749 6750 /* Initialize internal memory allocator */ 6751 __kmp_init_allocator(); 6752 6753 /* Register the library startup via an environment variable and check to see 6754 whether another copy of the library is already registered. 
*/ 6755 6756 __kmp_register_library_startup(); 6757 6758 /* TODO reinitialization of library */ 6759 if (TCR_4(__kmp_global.g.g_done)) { 6760 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6761 } 6762 6763 __kmp_global.g.g_abort = 0; 6764 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6765 6766 /* initialize the locks */ 6767 #if KMP_USE_ADAPTIVE_LOCKS 6768 #if KMP_DEBUG_ADAPTIVE_LOCKS 6769 __kmp_init_speculative_stats(); 6770 #endif 6771 #endif 6772 #if KMP_STATS_ENABLED 6773 __kmp_stats_init(); 6774 #endif 6775 __kmp_init_lock(&__kmp_global_lock); 6776 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6777 __kmp_init_lock(&__kmp_debug_lock); 6778 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6779 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6780 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6781 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6782 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6783 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6784 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6785 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6786 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6787 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6788 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6789 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6790 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6791 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6792 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6793 #if KMP_USE_MONITOR 6794 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6795 #endif 6796 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6797 6798 /* conduct initialization and initial setup of configuration */ 6799 6800 __kmp_runtime_initialize(); 6801 6802 #if KMP_MIC_SUPPORTED 6803 __kmp_check_mic_type(); 6804 #endif 6805 6806 // Some global variable initialization moved here from kmp_env_initialize() 6807 #ifdef KMP_DEBUG 6808 kmp_diag = 0; 6809 #endif 6810 __kmp_abort_delay = 0; 6811 6812 // From __kmp_init_dflt_team_nth() 6813 /* assume the entire machine will be used */ 6814 __kmp_dflt_team_nth_ub = __kmp_xproc; 6815 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6816 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6817 } 6818 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6819 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6820 } 6821 __kmp_max_nth = __kmp_sys_max_nth; 6822 __kmp_cg_max_nth = __kmp_sys_max_nth; 6823 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6824 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6825 __kmp_teams_max_nth = __kmp_sys_max_nth; 6826 } 6827 6828 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6829 // part 6830 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6831 #if KMP_USE_MONITOR 6832 __kmp_monitor_wakeups = 6833 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6834 __kmp_bt_intervals = 6835 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6836 #endif 6837 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6838 __kmp_library = library_throughput; 6839 // From KMP_SCHEDULE initialization 6840 __kmp_static = kmp_sch_static_balanced; 6841 // AC: do not use analytical here, because it is non-monotonous 6842 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6843 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6844 // need to repeat assignment 6845 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6846 // bit control and barrier method control parts 6847 #if KMP_FAST_REDUCTION_BARRIER 6848 #define kmp_reduction_barrier_gather_bb ((int)1) 6849 #define kmp_reduction_barrier_release_bb ((int)1) 6850 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6851 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6852 #endif // KMP_FAST_REDUCTION_BARRIER 6853 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6854 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6855 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6856 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6857 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6858 #if KMP_FAST_REDUCTION_BARRIER 6859 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6860 // lin_64 ): hyper,1 6861 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6862 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6863 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6864 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6865 } 6866 #endif // KMP_FAST_REDUCTION_BARRIER 6867 } 6868 #if KMP_FAST_REDUCTION_BARRIER 6869 #undef kmp_reduction_barrier_release_pat 6870 #undef kmp_reduction_barrier_gather_pat 6871 #undef kmp_reduction_barrier_release_bb 6872 #undef kmp_reduction_barrier_gather_bb 6873 #endif // KMP_FAST_REDUCTION_BARRIER 6874 #if KMP_MIC_SUPPORTED 6875 if (__kmp_mic_type == mic2) { // KNC 6876 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6877 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6878 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6879 1; // forkjoin release 6880 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6881 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6882 } 6883 #if KMP_FAST_REDUCTION_BARRIER 6884 if (__kmp_mic_type == mic2) { // KNC 6885 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6886 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6887 } 6888 #endif // KMP_FAST_REDUCTION_BARRIER 6889 #endif // KMP_MIC_SUPPORTED 6890 6891 // From KMP_CHECKS initialization 6892 #ifdef KMP_DEBUG 6893 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6894 #else 6895 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6896 #endif 6897 6898 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6899 __kmp_foreign_tp = TRUE; 6900 6901 __kmp_global.g.g_dynamic = FALSE; 6902 __kmp_global.g.g_dynamic_mode = dynamic_default; 6903 6904 __kmp_init_nesting_mode(); 6905 6906 __kmp_env_initialize(NULL); 6907 6908 #if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT 6909 __kmp_user_level_mwait_init(); 6910 #endif 6911 // Print all messages in message catalog for testing purposes. 
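/* Worked example of the branch-bit encoding set up above (illustrative): a
   branch-bit value b yields a barrier-tree fan-out of 2^b, so the KNC
   plain-barrier gather value of 3 means each parent gathers from up to
   2^3 = 8 children, while the reduction barrier's gather/release value of 1
   keeps a binary (2^1 = 2) hyper-barrier tree. */
#if 0 // illustrative only
  int branch_bits = 3;           // e.g., plain-barrier gather bits on KNC
  int fanout = 1 << branch_bits; // hyper/tree barriers branch by 2^bits -> 8
#endif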
6912 #ifdef KMP_DEBUG 6913 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6914 if (__kmp_str_match_true(val)) { 6915 kmp_str_buf_t buffer; 6916 __kmp_str_buf_init(&buffer); 6917 __kmp_i18n_dump_catalog(&buffer); 6918 __kmp_printf("%s", buffer.str); 6919 __kmp_str_buf_free(&buffer); 6920 } 6921 __kmp_env_free(&val); 6922 #endif 6923 6924 __kmp_threads_capacity = 6925 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6926 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6927 __kmp_tp_capacity = __kmp_default_tp_capacity( 6928 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6929 6930 // If the library is shut down properly, both pools must be NULL. Just in 6931 // case, set them to NULL -- some memory may leak, but subsequent code will 6932 // work even if pools are not freed. 6933 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6934 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6935 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6936 __kmp_thread_pool = NULL; 6937 __kmp_thread_pool_insert_pt = NULL; 6938 __kmp_team_pool = NULL; 6939 6940 /* Allocate all of the variable sized records */ 6941 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6942 * expandable */ 6943 /* Since allocation is cache-aligned, just add extra padding at the end */ 6944 size = 6945 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6946 CACHE_LINE; 6947 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6948 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6949 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6950 6951 /* init thread counts */ 6952 KMP_DEBUG_ASSERT(__kmp_all_nth == 6953 0); // Asserts fail if the library is reinitializing and 6954 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6955 __kmp_all_nth = 0; 6956 __kmp_nth = 0; 6957 6958 /* setup the uber master thread and hierarchy */ 6959 gtid = __kmp_register_root(TRUE); 6960 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6961 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6962 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6963 6964 KMP_MB(); /* Flush all pending memory write invalidates. */ 6965 6966 __kmp_common_initialize(); 6967 6968 #if KMP_OS_UNIX 6969 /* invoke the child fork handler */ 6970 __kmp_register_atfork(); 6971 #endif 6972 6973 #if !KMP_DYNAMIC_LIB 6974 { 6975 /* Invoke the exit handler when the program finishes, only for static 6976 library. For dynamic library, we already have _fini and DllMain. */ 6977 int rc = atexit(__kmp_internal_end_atexit); 6978 if (rc != 0) { 6979 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6980 __kmp_msg_null); 6981 } 6982 } 6983 #endif 6984 6985 #if KMP_HANDLE_SIGNALS 6986 #if KMP_OS_UNIX 6987 /* NOTE: make sure that this is called before the user installs their own 6988 signal handlers so that the user handlers are called first. this way they 6989 can return false, not call our handler, avoid terminating the library, and 6990 continue execution where they left off. 
*/ 6991 __kmp_install_signals(FALSE); 6992 #endif /* KMP_OS_UNIX */ 6993 #if KMP_OS_WINDOWS 6994 __kmp_install_signals(TRUE); 6995 #endif /* KMP_OS_WINDOWS */ 6996 #endif 6997 6998 /* we have finished the serial initialization */ 6999 __kmp_init_counter++; 7000 7001 __kmp_init_serial = TRUE; 7002 7003 if (__kmp_settings) { 7004 __kmp_env_print(); 7005 } 7006 7007 if (__kmp_display_env || __kmp_display_env_verbose) { 7008 __kmp_env_print_2(); 7009 } 7010 7011 #if OMPT_SUPPORT 7012 ompt_post_init(); 7013 #endif 7014 7015 KMP_MB(); 7016 7017 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 7018 } 7019 7020 void __kmp_serial_initialize(void) { 7021 if (__kmp_init_serial) { 7022 return; 7023 } 7024 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7025 if (__kmp_init_serial) { 7026 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7027 return; 7028 } 7029 __kmp_do_serial_initialize(); 7030 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7031 } 7032 7033 static void __kmp_do_middle_initialize(void) { 7034 int i, j; 7035 int prev_dflt_team_nth; 7036 7037 if (!__kmp_init_serial) { 7038 __kmp_do_serial_initialize(); 7039 } 7040 7041 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 7042 7043 // Save the previous value for the __kmp_dflt_team_nth so that 7044 // we can avoid some reinitialization if it hasn't changed. 7045 prev_dflt_team_nth = __kmp_dflt_team_nth; 7046 7047 #if KMP_AFFINITY_SUPPORTED 7048 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 7049 // number of cores on the machine. 7050 __kmp_affinity_initialize(); 7051 7052 #endif /* KMP_AFFINITY_SUPPORTED */ 7053 7054 KMP_ASSERT(__kmp_xproc > 0); 7055 if (__kmp_avail_proc == 0) { 7056 __kmp_avail_proc = __kmp_xproc; 7057 } 7058 7059 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 7060 // correct them now 7061 j = 0; 7062 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 7063 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 7064 __kmp_avail_proc; 7065 j++; 7066 } 7067 7068 if (__kmp_dflt_team_nth == 0) { 7069 #ifdef KMP_DFLT_NTH_CORES 7070 // Default #threads = #cores 7071 __kmp_dflt_team_nth = __kmp_ncores; 7072 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7073 "__kmp_ncores (%d)\n", 7074 __kmp_dflt_team_nth)); 7075 #else 7076 // Default #threads = #available OS procs 7077 __kmp_dflt_team_nth = __kmp_avail_proc; 7078 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 7079 "__kmp_avail_proc(%d)\n", 7080 __kmp_dflt_team_nth)); 7081 #endif /* KMP_DFLT_NTH_CORES */ 7082 } 7083 7084 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 7085 __kmp_dflt_team_nth = KMP_MIN_NTH; 7086 } 7087 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 7088 __kmp_dflt_team_nth = __kmp_sys_max_nth; 7089 } 7090 7091 if (__kmp_nesting_mode > 0) 7092 __kmp_set_nesting_mode_threads(); 7093 7094 // There's no harm in continuing if the following check fails, 7095 // but it indicates an error in the previous logic. 7096 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 7097 7098 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 7099 // Run through the __kmp_threads array and set the num threads icv for each 7100 // root thread that is currently registered with the RTL (which has not 7101 // already explicitly set its nthreads-var with a call to 7102 // omp_set_num_threads()). 
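/* Worked example (hypothetical machine): with OMP_NUM_THREADS=",,2,3" on a
   16-proc system, the empty slots at nesting levels 0 and 1 were filled in
   above with __kmp_avail_proc, so the effective list becomes 16,16,2,3 and
   __kmp_dflt_team_nth starts out as 16.  The loop below then pushes that new
   default only to roots whose nproc ICV is still 0, i.e. roots that have
   never called omp_set_num_threads(). */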
7103 for (i = 0; i < __kmp_threads_capacity; i++) { 7104 kmp_info_t *thread = __kmp_threads[i]; 7105 if (thread == NULL) 7106 continue; 7107 if (thread->th.th_current_task->td_icvs.nproc != 0) 7108 continue; 7109 7110 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 7111 } 7112 } 7113 KA_TRACE( 7114 20, 7115 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 7116 __kmp_dflt_team_nth)); 7117 7118 #ifdef KMP_ADJUST_BLOCKTIME 7119 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 7120 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 7121 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 7122 if (__kmp_nth > __kmp_avail_proc) { 7123 __kmp_zero_bt = TRUE; 7124 } 7125 } 7126 #endif /* KMP_ADJUST_BLOCKTIME */ 7127 7128 /* we have finished middle initialization */ 7129 TCW_SYNC_4(__kmp_init_middle, TRUE); 7130 7131 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 7132 } 7133 7134 void __kmp_middle_initialize(void) { 7135 if (__kmp_init_middle) { 7136 return; 7137 } 7138 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7139 if (__kmp_init_middle) { 7140 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7141 return; 7142 } 7143 __kmp_do_middle_initialize(); 7144 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7145 } 7146 7147 void __kmp_parallel_initialize(void) { 7148 int gtid = __kmp_entry_gtid(); // this might be a new root 7149 7150 /* synchronize parallel initialization (for sibling) */ 7151 if (TCR_4(__kmp_init_parallel)) 7152 return; 7153 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7154 if (TCR_4(__kmp_init_parallel)) { 7155 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7156 return; 7157 } 7158 7159 /* TODO reinitialization after we have already shut down */ 7160 if (TCR_4(__kmp_global.g.g_done)) { 7161 KA_TRACE( 7162 10, 7163 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 7164 __kmp_infinite_loop(); 7165 } 7166 7167 /* jc: The lock __kmp_initz_lock is already held, so calling 7168 __kmp_serial_initialize would cause a deadlock. So we call 7169 __kmp_do_serial_initialize directly. */ 7170 if (!__kmp_init_middle) { 7171 __kmp_do_middle_initialize(); 7172 } 7173 __kmp_assign_root_init_mask(); 7174 __kmp_resume_if_hard_paused(); 7175 7176 /* begin initialization */ 7177 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 7178 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7179 7180 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 7181 // Save the FP control regs. 7182 // Worker threads will set theirs to these values at thread startup. 
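/* What the capture below amounts to (sketch using the standard SSE intrinsic
   from <xmmintrin.h> rather than the runtime's own helpers): read MXCSR,
   then mask off the sticky exception-status bits so only the rounding mode
   and exception masks are propagated to workers. */
#if 0 // illustrative only
  unsigned int mxcsr = _mm_getcsr(); // rounding mode + masks + status flags
  mxcsr &= 0xffffffc0u;              // drop the 6 status bits (cf. KMP_X86_MXCSR_MASK)
  // A worker would adopt the saved value with _mm_setcsr(mxcsr).
#endif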
7183 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 7184 __kmp_store_mxcsr(&__kmp_init_mxcsr); 7185 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 7186 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 7187 7188 #if KMP_OS_UNIX 7189 #if KMP_HANDLE_SIGNALS 7190 /* must be after __kmp_serial_initialize */ 7191 __kmp_install_signals(TRUE); 7192 #endif 7193 #endif 7194 7195 __kmp_suspend_initialize(); 7196 7197 #if defined(USE_LOAD_BALANCE) 7198 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7199 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 7200 } 7201 #else 7202 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 7203 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7204 } 7205 #endif 7206 7207 if (__kmp_version) { 7208 __kmp_print_version_2(); 7209 } 7210 7211 /* we have finished parallel initialization */ 7212 TCW_SYNC_4(__kmp_init_parallel, TRUE); 7213 7214 KMP_MB(); 7215 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 7216 7217 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7218 } 7219 7220 void __kmp_hidden_helper_initialize() { 7221 if (TCR_4(__kmp_init_hidden_helper)) 7222 return; 7223 7224 // __kmp_parallel_initialize is required before we initialize hidden helper 7225 if (!TCR_4(__kmp_init_parallel)) 7226 __kmp_parallel_initialize(); 7227 7228 // Double check. Note that this double check should not be placed before 7229 // __kmp_parallel_initialize as it will cause dead lock. 7230 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7231 if (TCR_4(__kmp_init_hidden_helper)) { 7232 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7233 return; 7234 } 7235 7236 // Set the count of hidden helper tasks to be executed to zero 7237 KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0); 7238 7239 // Set the global variable indicating that we're initializing hidden helper 7240 // team/threads 7241 TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE); 7242 7243 // Platform independent initialization 7244 __kmp_do_initialize_hidden_helper_threads(); 7245 7246 // Wait here for the finish of initialization of hidden helper teams 7247 __kmp_hidden_helper_threads_initz_wait(); 7248 7249 // We have finished hidden helper initialization 7250 TCW_SYNC_4(__kmp_init_hidden_helper, TRUE); 7251 7252 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7253 } 7254 7255 /* ------------------------------------------------------------------------ */ 7256 7257 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7258 kmp_team_t *team) { 7259 kmp_disp_t *dispatch; 7260 7261 KMP_MB(); 7262 7263 /* none of the threads have encountered any constructs, yet. */ 7264 this_thr->th.th_local.this_construct = 0; 7265 #if KMP_CACHE_MANAGE 7266 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7267 #endif /* KMP_CACHE_MANAGE */ 7268 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7269 KMP_DEBUG_ASSERT(dispatch); 7270 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7271 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7272 // this_thr->th.th_info.ds.ds_tid ] ); 7273 7274 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7275 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7276 if (__kmp_env_consistency_check) 7277 __kmp_push_parallel(gtid, team->t.t_ident); 7278 7279 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7280 } 7281 7282 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7283 kmp_team_t *team) { 7284 if (__kmp_env_consistency_check) 7285 __kmp_pop_parallel(gtid, team->t.t_ident); 7286 7287 __kmp_finish_implicit_task(this_thr); 7288 } 7289 7290 int __kmp_invoke_task_func(int gtid) { 7291 int rc; 7292 int tid = __kmp_tid_from_gtid(gtid); 7293 kmp_info_t *this_thr = __kmp_threads[gtid]; 7294 kmp_team_t *team = this_thr->th.th_team; 7295 7296 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7297 #if USE_ITT_BUILD 7298 if (__itt_stack_caller_create_ptr) { 7299 // inform ittnotify about entering user's code 7300 if (team->t.t_stack_id != NULL) { 7301 __kmp_itt_stack_callee_enter((__itt_caller)team->t.t_stack_id); 7302 } else { 7303 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7304 __kmp_itt_stack_callee_enter( 7305 (__itt_caller)team->t.t_parent->t.t_stack_id); 7306 } 7307 } 7308 #endif /* USE_ITT_BUILD */ 7309 #if INCLUDE_SSC_MARKS 7310 SSC_MARK_INVOKING(); 7311 #endif 7312 7313 #if OMPT_SUPPORT 7314 void *dummy; 7315 void **exit_frame_p; 7316 ompt_data_t *my_task_data; 7317 ompt_data_t *my_parallel_data; 7318 int ompt_team_size; 7319 7320 if (ompt_enabled.enabled) { 7321 exit_frame_p = &(team->t.t_implicit_task_taskdata[tid] 7322 .ompt_task_info.frame.exit_frame.ptr); 7323 } else { 7324 exit_frame_p = &dummy; 7325 } 7326 7327 my_task_data = 7328 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7329 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7330 if (ompt_enabled.ompt_callback_implicit_task) { 7331 ompt_team_size = team->t.t_nproc; 7332 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7333 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7334 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7335 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7336 } 7337 #endif 7338 7339 #if KMP_STATS_ENABLED 7340 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7341 if (previous_state == stats_state_e::TEAMS_REGION) { 7342 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7343 } else { 7344 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7345 } 7346 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7347 #endif 7348 7349 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7350 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7351 #if OMPT_SUPPORT 7352 , 7353 exit_frame_p 7354 #endif 7355 ); 7356 #if OMPT_SUPPORT 7357 *exit_frame_p = NULL; 7358 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7359 #endif 7360 7361 #if KMP_STATS_ENABLED 7362 if (previous_state == stats_state_e::TEAMS_REGION) { 7363 KMP_SET_THREAD_STATE(previous_state); 7364 } 7365 KMP_POP_PARTITIONED_TIMER(); 7366 #endif 7367 7368 #if USE_ITT_BUILD 7369 if (__itt_stack_caller_create_ptr) { 7370 // inform ittnotify about leaving user's code 7371 if (team->t.t_stack_id != NULL) { 7372 __kmp_itt_stack_callee_leave((__itt_caller)team->t.t_stack_id); 7373 } else { 7374 KMP_DEBUG_ASSERT(team->t.t_parent->t.t_stack_id != NULL); 7375 __kmp_itt_stack_callee_leave( 7376 (__itt_caller)team->t.t_parent->t.t_stack_id); 7377 } 7378 } 7379 #endif /* USE_ITT_BUILD */ 7380 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7381 7382 return rc; 7383 } 7384 7385 void __kmp_teams_master(int gtid) { 7386 // This routine is called by all primary threads in teams construct 7387 kmp_info_t *thr = __kmp_threads[gtid]; 7388 kmp_team_t *team = thr->th.th_team; 7389 ident_t *loc = team->t.t_ident; 7390 
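/* Concrete example of the sizes used just below (hypothetical program):
     #pragma omp teams num_teams(4) thread_limit(8)
   forks a league with th_teams_size = {nteams = 4, nth = 8}, assuming the
   clamps in __kmp_push_thread_limit() leave those values unchanged; each of
   the 4 team primary threads then reaches this point and requests nth = 8
   threads when it forks the team that runs the teams body via
   __kmp_fork_call() below. */
#if 0 // user-level illustration, not runtime code
  #pragma omp teams num_teams(4) thread_limit(8)
  {
    #pragma omp parallel // each team primary forks up to 8 threads
    do_work();           // do_work() is a placeholder
  }
#endif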
thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7391 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7392 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7393 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7394 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7395 7396 // This thread is a new CG root. Set up the proper variables. 7397 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7398 tmp->cg_root = thr; // Make thr the CG root 7399 // Init to thread limit stored when league primary threads were forked 7400 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7401 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7402 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7403 " cg_nthreads to 1\n", 7404 thr, tmp)); 7405 tmp->up = thr->th.th_cg_roots; 7406 thr->th.th_cg_roots = tmp; 7407 7408 // Launch league of teams now, but not let workers execute 7409 // (they hang on fork barrier until next parallel) 7410 #if INCLUDE_SSC_MARKS 7411 SSC_MARK_FORKING(); 7412 #endif 7413 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7414 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7415 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7416 #if INCLUDE_SSC_MARKS 7417 SSC_MARK_JOINING(); 7418 #endif 7419 // If the team size was reduced from the limit, set it to the new size 7420 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7421 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7422 // AC: last parameter "1" eliminates join barrier which won't work because 7423 // worker threads are in a fork barrier waiting for more parallel regions 7424 __kmp_join_call(loc, gtid 7425 #if OMPT_SUPPORT 7426 , 7427 fork_context_intel 7428 #endif 7429 , 7430 1); 7431 } 7432 7433 int __kmp_invoke_teams_master(int gtid) { 7434 kmp_info_t *this_thr = __kmp_threads[gtid]; 7435 kmp_team_t *team = this_thr->th.th_team; 7436 #if KMP_DEBUG 7437 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7438 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7439 (void *)__kmp_teams_master); 7440 #endif 7441 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7442 #if OMPT_SUPPORT 7443 int tid = __kmp_tid_from_gtid(gtid); 7444 ompt_data_t *task_data = 7445 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7446 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7447 if (ompt_enabled.ompt_callback_implicit_task) { 7448 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7449 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7450 ompt_task_initial); 7451 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7452 } 7453 #endif 7454 __kmp_teams_master(gtid); 7455 #if OMPT_SUPPORT 7456 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7457 #endif 7458 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7459 return 1; 7460 } 7461 7462 /* this sets the requested number of threads for the next parallel region 7463 encountered by this team. 
since this should be enclosed in the forkjoin 7464 critical section it should avoid race conditions with asymmetrical nested 7465 parallelism */ 7466 7467 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7468 kmp_info_t *thr = __kmp_threads[gtid]; 7469 7470 if (num_threads > 0) 7471 thr->th.th_set_nproc = num_threads; 7472 } 7473 7474 static void __kmp_push_thread_limit(kmp_info_t *thr, int num_teams, 7475 int num_threads) { 7476 KMP_DEBUG_ASSERT(thr); 7477 // Remember the number of threads for inner parallel regions 7478 if (!TCR_4(__kmp_init_middle)) 7479 __kmp_middle_initialize(); // get internal globals calculated 7480 __kmp_assign_root_init_mask(); 7481 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7482 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7483 7484 if (num_threads == 0) { 7485 if (__kmp_teams_thread_limit > 0) { 7486 num_threads = __kmp_teams_thread_limit; 7487 } else { 7488 num_threads = __kmp_avail_proc / num_teams; 7489 } 7490 // adjust num_threads w/o warning as it is not user setting 7491 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7492 // no thread_limit clause specified - do not change thread-limit-var ICV 7493 if (num_threads > __kmp_dflt_team_nth) { 7494 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7495 } 7496 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7497 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7498 } // prevent team size to exceed thread-limit-var 7499 if (num_teams * num_threads > __kmp_teams_max_nth) { 7500 num_threads = __kmp_teams_max_nth / num_teams; 7501 } 7502 if (num_threads == 0) { 7503 num_threads = 1; 7504 } 7505 } else { 7506 // This thread will be the primary thread of the league primary threads 7507 // Store new thread limit; old limit is saved in th_cg_roots list 7508 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7509 // num_threads = min(num_threads, nthreads-var) 7510 if (num_threads > __kmp_dflt_team_nth) { 7511 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7512 } 7513 if (num_teams * num_threads > __kmp_teams_max_nth) { 7514 int new_threads = __kmp_teams_max_nth / num_teams; 7515 if (new_threads == 0) { 7516 new_threads = 1; 7517 } 7518 if (new_threads != num_threads) { 7519 if (!__kmp_reserve_warn) { // user asked for too many threads 7520 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7521 __kmp_msg(kmp_ms_warning, 7522 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7523 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7524 } 7525 } 7526 num_threads = new_threads; 7527 } 7528 } 7529 thr->th.th_teams_size.nth = num_threads; 7530 } 7531 7532 /* this sets the requested number of teams for the teams region and/or 7533 the number of threads for the next parallel region encountered */ 7534 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7535 int num_threads) { 7536 kmp_info_t *thr = __kmp_threads[gtid]; 7537 KMP_DEBUG_ASSERT(num_teams >= 0); 7538 KMP_DEBUG_ASSERT(num_threads >= 0); 7539 7540 if (num_teams == 0) { 7541 if (__kmp_nteams > 0) { 7542 num_teams = __kmp_nteams; 7543 } else { 7544 num_teams = 1; // default number of teams is 1. 7545 } 7546 } 7547 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
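/* Worked example (hypothetical numbers): on a 256-proc machine
   __kmp_teams_max_nth defaults to 256, so a request for 1024 teams enters
   this branch, warns once, and runs with 256 teams.  The companion
   __kmp_push_thread_limit() above sizes each team similarly: with no
   thread_limit clause and 4 teams it starts from 256 / 4 = 64 threads and
   then clamps against nthreads-var, thread-limit-var and
   __kmp_teams_max_nth / num_teams. */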
7548 if (!__kmp_reserve_warn) { 7549 __kmp_reserve_warn = 1; 7550 __kmp_msg(kmp_ms_warning, 7551 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7552 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7553 } 7554 num_teams = __kmp_teams_max_nth; 7555 } 7556 // Set number of teams (number of threads in the outer "parallel" of the 7557 // teams) 7558 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7559 7560 __kmp_push_thread_limit(thr, num_teams, num_threads); 7561 } 7562 7563 /* This sets the requested number of teams for the teams region and/or 7564 the number of threads for the next parallel region encountered */ 7565 void __kmp_push_num_teams_51(ident_t *id, int gtid, int num_teams_lb, 7566 int num_teams_ub, int num_threads) { 7567 kmp_info_t *thr = __kmp_threads[gtid]; 7568 KMP_DEBUG_ASSERT(num_teams_lb >= 0 && num_teams_ub >= 0); 7569 KMP_DEBUG_ASSERT(num_teams_ub >= num_teams_lb); 7570 KMP_DEBUG_ASSERT(num_threads >= 0); 7571 7572 if (num_teams_lb > num_teams_ub) { 7573 __kmp_fatal(KMP_MSG(FailedToCreateTeam, num_teams_lb, num_teams_ub), 7574 KMP_HNT(SetNewBound, __kmp_teams_max_nth), __kmp_msg_null); 7575 } 7576 7577 int num_teams = 1; // defalt number of teams is 1. 7578 7579 if (num_teams_lb == 0 && num_teams_ub > 0) 7580 num_teams_lb = num_teams_ub; 7581 7582 if (num_teams_lb == 0 && num_teams_ub == 0) { // no num_teams clause 7583 num_teams = (__kmp_nteams > 0) ? __kmp_nteams : num_teams; 7584 if (num_teams > __kmp_teams_max_nth) { 7585 if (!__kmp_reserve_warn) { 7586 __kmp_reserve_warn = 1; 7587 __kmp_msg(kmp_ms_warning, 7588 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7589 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7590 } 7591 num_teams = __kmp_teams_max_nth; 7592 } 7593 } else if (num_teams_lb == num_teams_ub) { // requires exact number of teams 7594 num_teams = num_teams_ub; 7595 } else { // num_teams_lb <= num_teams <= num_teams_ub 7596 if (num_threads == 0) { 7597 if (num_teams_ub > __kmp_teams_max_nth) { 7598 num_teams = num_teams_lb; 7599 } else { 7600 num_teams = num_teams_ub; 7601 } 7602 } else { 7603 num_teams = (num_threads > __kmp_teams_max_nth) 7604 ? num_teams 7605 : __kmp_teams_max_nth / num_threads; 7606 if (num_teams < num_teams_lb) { 7607 num_teams = num_teams_lb; 7608 } else if (num_teams > num_teams_ub) { 7609 num_teams = num_teams_ub; 7610 } 7611 } 7612 } 7613 // Set number of teams (number of threads in the outer "parallel" of the 7614 // teams) 7615 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7616 7617 __kmp_push_thread_limit(thr, num_teams, num_threads); 7618 } 7619 7620 // Set the proc_bind var to use in the following parallel region. 7621 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7622 kmp_info_t *thr = __kmp_threads[gtid]; 7623 thr->th.th_set_proc_bind = proc_bind; 7624 } 7625 7626 /* Launch the worker threads into the microtask. */ 7627 7628 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7629 kmp_info_t *this_thr = __kmp_threads[gtid]; 7630 7631 #ifdef KMP_DEBUG 7632 int f; 7633 #endif /* KMP_DEBUG */ 7634 7635 KMP_DEBUG_ASSERT(team); 7636 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7637 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7638 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7639 7640 team->t.t_construct = 0; /* no single directives seen yet */ 7641 team->t.t_ordered.dt.t_value = 7642 0; /* thread 0 enters the ordered section first */ 7643 7644 /* Reset the identifiers on the dispatch buffer */ 7645 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7646 if (team->t.t_max_nproc > 1) { 7647 int i; 7648 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7649 team->t.t_disp_buffer[i].buffer_index = i; 7650 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7651 } 7652 } else { 7653 team->t.t_disp_buffer[0].buffer_index = 0; 7654 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7655 } 7656 7657 KMP_MB(); /* Flush all pending memory write invalidates. */ 7658 KMP_ASSERT(this_thr->th.th_team == team); 7659 7660 #ifdef KMP_DEBUG 7661 for (f = 0; f < team->t.t_nproc; f++) { 7662 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7663 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7664 } 7665 #endif /* KMP_DEBUG */ 7666 7667 /* release the worker threads so they may begin working */ 7668 __kmp_fork_barrier(gtid, 0); 7669 } 7670 7671 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7672 kmp_info_t *this_thr = __kmp_threads[gtid]; 7673 7674 KMP_DEBUG_ASSERT(team); 7675 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7676 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7677 KMP_MB(); /* Flush all pending memory write invalidates. */ 7678 7679 /* Join barrier after fork */ 7680 7681 #ifdef KMP_DEBUG 7682 if (__kmp_threads[gtid] && 7683 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7684 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7685 __kmp_threads[gtid]); 7686 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7687 "team->t.t_nproc=%d\n", 7688 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7689 team->t.t_nproc); 7690 __kmp_print_structure(); 7691 } 7692 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7693 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7694 #endif /* KMP_DEBUG */ 7695 7696 __kmp_join_barrier(gtid); /* wait for everyone */ 7697 #if OMPT_SUPPORT 7698 if (ompt_enabled.enabled && 7699 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7700 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7701 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7702 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7703 #if OMPT_OPTIONAL 7704 void *codeptr = NULL; 7705 if (KMP_MASTER_TID(ds_tid) && 7706 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7707 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7708 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7709 7710 if (ompt_enabled.ompt_callback_sync_region_wait) { 7711 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7712 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7713 codeptr); 7714 } 7715 if (ompt_enabled.ompt_callback_sync_region) { 7716 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7717 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7718 codeptr); 7719 } 7720 #endif 7721 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7722 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7723 ompt_scope_end, NULL, task_data, 0, ds_tid, 7724 ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7725 } 7726 } 7727 #endif 7728 7729 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7730 KMP_ASSERT(this_thr->th.th_team == team); 7731 } 7732 7733 /* ------------------------------------------------------------------------ */ 7734 7735 #ifdef USE_LOAD_BALANCE 7736 7737 // Return the worker threads actively spinning in the hot team, if we 7738 // are at the outermost level of parallelism. Otherwise, return 0. 7739 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7740 int i; 7741 int retval; 7742 kmp_team_t *hot_team; 7743 7744 if (root->r.r_active) { 7745 return 0; 7746 } 7747 hot_team = root->r.r_hot_team; 7748 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7749 return hot_team->t.t_nproc - 1; // Don't count primary thread 7750 } 7751 7752 // Skip the primary thread - it is accounted for elsewhere. 7753 retval = 0; 7754 for (i = 1; i < hot_team->t.t_nproc; i++) { 7755 if (hot_team->t.t_threads[i]->th.th_active) { 7756 retval++; 7757 } 7758 } 7759 return retval; 7760 } 7761 7762 // Perform an automatic adjustment to the number of 7763 // threads used by the next parallel region. 7764 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7765 int retval; 7766 int pool_active; 7767 int hot_team_active; 7768 int team_curr_active; 7769 int system_active; 7770 7771 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7772 set_nproc)); 7773 KMP_DEBUG_ASSERT(root); 7774 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7775 ->th.th_current_task->td_icvs.dynamic == TRUE); 7776 KMP_DEBUG_ASSERT(set_nproc > 1); 7777 7778 if (set_nproc == 1) { 7779 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7780 return 1; 7781 } 7782 7783 // Threads that are active in the thread pool, active in the hot team for this 7784 // particular root (if we are at the outer par level), and the currently 7785 // executing thread (to become the primary thread) are available to add to the 7786 // new team, but are currently contributing to the system load, and must be 7787 // accounted for. 7788 pool_active = __kmp_thread_pool_active_nth; 7789 hot_team_active = __kmp_active_hot_team_nproc(root); 7790 team_curr_active = pool_active + hot_team_active + 1; 7791 7792 // Check the system load. 7793 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7794 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7795 "hot team active = %d\n", 7796 system_active, pool_active, hot_team_active)); 7797 7798 if (system_active < 0) { 7799 // There was an error reading the necessary info from /proc, so use the 7800 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7801 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7802 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7803 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7804 7805 // Make this call behave like the thread limit algorithm. 7806 retval = __kmp_avail_proc - __kmp_nth + 7807 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7808 if (retval > set_nproc) { 7809 retval = set_nproc; 7810 } 7811 if (retval < KMP_MIN_NTH) { 7812 retval = KMP_MIN_NTH; 7813 } 7814 7815 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7816 retval)); 7817 return retval; 7818 } 7819 7820 // There is a slight delay in the load balance algorithm in detecting new 7821 // running procs. The real system load at this instant should be at least as 7822 // large as the #active omp thread that are available to add to the team. 
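/* Worked example of the computation below (hypothetical numbers): with
   __kmp_avail_proc = 16, pool_active = 2 and hot_team_active = 5, the
   candidate team (pool + hot team + this thread) is team_curr_active = 8.
   If the OS reports system_active = 10, the threads busy outside this
   runtime account for 10 - 8 = 2 procs, so retval = 16 - 10 + 8 = 14 threads
   can be used before oversubscribing; the result is then clamped to
   [KMP_MIN_NTH, set_nproc]. */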
7823 if (system_active < team_curr_active) { 7824 system_active = team_curr_active; 7825 } 7826 retval = __kmp_avail_proc - system_active + team_curr_active; 7827 if (retval > set_nproc) { 7828 retval = set_nproc; 7829 } 7830 if (retval < KMP_MIN_NTH) { 7831 retval = KMP_MIN_NTH; 7832 } 7833 7834 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7835 return retval; 7836 } // __kmp_load_balance_nproc() 7837 7838 #endif /* USE_LOAD_BALANCE */ 7839 7840 /* ------------------------------------------------------------------------ */ 7841 7842 /* NOTE: this is called with the __kmp_init_lock held */ 7843 void __kmp_cleanup(void) { 7844 int f; 7845 7846 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7847 7848 if (TCR_4(__kmp_init_parallel)) { 7849 #if KMP_HANDLE_SIGNALS 7850 __kmp_remove_signals(); 7851 #endif 7852 TCW_4(__kmp_init_parallel, FALSE); 7853 } 7854 7855 if (TCR_4(__kmp_init_middle)) { 7856 #if KMP_AFFINITY_SUPPORTED 7857 __kmp_affinity_uninitialize(); 7858 #endif /* KMP_AFFINITY_SUPPORTED */ 7859 __kmp_cleanup_hierarchy(); 7860 TCW_4(__kmp_init_middle, FALSE); 7861 } 7862 7863 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7864 7865 if (__kmp_init_serial) { 7866 __kmp_runtime_destroy(); 7867 __kmp_init_serial = FALSE; 7868 } 7869 7870 __kmp_cleanup_threadprivate_caches(); 7871 7872 for (f = 0; f < __kmp_threads_capacity; f++) { 7873 if (__kmp_root[f] != NULL) { 7874 __kmp_free(__kmp_root[f]); 7875 __kmp_root[f] = NULL; 7876 } 7877 } 7878 __kmp_free(__kmp_threads); 7879 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7880 // there is no need in freeing __kmp_root. 7881 __kmp_threads = NULL; 7882 __kmp_root = NULL; 7883 __kmp_threads_capacity = 0; 7884 7885 #if KMP_USE_DYNAMIC_LOCK 7886 __kmp_cleanup_indirect_user_locks(); 7887 #else 7888 __kmp_cleanup_user_locks(); 7889 #endif 7890 #if OMPD_SUPPORT 7891 if (ompd_state) { 7892 __kmp_free(ompd_env_block); 7893 ompd_env_block = NULL; 7894 ompd_env_block_size = 0; 7895 } 7896 #endif 7897 7898 #if KMP_AFFINITY_SUPPORTED 7899 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7900 __kmp_cpuinfo_file = NULL; 7901 #endif /* KMP_AFFINITY_SUPPORTED */ 7902 7903 #if KMP_USE_ADAPTIVE_LOCKS 7904 #if KMP_DEBUG_ADAPTIVE_LOCKS 7905 __kmp_print_speculative_stats(); 7906 #endif 7907 #endif 7908 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7909 __kmp_nested_nth.nth = NULL; 7910 __kmp_nested_nth.size = 0; 7911 __kmp_nested_nth.used = 0; 7912 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7913 __kmp_nested_proc_bind.bind_types = NULL; 7914 __kmp_nested_proc_bind.size = 0; 7915 __kmp_nested_proc_bind.used = 0; 7916 if (__kmp_affinity_format) { 7917 KMP_INTERNAL_FREE(__kmp_affinity_format); 7918 __kmp_affinity_format = NULL; 7919 } 7920 7921 __kmp_i18n_catclose(); 7922 7923 #if KMP_USE_HIER_SCHED 7924 __kmp_hier_scheds.deallocate(); 7925 #endif 7926 7927 #if KMP_STATS_ENABLED 7928 __kmp_stats_fini(); 7929 #endif 7930 7931 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7932 } 7933 7934 /* ------------------------------------------------------------------------ */ 7935 7936 int __kmp_ignore_mppbeg(void) { 7937 char *env; 7938 7939 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7940 if (__kmp_str_match_false(env)) 7941 return FALSE; 7942 } 7943 // By default __kmpc_begin() is no-op. 
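/* Example of the environment semantics (spellings per __kmp_str_match_false,
   e.g. "0", "false", "no", "off"): only an explicitly false
   KMP_IGNORE_MPPBEG makes this return FALSE so that __kmpc_begin() does real
   work; unset or any other value keeps it a no-op.  __kmp_ignore_mppend()
   below treats KMP_IGNORE_MPPEND the same way for __kmpc_end(). */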
7944 return TRUE; 7945 } 7946 7947 int __kmp_ignore_mppend(void) { 7948 char *env; 7949 7950 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7951 if (__kmp_str_match_false(env)) 7952 return FALSE; 7953 } 7954 // By default __kmpc_end() is no-op. 7955 return TRUE; 7956 } 7957 7958 void __kmp_internal_begin(void) { 7959 int gtid; 7960 kmp_root_t *root; 7961 7962 /* this is a very important step as it will register new sibling threads 7963 and assign these new uber threads a new gtid */ 7964 gtid = __kmp_entry_gtid(); 7965 root = __kmp_threads[gtid]->th.th_root; 7966 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7967 7968 if (root->r.r_begin) 7969 return; 7970 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7971 if (root->r.r_begin) { 7972 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7973 return; 7974 } 7975 7976 root->r.r_begin = TRUE; 7977 7978 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7979 } 7980 7981 /* ------------------------------------------------------------------------ */ 7982 7983 void __kmp_user_set_library(enum library_type arg) { 7984 int gtid; 7985 kmp_root_t *root; 7986 kmp_info_t *thread; 7987 7988 /* first, make sure we are initialized so we can get our gtid */ 7989 7990 gtid = __kmp_entry_gtid(); 7991 thread = __kmp_threads[gtid]; 7992 7993 root = thread->th.th_root; 7994 7995 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7996 library_serial)); 7997 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7998 thread */ 7999 KMP_WARNING(SetLibraryIncorrectCall); 8000 return; 8001 } 8002 8003 switch (arg) { 8004 case library_serial: 8005 thread->th.th_set_nproc = 0; 8006 set__nproc(thread, 1); 8007 break; 8008 case library_turnaround: 8009 thread->th.th_set_nproc = 0; 8010 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8011 : __kmp_dflt_team_nth_ub); 8012 break; 8013 case library_throughput: 8014 thread->th.th_set_nproc = 0; 8015 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 8016 : __kmp_dflt_team_nth_ub); 8017 break; 8018 default: 8019 KMP_FATAL(UnknownLibraryType, arg); 8020 } 8021 8022 __kmp_aux_set_library(arg); 8023 } 8024 8025 void __kmp_aux_set_stacksize(size_t arg) { 8026 if (!__kmp_init_serial) 8027 __kmp_serial_initialize(); 8028 8029 #if KMP_OS_DARWIN 8030 if (arg & (0x1000 - 1)) { 8031 arg &= ~(0x1000 - 1); 8032 if (arg + 0x1000) /* check for overflow if we round up */ 8033 arg += 0x1000; 8034 } 8035 #endif 8036 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8037 8038 /* only change the default stacksize before the first parallel region */ 8039 if (!TCR_4(__kmp_init_parallel)) { 8040 size_t value = arg; /* argument is in bytes */ 8041 8042 if (value < __kmp_sys_min_stksize) 8043 value = __kmp_sys_min_stksize; 8044 else if (value > KMP_MAX_STKSIZE) 8045 value = KMP_MAX_STKSIZE; 8046 8047 __kmp_stksize = value; 8048 8049 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 8050 } 8051 8052 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8053 } 8054 8055 /* set the behaviour of the runtime library */ 8056 /* TODO this can cause some odd behaviour with sibling parallelism... 
/* set the behaviour of the runtime library */
/* TODO this can cause some odd behaviour with sibling parallelism... */
void __kmp_aux_set_library(enum library_type arg) {
  __kmp_library = arg;

  switch (__kmp_library) {
  case library_serial: {
    KMP_INFORM(LibraryIsSerial);
  } break;
  case library_turnaround:
    if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set)
      __kmp_use_yield = 2; // only yield when oversubscribed
    break;
  case library_throughput:
    if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME)
      __kmp_dflt_blocktime = 200;
    break;
  default:
    KMP_FATAL(UnknownLibraryType, arg);
  }
}

/* Getting team information common for all team API */
// Returns NULL if not in teams construct
static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) {
  kmp_info_t *thr = __kmp_entry_thread();
  teams_serialized = 0;
  if (thr->th.th_teams_microtask) {
    kmp_team_t *team = thr->th.th_team;
    int tlevel = thr->th.th_teams_level; // the level of the teams construct
    int ii = team->t.t_level;
    teams_serialized = team->t.t_serialized;
    int level = tlevel + 1;
    KMP_DEBUG_ASSERT(ii >= tlevel);
    while (ii > level) {
      for (teams_serialized = team->t.t_serialized;
           (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) {
      }
      if (team->t.t_serialized && (!teams_serialized)) {
        team = team->t.t_parent;
        continue;
      }
      if (ii > level) {
        team = team->t.t_parent;
        ii--;
      }
    }
    return team;
  }
  return NULL;
}

int __kmp_aux_get_team_num() {
  int serialized;
  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
  if (team) {
    if (serialized > 1) {
      return 0; // teams region is serialized ( 1 team of 1 thread ).
    } else {
      return team->t.t_master_tid;
    }
  }
  return 0;
}

int __kmp_aux_get_num_teams() {
  int serialized;
  kmp_team_t *team = __kmp_aux_get_team_info(serialized);
  if (team) {
    if (serialized > 1) {
      return 1;
    } else {
      return team->t.t_parent->t.t_nproc;
    }
  }
  return 1;
}

/* ------------------------------------------------------------------------ */

/*
 * Affinity Format Parser
 *
 * Field is in form of: %[[[0].]size]type
 * % and type are required (%% means print a literal '%')
 * type is either a single char or a long name surrounded by {},
 * e.g., N or {num_threads}
 * 0 => leading zeros
 * . => right justified when size is specified
 * by default output is left justified
 * size is the *minimum* field length
 * All other characters are printed as is
 *
 * Available field types (short name, {long name}); see the table below:
 * t {team_num}          - omp_get_team_num()
 * T {num_teams}         - omp_get_num_teams()
 * L {nesting_level}     - omp_get_level()
 * n {thread_num}        - omp_get_thread_num()
 * N {num_threads}       - omp_get_num_threads()
 * a {ancestor_tnum}     - omp_get_ancestor_thread_num(omp_get_level()-1)
 * H {host}              - name of host machine
 * P {process_id}        - process id (integer)
 * i {native_thread_id}  - native thread identifier (integer)
 * A {thread_affinity}   - comma separated list of integers or integer ranges
 *                         (values of affinity mask)
 *
 * Implementation-specific field types can be added
 * If a type is unknown, print "undefined"
 */
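/* Usage sketch (illustrative only, not compiled into the runtime): the format
   above is consumed through the OMP_DISPLAY_AFFINITY / OMP_AFFINITY_FORMAT
   environment variables or through the OpenMP 5.0 display/capture API, which
   reaches __kmp_aux_display_affinity() / __kmp_aux_capture_affinity() below.

     #include <omp.h>
     #include <stdio.h>

     int main() {
       // Zero-padded, right-justified, minimum width 4 thread number
     #pragma omp parallel
       omp_display_affinity("host=%H pid=%P thread=%0.4n binds to %A");

       // Capture instead of printing; the return value is the length needed
     #pragma omp parallel
       {
         char buf[256];
         size_t n = omp_capture_affinity(buf, sizeof(buf),
                                         "%{thread_num} of %N");
         if (n < sizeof(buf))
           printf("%s\n", buf);
       }
       return 0;
     }
*/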
// Structure holding the short name, long name, and corresponding data type
// for snprintf. A table of these will represent the entire valid keyword
// field types.
typedef struct kmp_affinity_format_field_t {
  char short_name; // from spec e.g., L -> thread level
  const char *long_name; // from spec thread_level -> thread level
  char field_format; // data type for snprintf (typically 'd' or 's'
                     // for integer or string)
} kmp_affinity_format_field_t;

static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = {
#if KMP_AFFINITY_SUPPORTED
    {'A', "thread_affinity", 's'},
#endif
    {'t', "team_num", 'd'},
    {'T', "num_teams", 'd'},
    {'L', "nesting_level", 'd'},
    {'n', "thread_num", 'd'},
    {'N', "num_threads", 'd'},
    {'a', "ancestor_tnum", 'd'},
    {'H', "host", 's'},
    {'P', "process_id", 'd'},
    {'i', "native_thread_id", 'd'}};

// Return the number of characters it takes to hold field
static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th,
                                            const char **ptr,
                                            kmp_str_buf_t *field_buffer) {
  int rc, format_index, field_value;
  const char *width_left, *width_right;
  bool pad_zeros, right_justify, parse_long_name, found_valid_name;
  static const int FORMAT_SIZE = 20;
  char format[FORMAT_SIZE] = {0};
  char absolute_short_name = 0;

  KMP_DEBUG_ASSERT(gtid >= 0);
  KMP_DEBUG_ASSERT(th);
  KMP_DEBUG_ASSERT(**ptr == '%');
  KMP_DEBUG_ASSERT(field_buffer);

  __kmp_str_buf_clear(field_buffer);

  // Skip the initial %
  (*ptr)++;

  // Check for %% first
  if (**ptr == '%') {
    __kmp_str_buf_cat(field_buffer, "%", 1);
    (*ptr)++; // skip over the second %
    return 1;
  }

  // Parse field modifiers if they are present
  pad_zeros = false;
  if (**ptr == '0') {
    pad_zeros = true;
    (*ptr)++; // skip over 0
  }
  right_justify = false;
  if (**ptr == '.') {
    right_justify = true;
    (*ptr)++; // skip over .
  }
  // Parse width of field: [width_left, width_right)
  width_left = width_right = NULL;
  if (**ptr >= '0' && **ptr <= '9') {
    width_left = *ptr;
    SKIP_DIGITS(*ptr);
    width_right = *ptr;
  }

  // Create the format for KMP_SNPRINTF based on flags parsed above
  format_index = 0;
  format[format_index++] = '%';
  if (!right_justify)
    format[format_index++] = '-';
  if (pad_zeros)
    format[format_index++] = '0';
  if (width_left && width_right) {
    int i = 0;
    // Only allow 8 digit number widths.
    // This also prevents overflowing the format variable
    while (i < 8 && width_left < width_right) {
      format[format_index++] = *width_left;
      width_left++;
      i++;
    }
  }

  // Parse a name (long or short)
  // Canonicalize the name into absolute_short_name
  found_valid_name = false;
  parse_long_name = (**ptr == '{');
  if (parse_long_name)
    (*ptr)++; // skip initial left brace
  for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) /
                             sizeof(__kmp_affinity_format_table[0]);
       ++i) {
    char short_name = __kmp_affinity_format_table[i].short_name;
    const char *long_name = __kmp_affinity_format_table[i].long_name;
    char field_format = __kmp_affinity_format_table[i].field_format;
    if (parse_long_name) {
      size_t length = KMP_STRLEN(long_name);
      if (strncmp(*ptr, long_name, length) == 0) {
        found_valid_name = true;
        (*ptr) += length; // skip the long name
      }
    } else if (**ptr == short_name) {
      found_valid_name = true;
      (*ptr)++; // skip the short name
    }
    if (found_valid_name) {
      format[format_index++] = field_format;
      format[format_index++] = '\0';
      absolute_short_name = short_name;
      break;
    }
  }
  if (parse_long_name) {
    if (**ptr != '}') {
      absolute_short_name = 0;
    } else {
      (*ptr)++; // skip over the right brace
    }
  }

  // Attempt to fill the buffer with the requested
  // value using snprintf within __kmp_str_buf_print()
  switch (absolute_short_name) {
  case 't':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num());
    break;
  case 'T':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams());
    break;
  case 'L':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level);
    break;
  case 'n':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid));
    break;
  case 'H': {
    static const int BUFFER_SIZE = 256;
    char buf[BUFFER_SIZE];
    __kmp_expand_host_name(buf, BUFFER_SIZE);
    rc = __kmp_str_buf_print(field_buffer, format, buf);
  } break;
  case 'P':
    rc = __kmp_str_buf_print(field_buffer, format, getpid());
    break;
  case 'i':
    rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid());
    break;
  case 'N':
    rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc);
    break;
  case 'a':
    field_value =
        __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1);
    rc = __kmp_str_buf_print(field_buffer, format, field_value);
    break;
#if KMP_AFFINITY_SUPPORTED
  case 'A': {
    kmp_str_buf_t buf;
    __kmp_str_buf_init(&buf);
    __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask);
    rc = __kmp_str_buf_print(field_buffer, format, buf.str);
    __kmp_str_buf_free(&buf);
  } break;
#endif
  default:
    // According to the spec, if an implementation does not have info for a
    // field type, then "undefined" is printed
    rc = __kmp_str_buf_print(field_buffer, "%s", "undefined");
    // Skip the field
    if (parse_long_name) {
      SKIP_TOKEN(*ptr);
      if (**ptr == '}')
        (*ptr)++;
    } else {
      (*ptr)++;
    }
  }

  KMP_ASSERT(format_index <= FORMAT_SIZE);
  return rc;
}

/*
 * Return the number of characters needed to hold the affinity string
 * (not including the terminating null character).
 * The resultant string is printed to buffer,
 * which the caller can then handle afterwards
 */
size_t __kmp_aux_capture_affinity(int gtid, const char *format,
                                  kmp_str_buf_t *buffer) {
  const char *parse_ptr;
  size_t retval;
  const kmp_info_t *th;
  kmp_str_buf_t field;

  KMP_DEBUG_ASSERT(buffer);
  KMP_DEBUG_ASSERT(gtid >= 0);

  __kmp_str_buf_init(&field);
  __kmp_str_buf_clear(buffer);

  th = __kmp_threads[gtid];
  retval = 0;

  // If format is NULL or a zero-length string, then we use
  // the affinity-format-var ICV
  parse_ptr = format;
  if (parse_ptr == NULL || *parse_ptr == '\0') {
    parse_ptr = __kmp_affinity_format;
  }
  KMP_DEBUG_ASSERT(parse_ptr);

  while (*parse_ptr != '\0') {
    // Parse a field
    if (*parse_ptr == '%') {
      // Put field in the buffer
      int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field);
      __kmp_str_buf_catbuf(buffer, &field);
      retval += rc;
    } else {
      // Put literal character in buffer
      __kmp_str_buf_cat(buffer, parse_ptr, 1);
      retval++;
      parse_ptr++;
    }
  }
  __kmp_str_buf_free(&field);
  return retval;
}

// Displays the affinity string to stdout
void __kmp_aux_display_affinity(int gtid, const char *format) {
  kmp_str_buf_t buf;
  __kmp_str_buf_init(&buf);
  __kmp_aux_capture_affinity(gtid, format, &buf);
  __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str);
  __kmp_str_buf_free(&buf);
}

/* ------------------------------------------------------------------------ */

void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) {
  int blocktime = arg; /* argument is in milliseconds */
#if KMP_USE_MONITOR
  int bt_intervals;
#endif
  kmp_int8 bt_set;

  __kmp_save_internal_controls(thread);

  /* Normalize and set blocktime for the teams */
  if (blocktime < KMP_MIN_BLOCKTIME)
    blocktime = KMP_MIN_BLOCKTIME;
  else if (blocktime > KMP_MAX_BLOCKTIME)
    blocktime = KMP_MAX_BLOCKTIME;

  set__blocktime_team(thread->th.th_team, tid, blocktime);
  set__blocktime_team(thread->th.th_serial_team, 0, blocktime);

#if KMP_USE_MONITOR
  /* Calculate and set blocktime intervals for the teams */
  bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups);

  set__bt_intervals_team(thread->th.th_team, tid, bt_intervals);
  set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals);
#endif

  /* Record that blocktime has been set explicitly */
  bt_set = TRUE;

  set__bt_set_team(thread->th.th_team, tid, bt_set);
  set__bt_set_team(thread->th.th_serial_team, 0, bt_set);
#if KMP_USE_MONITOR
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, "
                "bt_intervals=%d, monitor_updates=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime, bt_intervals,
                __kmp_monitor_wakeups));
#else
  KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n",
                __kmp_gtid_from_tid(tid, thread->th.th_team),
                thread->th.th_team->t.t_id, tid, blocktime));
#endif
}

void __kmp_aux_set_defaults(char const *str, size_t len) {
  if (!__kmp_init_serial) {
    __kmp_serial_initialize();
  }
  __kmp_env_initialize(str);

  if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print();
  }
} // __kmp_aux_set_defaults
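/* Usage sketch (illustrative only, not compiled into the runtime): the block
   time and other defaults handled by the two helpers above are usually driven
   by environment variables (KMP_BLOCKTIME, KMP_SETTINGS, ...) or, where the
   LLVM omp.h extensions kmp_set_blocktime() / kmp_set_defaults() are
   available, set programmatically before the first parallel region.

     #include <omp.h>

     int main() {
       kmp_set_defaults("KMP_SETTINGS=1"); // ask the runtime to print settings
       kmp_set_blocktime(0); // workers sleep right after each parallel region
     #pragma omp parallel
       {
         // useful when OpenMP threads share cores with other application
         // threads and spinning after the region would waste CPU time
       }
       return 0;
     }
*/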
/* ------------------------------------------------------------------------ */
/* internal fast reduction routines */

PACKED_REDUCTION_METHOD_T
__kmp_determine_reduction_method(
    ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size,
    void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data),
    kmp_critical_name *lck) {

  // Default reduction method: critical construct ( lck != NULL, like in
  // current PAROPT )
  // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method
  // can be selected by RTL
  // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method
  // can be selected by RTL
  // Finally, it's up to the OpenMP RTL to make a decision on which method to
  // select among those generated by PAROPT.

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 ||                   \
    KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64

#if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||     \
    KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD ||
       // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}
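/* Usage sketch (illustrative only, not compiled into the runtime): soft and
   hard pause are reached from user code through the OpenMP 5.0 pause API,
   which lands in __kmp_pause_resource() below via __kmpc_pause_resource().

     #include <omp.h>

     void between_phases(void) {
       // Release OpenMP resources between compute phases; a later parallel
       // region re-initializes whatever is needed.
       if (omp_pause_resource_all(omp_pause_soft) != 0) {
         // non-zero return: the request was not honored (e.g. already paused)
       }
     }
*/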
// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go,
                         thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Globals and functions for hidden helper task
kmp_info_t **__kmp_hidden_helper_threads;
kmp_info_t *__kmp_hidden_helper_main_thread;
std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks;
#if KMP_OS_LINUX
kmp_int32 __kmp_hidden_helper_threads_num = 8;
kmp_int32 __kmp_enable_hidden_helper = TRUE;
#else
kmp_int32 __kmp_hidden_helper_threads_num = 0;
kmp_int32 __kmp_enable_hidden_helper = FALSE;
#endif

namespace {
std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num;

void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) {
  // This is an explicit synchronization on all hidden helper threads, in case
  // a regular thread pushes a hidden helper task to a hidden helper thread
  // that has not yet been awakened since the main thread released them after
  // creating the team.
  KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num);
  while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) !=
         __kmp_hidden_helper_threads_num)
    ;

  // If this is the master thread, then wait for the signal
  if (__kmpc_master(nullptr, *gtid)) {
    // First, unset the initial state and release the initial thread
    TCW_4(__kmp_init_hidden_helper_threads, FALSE);
    __kmp_hidden_helper_initz_release();
    __kmp_hidden_helper_main_thread_wait();
    // Now wake up all worker threads
    for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) {
      __kmp_hidden_helper_worker_thread_signal();
    }
  }
}
} // namespace

void __kmp_hidden_helper_threads_initz_routine() {
  // Create a new root for the hidden helper team/threads
  const int gtid = __kmp_register_root(TRUE);
  __kmp_hidden_helper_main_thread = __kmp_threads[gtid];
  __kmp_hidden_helper_threads = &__kmp_threads[gtid];
  __kmp_hidden_helper_main_thread->th.th_set_nproc =
      __kmp_hidden_helper_threads_num;

  KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0);

  __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn);

  // Set the initialization flag to FALSE
  TCW_SYNC_4(__kmp_init_hidden_helper, FALSE);

  __kmp_hidden_helper_threads_deinitz_release();
}

/* Nesting Mode:
   Set via KMP_NESTING_MODE, which takes an integer.
   Note: we skip duplicate topology levels, and skip levels with only
   one entity.
   KMP_NESTING_MODE=0 is the default, and doesn't use nesting mode.
   KMP_NESTING_MODE=1 sets as many nesting levels as there are distinct levels
   in the topology, and initializes the number of threads at each of those
   levels to the number of entities at each level, respectively, below the
   entity at the parent level.
   KMP_NESTING_MODE=N, where N>1, attempts to create up to N nesting levels,
   but starts with nesting OFF -- max-active-levels-var is 1 -- and requires
   the user to turn nesting on explicitly. This is an even more experimental
   option to this experimental feature, and may change or go away in the
   future.
*/
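/* Usage sketch (illustrative only, not compiled into the runtime): with
   KMP_NESTING_MODE=1 the runtime pre-configures one nesting level per kept
   topology level and enables nesting, so plain nested parallel regions follow
   the machine shape without explicit num_threads clauses.

     // Run as: KMP_NESTING_MODE=1 ./a.out
     #include <omp.h>
     #include <stdio.h>

     int main() {
     #pragma omp parallel // outer level, e.g. one thread per socket
     #pragma omp parallel // inner level, e.g. one thread per core of the socket
       if (omp_get_thread_num() == 0)
         printf("level %d runs %d threads\n", omp_get_level(),
                omp_get_num_threads());
       return 0;
     }
*/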
// Allocate space to store nesting levels
void __kmp_init_nesting_mode() {
  int levels = KMP_HW_LAST;
  __kmp_nesting_mode_nlevels = levels;
  __kmp_nesting_nth_level = (int *)KMP_INTERNAL_MALLOC(levels * sizeof(int));
  for (int i = 0; i < levels; ++i)
    __kmp_nesting_nth_level[i] = 0;
  if (__kmp_nested_nth.size < levels) {
    __kmp_nested_nth.nth =
        (int *)KMP_INTERNAL_REALLOC(__kmp_nested_nth.nth, levels * sizeof(int));
    __kmp_nested_nth.size = levels;
  }
}

// Set # threads for top levels of nesting; must be called after topology set
void __kmp_set_nesting_mode_threads() {
  kmp_info_t *thread = __kmp_threads[__kmp_entry_gtid()];

  if (__kmp_nesting_mode == 1)
    __kmp_nesting_mode_nlevels = KMP_MAX_ACTIVE_LEVELS_LIMIT;
  else if (__kmp_nesting_mode > 1)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;

  if (__kmp_topology) { // use topology info
    int loc, hw_level;
    for (loc = 0, hw_level = 0; hw_level < __kmp_topology->get_depth() &&
                                loc < __kmp_nesting_mode_nlevels;
         loc++, hw_level++) {
      __kmp_nesting_nth_level[loc] = __kmp_topology->get_ratio(hw_level);
      if (__kmp_nesting_nth_level[loc] == 1)
        loc--;
    }
    // Make sure all cores are used
    if (__kmp_nesting_mode > 1 && loc > 1) {
      int core_level = __kmp_topology->get_level(KMP_HW_CORE);
      int num_cores = __kmp_topology->get_count(core_level);
      int upper_levels = 1;
      for (int level = 0; level < loc - 1; ++level)
        upper_levels *= __kmp_nesting_nth_level[level];
      if (upper_levels * __kmp_nesting_nth_level[loc - 1] < num_cores)
        __kmp_nesting_nth_level[loc - 1] =
            num_cores / __kmp_nesting_nth_level[loc - 2];
    }
    __kmp_nesting_mode_nlevels = loc;
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  } else { // no topology info available; provide a reasonable estimate
    if (__kmp_avail_proc >= 4) {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc / 2;
      __kmp_nesting_nth_level[1] = 2;
      __kmp_nesting_mode_nlevels = 2;
    } else {
      __kmp_nesting_nth_level[0] = __kmp_avail_proc;
      __kmp_nesting_mode_nlevels = 1;
    }
    __kmp_nested_nth.used = __kmp_nesting_mode_nlevels;
  }
  for (int i = 0; i < __kmp_nesting_mode_nlevels; ++i) {
    __kmp_nested_nth.nth[i] = __kmp_nesting_nth_level[i];
  }
  set__nproc(thread, __kmp_nesting_nth_level[0]);
  if (__kmp_nesting_mode > 1 && __kmp_nesting_mode_nlevels > __kmp_nesting_mode)
    __kmp_nesting_mode_nlevels = __kmp_nesting_mode;
  if (get__max_active_levels(thread) > 1) {
    // if max levels was set, set nesting mode levels to the same value
    __kmp_nesting_mode_nlevels = get__max_active_levels(thread);
  }
  if (__kmp_nesting_mode == 1) // turn on nesting for this case only
    set__max_active_levels(thread, __kmp_nesting_mode_nlevels);
}
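/* Worked example (assumed topology, for illustration only): on a machine with
   2 sockets, 16 cores per socket, and 2 hardware threads per core, the ratio
   at each kept topology level is 2, 16 and 2, so with KMP_NESTING_MODE=1 the
   loop above produces

     __kmp_nesting_nth_level = {2, 16, 2}; // threads per nesting level
     __kmp_nesting_mode_nlevels = 3;

   i.e. the outermost parallel region defaults to 2 threads (one per socket),
   the next level to 16 (one per core), and the innermost to 2 (one per
   hardware thread). Topology levels whose ratio is 1 are skipped. */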