1 /* 2 * kmp_runtime.cpp -- KPTS runtime support library 3 */ 4 5 //===----------------------------------------------------------------------===// 6 // 7 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 8 // See https://llvm.org/LICENSE.txt for license information. 9 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 10 // 11 //===----------------------------------------------------------------------===// 12 13 #include "kmp.h" 14 #include "kmp_affinity.h" 15 #include "kmp_atomic.h" 16 #include "kmp_environment.h" 17 #include "kmp_error.h" 18 #include "kmp_i18n.h" 19 #include "kmp_io.h" 20 #include "kmp_itt.h" 21 #include "kmp_settings.h" 22 #include "kmp_stats.h" 23 #include "kmp_str.h" 24 #include "kmp_wait_release.h" 25 #include "kmp_wrapper_getpid.h" 26 #include "kmp_dispatch.h" 27 #if KMP_USE_HIER_SCHED 28 #include "kmp_dispatch_hier.h" 29 #endif 30 31 #if OMPT_SUPPORT 32 #include "ompt-specific.h" 33 #endif 34 35 /* these are temporary issues to be dealt with */ 36 #define KMP_USE_PRCTL 0 37 38 #if KMP_OS_WINDOWS 39 #include <process.h> 40 #endif 41 42 #include "tsan_annotations.h" 43 44 #if KMP_OS_WINDOWS 45 // windows does not need include files as it doesn't use shared memory 46 #else 47 #include <sys/mman.h> 48 #include <sys/stat.h> 49 #include <fcntl.h> 50 #define SHM_SIZE 1024 51 #endif 52 53 #if defined(KMP_GOMP_COMPAT) 54 char const __kmp_version_alt_comp[] = 55 KMP_VERSION_PREFIX "alternative compiler support: yes"; 56 #endif /* defined(KMP_GOMP_COMPAT) */ 57 58 char const __kmp_version_omp_api[] = 59 KMP_VERSION_PREFIX "API version: 5.0 (201611)"; 60 61 #ifdef KMP_DEBUG 62 char const __kmp_version_lock[] = 63 KMP_VERSION_PREFIX "lock type: run time selectable"; 64 #endif /* KMP_DEBUG */ 65 66 #define KMP_MIN(x, y) ((x) < (y) ? 
(x) : (y)) 67 68 /* ------------------------------------------------------------------------ */ 69 70 #if KMP_USE_MONITOR 71 kmp_info_t __kmp_monitor; 72 #endif 73 74 /* Forward declarations */ 75 76 void __kmp_cleanup(void); 77 78 static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid, 79 int gtid); 80 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 81 kmp_internal_control_t *new_icvs, 82 ident_t *loc); 83 #if KMP_AFFINITY_SUPPORTED 84 static void __kmp_partition_places(kmp_team_t *team, 85 int update_master_only = 0); 86 #endif 87 static void __kmp_do_serial_initialize(void); 88 void __kmp_fork_barrier(int gtid, int tid); 89 void __kmp_join_barrier(int gtid); 90 void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, 91 kmp_internal_control_t *new_icvs, ident_t *loc); 92 93 #ifdef USE_LOAD_BALANCE 94 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc); 95 #endif 96 97 static int __kmp_expand_threads(int nNeed); 98 #if KMP_OS_WINDOWS 99 static int __kmp_unregister_root_other_thread(int gtid); 100 #endif 101 static void __kmp_reap_thread(kmp_info_t *thread, int is_root); 102 kmp_info_t *__kmp_thread_pool_insert_pt = NULL; 103 104 /* Calculate the identifier of the current thread */ 105 /* fast (and somewhat portable) way to get unique identifier of executing 106 thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */ 107 int __kmp_get_global_thread_id() { 108 int i; 109 kmp_info_t **other_threads; 110 size_t stack_data; 111 char *stack_addr; 112 size_t stack_size; 113 char *stack_base; 114 115 KA_TRACE( 116 1000, 117 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 118 __kmp_nth, __kmp_all_nth)); 119 120 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 121 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 122 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 123 __kmp_init_gtid for this to work. 
*/ 124 125 if (!TCR_4(__kmp_init_gtid)) 126 return KMP_GTID_DNE; 127 128 #ifdef KMP_TDATA_GTID 129 if (TCR_4(__kmp_gtid_mode) >= 3) { 130 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 131 return __kmp_gtid; 132 } 133 #endif 134 if (TCR_4(__kmp_gtid_mode) >= 2) { 135 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 136 return __kmp_gtid_get_specific(); 137 } 138 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 139 140 stack_addr = (char *)&stack_data; 141 other_threads = __kmp_threads; 142 143 /* ATT: The code below is a source of potential bugs due to unsynchronized 144 access to __kmp_threads array. For example: 145 1. Current thread loads other_threads[i] to thr and checks it, it is 146 non-NULL. 147 2. Current thread is suspended by OS. 148 3. Another thread unregisters and finishes (debug versions of free() 149 may fill memory with something like 0xEF). 150 4. Current thread is resumed. 151 5. Current thread reads junk from *thr. 152 TODO: Fix it. --ln */ 153 154 for (i = 0; i < __kmp_threads_capacity; i++) { 155 156 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 157 if (!thr) 158 continue; 159 160 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 161 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 162 163 /* stack grows down -- search through all of the active threads */ 164 165 if (stack_addr <= stack_base) { 166 size_t stack_diff = stack_base - stack_addr; 167 168 if (stack_diff <= stack_size) { 169 /* The only way we can be closer than the allocated */ 170 /* stack size is if we are running on this thread. */ 171 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 172 return i; 173 } 174 } 175 } 176 177 /* get specific to try and determine our gtid */ 178 KA_TRACE(1000, 179 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 180 "thread, using TLS\n")); 181 i = __kmp_gtid_get_specific(); 182 183 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 184 185 /* if we havn't been assigned a gtid, then return code */ 186 if (i < 0) 187 return i; 188 189 /* dynamically updated stack window for uber threads to avoid get_specific 190 call */ 191 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 192 KMP_FATAL(StackOverflow, i); 193 } 194 195 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 196 if (stack_addr > stack_base) { 197 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 198 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 199 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 200 stack_base); 201 } else { 202 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 203 stack_base - stack_addr); 204 } 205 206 /* Reprint stack bounds for ubermaster since they have been refined */ 207 if (__kmp_storage_map) { 208 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 209 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 210 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 211 other_threads[i]->th.th_info.ds.ds_stacksize, 212 "th_%d stack (refinement)", i); 213 } 214 return i; 215 } 216 217 int __kmp_get_global_thread_id_reg() { 218 int gtid; 219 220 if (!__kmp_init_serial) { 221 gtid = KMP_GTID_DNE; 222 } else 223 #ifdef KMP_TDATA_GTID 224 if (TCR_4(__kmp_gtid_mode) >= 3) { 225 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 226 gtid = __kmp_gtid; 227 } else 228 #endif 229 if (TCR_4(__kmp_gtid_mode) >= 2) { 230 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 231 gtid = __kmp_gtid_get_specific(); 232 } else { 233 KA_TRACE(1000, 234 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 235 gtid = __kmp_get_global_thread_id(); 236 } 237 238 /* we must be a new uber master sibling thread */ 239 if (gtid == KMP_GTID_DNE) 
{ 240 KA_TRACE(10, 241 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 242 "Registering a new gtid.\n")); 243 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 244 if (!__kmp_init_serial) { 245 __kmp_do_serial_initialize(); 246 gtid = __kmp_gtid_get_specific(); 247 } else { 248 gtid = __kmp_register_root(FALSE); 249 } 250 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 251 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 252 } 253 254 KMP_DEBUG_ASSERT(gtid >= 0); 255 256 return gtid; 257 } 258 259 /* caller must hold forkjoin_lock */ 260 void __kmp_check_stack_overlap(kmp_info_t *th) { 261 int f; 262 char *stack_beg = NULL; 263 char *stack_end = NULL; 264 int gtid; 265 266 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 267 if (__kmp_storage_map) { 268 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 269 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 270 271 gtid = __kmp_gtid_from_thread(th); 272 273 if (gtid == KMP_GTID_MONITOR) { 274 __kmp_print_storage_map_gtid( 275 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 276 "th_%s stack (%s)", "mon", 277 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 278 } else { 279 __kmp_print_storage_map_gtid( 280 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 281 "th_%d stack (%s)", gtid, 282 (th->th.th_info.ds.ds_stackgrow) ? 
"initial" : "actual"); 283 } 284 } 285 286 /* No point in checking ubermaster threads since they use refinement and 287 * cannot overlap */ 288 gtid = __kmp_gtid_from_thread(th); 289 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 290 KA_TRACE(10, 291 ("__kmp_check_stack_overlap: performing extensive checking\n")); 292 if (stack_beg == NULL) { 293 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 294 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 295 } 296 297 for (f = 0; f < __kmp_threads_capacity; f++) { 298 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 299 300 if (f_th && f_th != th) { 301 char *other_stack_end = 302 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 303 char *other_stack_beg = 304 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 305 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 306 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 307 308 /* Print the other stack values before the abort */ 309 if (__kmp_storage_map) 310 __kmp_print_storage_map_gtid( 311 -1, other_stack_beg, other_stack_end, 312 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 313 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 314 315 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 316 __kmp_msg_null); 317 } 318 } 319 } 320 } 321 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 322 } 323 324 /* ------------------------------------------------------------------------ */ 325 326 void __kmp_infinite_loop(void) { 327 static int done = FALSE; 328 329 while (!done) { 330 KMP_YIELD(TRUE); 331 } 332 } 333 334 #define MAX_MESSAGE 512 335 336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 337 char const *format, ...) 
{ 338 char buffer[MAX_MESSAGE]; 339 va_list ap; 340 341 va_start(ap, format); 342 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 343 p2, (unsigned long)size, format); 344 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 345 __kmp_vprintf(kmp_err, buffer, ap); 346 #if KMP_PRINT_DATA_PLACEMENT 347 int node; 348 if (gtid >= 0) { 349 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 350 if (__kmp_storage_map_verbose) { 351 node = __kmp_get_host_node(p1); 352 if (node < 0) /* doesn't work, so don't try this next time */ 353 __kmp_storage_map_verbose = FALSE; 354 else { 355 char *last; 356 int lastNode; 357 int localProc = __kmp_get_cpu_from_gtid(gtid); 358 359 const int page_size = KMP_GET_PAGE_SIZE(); 360 361 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 362 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 363 if (localProc >= 0) 364 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 365 localProc >> 1); 366 else 367 __kmp_printf_no_lock(" GTID %d\n", gtid); 368 #if KMP_USE_PRCTL 369 /* The more elaborate format is disabled for now because of the prctl 370 * hanging bug. */ 371 do { 372 last = p1; 373 lastNode = node; 374 /* This loop collates adjacent pages with the same host node. */ 375 do { 376 (char *)p1 += page_size; 377 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 378 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 379 lastNode); 380 } while (p1 <= p2); 381 #else 382 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 383 (char *)p1 + (page_size - 1), 384 __kmp_get_host_node(p1)); 385 if (p1 < p2) { 386 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 387 (char *)p2 + (page_size - 1), 388 __kmp_get_host_node(p2)); 389 } 390 #endif 391 } 392 } 393 } else 394 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 395 } 396 #endif /* KMP_PRINT_DATA_PLACEMENT */ 397 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 398 } 399 400 void __kmp_warn(char const *format, ...) 
{ 401 char buffer[MAX_MESSAGE]; 402 va_list ap; 403 404 if (__kmp_generate_warnings == kmp_warnings_off) { 405 return; 406 } 407 408 va_start(ap, format); 409 410 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 411 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 412 __kmp_vprintf(kmp_err, buffer, ap); 413 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 414 415 va_end(ap); 416 } 417 418 void __kmp_abort_process() { 419 // Later threads may stall here, but that's ok because abort() will kill them. 420 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 421 422 if (__kmp_debug_buf) { 423 __kmp_dump_debug_buffer(); 424 } 425 426 if (KMP_OS_WINDOWS) { 427 // Let other threads know of abnormal termination and prevent deadlock 428 // if abort happened during library initialization or shutdown 429 __kmp_global.g.g_abort = SIGABRT; 430 431 /* On Windows* OS by default abort() causes pop-up error box, which stalls 432 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 433 boxes. _set_abort_behavior() works well, but this function is not 434 available in VS7 (this is not problem for DLL, but it is a problem for 435 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 436 help, at least in some versions of MS C RTL. 437 438 It seems following sequence is the only way to simulate abort() and 439 avoid pop-up error box. */ 440 raise(SIGABRT); 441 _exit(3); // Just in case, if signal ignored, exit anyway. 442 } else { 443 __kmp_unregister_library(); 444 abort(); 445 } 446 447 __kmp_infinite_loop(); 448 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 449 450 } // __kmp_abort_process 451 452 void __kmp_abort_thread(void) { 453 // TODO: Eliminate g_abort global variable and this function. 454 // In case of abort just call abort(), it will kill all the threads. 
455 __kmp_infinite_loop(); 456 } // __kmp_abort_thread 457 458 /* Print out the storage map for the major kmp_info_t thread data structures 459 that are allocated together. */ 460 461 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 462 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 463 gtid); 464 465 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 466 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 467 468 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 469 sizeof(kmp_local_t), "th_%d.th_local", gtid); 470 471 __kmp_print_storage_map_gtid( 472 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 473 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 474 475 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 476 &thr->th.th_bar[bs_plain_barrier + 1], 477 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 478 gtid); 479 480 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 481 &thr->th.th_bar[bs_forkjoin_barrier + 1], 482 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 483 gtid); 484 485 #if KMP_FAST_REDUCTION_BARRIER 486 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 487 &thr->th.th_bar[bs_reduction_barrier + 1], 488 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 489 gtid); 490 #endif // KMP_FAST_REDUCTION_BARRIER 491 } 492 493 /* Print out the storage map for the major kmp_team_t team data structures 494 that are allocated together. */ 495 496 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 497 int team_id, int num_thr) { 498 int num_disp_buff = team->t.t_max_nproc > 1 ? 
__kmp_dispatch_num_buffers : 2; 499 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 500 header, team_id); 501 502 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 503 &team->t.t_bar[bs_last_barrier], 504 sizeof(kmp_balign_team_t) * bs_last_barrier, 505 "%s_%d.t_bar", header, team_id); 506 507 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 508 &team->t.t_bar[bs_plain_barrier + 1], 509 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 510 header, team_id); 511 512 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 513 &team->t.t_bar[bs_forkjoin_barrier + 1], 514 sizeof(kmp_balign_team_t), 515 "%s_%d.t_bar[forkjoin]", header, team_id); 516 517 #if KMP_FAST_REDUCTION_BARRIER 518 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 519 &team->t.t_bar[bs_reduction_barrier + 1], 520 sizeof(kmp_balign_team_t), 521 "%s_%d.t_bar[reduction]", header, team_id); 522 #endif // KMP_FAST_REDUCTION_BARRIER 523 524 __kmp_print_storage_map_gtid( 525 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 526 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 527 528 __kmp_print_storage_map_gtid( 529 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 530 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 531 532 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 533 &team->t.t_disp_buffer[num_disp_buff], 534 sizeof(dispatch_shared_info_t) * num_disp_buff, 535 "%s_%d.t_disp_buffer", header, team_id); 536 } 537 538 static void __kmp_init_allocator() { __kmp_init_memkind(); } 539 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 540 541 /* ------------------------------------------------------------------------ */ 542 543 #if KMP_DYNAMIC_LIB 544 #if KMP_OS_WINDOWS 545 546 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) { 547 // TODO: Change to __kmp_break_bootstrap_lock(). 
548 __kmp_init_bootstrap_lock(lck); // make the lock released 549 } 550 551 static void __kmp_reset_locks_on_process_detach(int gtid_req) { 552 int i; 553 int thread_count; 554 555 // PROCESS_DETACH is expected to be called by a thread that executes 556 // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one 557 // calling ProcessExit or FreeLibrary). So, it might be safe to access the 558 // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some 559 // threads can be still alive here, although being about to be terminated. The 560 // threads in the array with ds_thread==0 are most suspicious. Actually, it 561 // can be not safe to access the __kmp_threads[]. 562 563 // TODO: does it make sense to check __kmp_roots[] ? 564 565 // Let's check that there are no other alive threads registered with the OMP 566 // lib. 567 while (1) { 568 thread_count = 0; 569 for (i = 0; i < __kmp_threads_capacity; ++i) { 570 if (!__kmp_threads) 571 continue; 572 kmp_info_t *th = __kmp_threads[i]; 573 if (th == NULL) 574 continue; 575 int gtid = th->th.th_info.ds.ds_gtid; 576 if (gtid == gtid_req) 577 continue; 578 if (gtid < 0) 579 continue; 580 DWORD exit_val; 581 int alive = __kmp_is_thread_alive(th, &exit_val); 582 if (alive) { 583 ++thread_count; 584 } 585 } 586 if (thread_count == 0) 587 break; // success 588 } 589 590 // Assume that I'm alone. Now it might be safe to check and reset locks. 591 // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset. 
592 __kmp_reset_lock(&__kmp_forkjoin_lock); 593 #ifdef KMP_DEBUG 594 __kmp_reset_lock(&__kmp_stdio_lock); 595 #endif // KMP_DEBUG 596 } 597 598 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 599 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 600 601 switch (fdwReason) { 602 603 case DLL_PROCESS_ATTACH: 604 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 605 606 return TRUE; 607 608 case DLL_PROCESS_DETACH: 609 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 610 611 if (lpReserved != NULL) { 612 // lpReserved is used for telling the difference: 613 // lpReserved == NULL when FreeLibrary() was called, 614 // lpReserved != NULL when the process terminates. 615 // When FreeLibrary() is called, worker threads remain alive. So they will 616 // release the forkjoin lock by themselves. When the process terminates, 617 // worker threads disappear triggering the problem of unreleased forkjoin 618 // lock as described below. 619 620 // A worker thread can take the forkjoin lock. The problem comes up if 621 // that worker thread becomes dead before it releases the forkjoin lock. 622 // The forkjoin lock remains taken, while the thread executing 623 // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try 624 // to take the forkjoin lock and will always fail, so that the application 625 // will never finish [normally]. This scenario is possible if 626 // __kmpc_end() has not been executed. It looks like it's not a corner 627 // case, but common cases: 628 // - the main function was compiled by an alternative compiler; 629 // - the main function was compiled by icl but without /Qopenmp 630 // (application with plugins); 631 // - application terminates by calling C exit(), Fortran CALL EXIT() or 632 // Fortran STOP. 633 // - alive foreign thread prevented __kmpc_end from doing cleanup. 634 // 635 // This is a hack to work around the problem. 636 // TODO: !!! figure out something better. 
637 __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific()); 638 } 639 640 __kmp_internal_end_library(__kmp_gtid_get_specific()); 641 642 return TRUE; 643 644 case DLL_THREAD_ATTACH: 645 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 646 647 /* if we want to register new siblings all the time here call 648 * __kmp_get_gtid(); */ 649 return TRUE; 650 651 case DLL_THREAD_DETACH: 652 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 653 654 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 655 return TRUE; 656 } 657 658 return TRUE; 659 } 660 661 #endif /* KMP_OS_WINDOWS */ 662 #endif /* KMP_DYNAMIC_LIB */ 663 664 /* __kmp_parallel_deo -- Wait until it's our turn. */ 665 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 666 int gtid = *gtid_ref; 667 #ifdef BUILD_PARALLEL_ORDERED 668 kmp_team_t *team = __kmp_team_from_gtid(gtid); 669 #endif /* BUILD_PARALLEL_ORDERED */ 670 671 if (__kmp_env_consistency_check) { 672 if (__kmp_threads[gtid]->th.th_root->r.r_active) 673 #if KMP_USE_DYNAMIC_LOCK 674 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 675 #else 676 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 677 #endif 678 } 679 #ifdef BUILD_PARALLEL_ORDERED 680 if (!team->t.t_serialized) { 681 KMP_MB(); 682 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 683 NULL); 684 KMP_MB(); 685 } 686 #endif /* BUILD_PARALLEL_ORDERED */ 687 } 688 689 /* __kmp_parallel_dxo -- Signal the next task. 
*/
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  /* Consistency checking: pop the "ordered in parallel" record, but only
     while the root is active. */
  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates.  */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates.  */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

/* Decide whether the calling thread executes a SINGLE block.
   gtid     global id of the calling thread (index into __kmp_threads).
   id_ref   source location; stored into the thread's th_ident.
   push_ws  when consistency checking is on and the thread won the block,
            non-zero pushes a ct_psingle workshare record; in every other
            case the workshare is only checked.
   Returns 1 if the team is serialized; otherwise the result of the
   compare-and-store on the team's t_construct counter when that counter
   still equals this thread's previous construct count, else 0. */
int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();
  __kmp_resume_if_soft_paused();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    /* Plain read first: the atomic compare-and-store is attempted only when
       the team counter still equals this thread's previous value. */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL &&
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

/* Leave a SINGLE block: emits the ITT "single end" notification (in
   USE_ITT_BUILD builds) and, with consistency checking enabled, pops the
   ct_psingle workshare record. */
void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nthreads is the number of threads requested for the team
 * returns 1 as soon as any limit below collapses the reservation to a single
 * thread, otherwise the (possibly reduced) number of threads to use
 * enter_teams is not referenced in this function body
 * The forkjoin lock is held by the caller.
 *
 * The request is narrowed in this order: dyn-var adjustment (load balance,
 * thread limit or random, per g_dynamic_mode), __kmp_max_nth, the contention
 * group's cg_thread_limit, and finally the capacity of __kmp_threads (which
 * is expanded on demand). Every comparison subtracts
 * (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) from the prospective
 * thread total. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads,
                                 int enter_teams) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);
  kmp_info_t *this_thr = parent_team->t.t_threads[master_tid];

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ; // dyn-var is false: keep the requested count as is
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      // Never hand out more threads than were requested.
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      // Map the random value into the range [1, set_nthreads].
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads;
  int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit;
  if (cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      max_cg_threads) {
    int tl_nthreads = max_cg_threads - cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    // NOTE(review): this reuses the Unset_ALL_THREADS hint from the
    // KMP_ALL_THREADS check above -- confirm that is the intended hint here.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team.
We are assured that there are enough threads available, because we checked on
   that earlier within critical section forkjoin.

   root        - root passed to __kmp_allocate_thread(); in builds without
                 KMP_NESTED_HOT_TEAMS its r_hot_team is compared against team
   team        - team to populate; team->t.t_nproc is the number of threads
   master_th   - thread that becomes tid 0 (the master) of team
   master_gtid - global thread id of master_th; asserted (debug builds) to be
                 the id of the calling thread */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team; // non-zero: team is a hot team that is ready to use

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

  /* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  // Stays 0 when the hot teams array was never allocated.
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      // Levels at or beyond __kmp_hot_teams_max_level have no hot team slot.
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  // Only populate t_threads[] when the team is not a ready-to-use hot team.
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
      // The worker inherits the master's teams-construct bookkeeping.
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          // Start each per-thread barrier at the team's current arrived value.
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  // Flag the team for affinity display if any member last ran with a different
  // team size or nesting level than this team has; stop at the first mismatch.
  if (__kmp_display_affinity && team->t.t_display_affinity != 1) {
    for (i = 0; i < team->t.t_nproc; i++) {
      kmp_info_t *thr = team->t.t_threads[i];
      if (thr->th.th_prev_num_threads != team->t.t_nproc ||
          thr->th.th_prev_level != team->t.t_level) {
        team->t.t_display_affinity = 1;
        break;
      }
    }
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
1093 // So, this code achieves what we need whether or not t_fp_control_saved is 1094 // true. By checking whether the value needs updating we avoid unnecessary 1095 // writes that would put the cache-line into a written state, causing all 1096 // threads in the team to have to read it again. 1097 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1098 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1099 // Although we don't use this value, other code in the runtime wants to know 1100 // whether it should restore them. So we must ensure it is correct. 1101 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1102 } else { 1103 // Similarly here. Don't write to this cache-line in the team structure 1104 // unless we have to. 1105 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1106 } 1107 } 1108 1109 // Do the opposite, setting the hardware registers to the updated values from 1110 // the team. 1111 inline static void updateHWFPControl(kmp_team_t *team) { 1112 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1113 // Only reset the fp control regs if they have been changed in the team. 1114 // the parallel region that we are exiting. 
1115 kmp_int16 x87_fpu_control_word; 1116 kmp_uint32 mxcsr; 1117 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1118 __kmp_store_mxcsr(&mxcsr); 1119 mxcsr &= KMP_X86_MXCSR_MASK; 1120 1121 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1122 __kmp_clear_x87_fpu_status_word(); 1123 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1124 } 1125 1126 if (team->t.t_mxcsr != mxcsr) { 1127 __kmp_load_mxcsr(&team->t.t_mxcsr); 1128 } 1129 } 1130 } 1131 #else 1132 #define propagateFPControl(x) ((void)0) 1133 #define updateHWFPControl(x) ((void)0) 1134 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1135 1136 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1137 int realloc); // forward declaration 1138 1139 /* Run a parallel region that has been serialized, so runs only in a team of the 1140 single master thread. */ 1141 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1142 kmp_info_t *this_thr; 1143 kmp_team_t *serial_team; 1144 1145 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1146 1147 /* Skip all this code for autopar serialized loops since it results in 1148 unacceptable overhead */ 1149 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1150 return; 1151 1152 if (!TCR_4(__kmp_init_parallel)) 1153 __kmp_parallel_initialize(); 1154 __kmp_resume_if_soft_paused(); 1155 1156 this_thr = __kmp_threads[global_tid]; 1157 serial_team = this_thr->th.th_serial_team; 1158 1159 /* utilize the serialized team held by this thread */ 1160 KMP_DEBUG_ASSERT(serial_team); 1161 KMP_MB(); 1162 1163 if (__kmp_tasking_mode != tskm_immediate_exec) { 1164 KMP_DEBUG_ASSERT( 1165 this_thr->th.th_task_team == 1166 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1167 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1168 NULL); 1169 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1170 "team %p, new task_team = NULL\n", 1171 
global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1172 this_thr->th.th_task_team = NULL; 1173 } 1174 1175 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1176 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1177 proc_bind = proc_bind_false; 1178 } else if (proc_bind == proc_bind_default) { 1179 // No proc_bind clause was specified, so use the current value 1180 // of proc-bind-var for this parallel region. 1181 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1182 } 1183 // Reset for next parallel region 1184 this_thr->th.th_set_proc_bind = proc_bind_default; 1185 1186 #if OMPT_SUPPORT 1187 ompt_data_t ompt_parallel_data = ompt_data_none; 1188 ompt_data_t *implicit_task_data; 1189 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1190 if (ompt_enabled.enabled && 1191 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1192 1193 ompt_task_info_t *parent_task_info; 1194 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1195 1196 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1197 if (ompt_enabled.ompt_callback_parallel_begin) { 1198 int team_size = 1; 1199 1200 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1201 &(parent_task_info->task_data), &(parent_task_info->frame), 1202 &ompt_parallel_data, team_size, 1203 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1204 } 1205 } 1206 #endif // OMPT_SUPPORT 1207 1208 if (this_thr->th.th_team != serial_team) { 1209 // Nested level will be an index in the nested nthreads array 1210 int level = this_thr->th.th_team->t.t_level; 1211 1212 if (serial_team->t.t_serialized) { 1213 /* this serial team was already used 1214 TODO increase performance by making this locks more specific */ 1215 kmp_team_t *new_team; 1216 1217 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1218 1219 new_team = 1220 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1221 #if OMPT_SUPPORT 1222 ompt_parallel_data, 1223 #endif 1224 
proc_bind, &this_thr->th.th_current_task->td_icvs, 1225 0 USE_NESTED_HOT_ARG(NULL)); 1226 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1227 KMP_ASSERT(new_team); 1228 1229 /* setup new serialized team and install it */ 1230 new_team->t.t_threads[0] = this_thr; 1231 new_team->t.t_parent = this_thr->th.th_team; 1232 serial_team = new_team; 1233 this_thr->th.th_serial_team = serial_team; 1234 1235 KF_TRACE( 1236 10, 1237 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1238 global_tid, serial_team)); 1239 1240 /* TODO the above breaks the requirement that if we run out of resources, 1241 then we can still guarantee that serialized teams are ok, since we may 1242 need to allocate a new one */ 1243 } else { 1244 KF_TRACE( 1245 10, 1246 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1247 global_tid, serial_team)); 1248 } 1249 1250 /* we have to initialize this serial team */ 1251 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1252 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1253 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1254 serial_team->t.t_ident = loc; 1255 serial_team->t.t_serialized = 1; 1256 serial_team->t.t_nproc = 1; 1257 serial_team->t.t_parent = this_thr->th.th_team; 1258 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1259 this_thr->th.th_team = serial_team; 1260 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1261 1262 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1263 this_thr->th.th_current_task)); 1264 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1265 this_thr->th.th_current_task->td_flags.executing = 0; 1266 1267 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1268 1269 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1270 implicit task for each serialized task represented by 1271 team->t.t_serialized? 
*/ 1272 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1273 &this_thr->th.th_current_task->td_parent->td_icvs); 1274 1275 // Thread value exists in the nested nthreads array for the next nested 1276 // level 1277 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1278 this_thr->th.th_current_task->td_icvs.nproc = 1279 __kmp_nested_nth.nth[level + 1]; 1280 } 1281 1282 if (__kmp_nested_proc_bind.used && 1283 (level + 1 < __kmp_nested_proc_bind.used)) { 1284 this_thr->th.th_current_task->td_icvs.proc_bind = 1285 __kmp_nested_proc_bind.bind_types[level + 1]; 1286 } 1287 1288 #if USE_DEBUGGER 1289 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 1290 #endif 1291 this_thr->th.th_info.ds.ds_tid = 0; 1292 1293 /* set thread cache values */ 1294 this_thr->th.th_team_nproc = 1; 1295 this_thr->th.th_team_master = this_thr; 1296 this_thr->th.th_team_serialized = 1; 1297 1298 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1299 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1300 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1301 1302 propagateFPControl(serial_team); 1303 1304 /* check if we need to allocate dispatch buffers stack */ 1305 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1306 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1307 serial_team->t.t_dispatch->th_disp_buffer = 1308 (dispatch_private_info_t *)__kmp_allocate( 1309 sizeof(dispatch_private_info_t)); 1310 } 1311 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1312 1313 KMP_MB(); 1314 1315 } else { 1316 /* this serialized team is already being used, 1317 * that's fine, just add another nested level */ 1318 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1319 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1320 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1321 ++serial_team->t.t_serialized; 1322 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1323 1324 // Nested level 
will be an index in the nested nthreads array 1325 int level = this_thr->th.th_team->t.t_level; 1326 // Thread value exists in the nested nthreads array for the next nested 1327 // level 1328 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1329 this_thr->th.th_current_task->td_icvs.nproc = 1330 __kmp_nested_nth.nth[level + 1]; 1331 } 1332 serial_team->t.t_level++; 1333 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1334 "of serial team %p to %d\n", 1335 global_tid, serial_team, serial_team->t.t_level)); 1336 1337 /* allocate/push dispatch buffers stack */ 1338 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1339 { 1340 dispatch_private_info_t *disp_buffer = 1341 (dispatch_private_info_t *)__kmp_allocate( 1342 sizeof(dispatch_private_info_t)); 1343 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1344 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1345 } 1346 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1347 1348 KMP_MB(); 1349 } 1350 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1351 1352 // Perform the display affinity functionality for 1353 // serialized parallel regions 1354 if (__kmp_display_affinity) { 1355 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1356 this_thr->th.th_prev_num_threads != 1) { 1357 // NULL means use the affinity-format-var ICV 1358 __kmp_aux_display_affinity(global_tid, NULL); 1359 this_thr->th.th_prev_level = serial_team->t.t_level; 1360 this_thr->th.th_prev_num_threads = 1; 1361 } 1362 } 1363 1364 if (__kmp_env_consistency_check) 1365 __kmp_push_parallel(global_tid, NULL); 1366 #if OMPT_SUPPORT 1367 serial_team->t.ompt_team_info.master_return_address = codeptr; 1368 if (ompt_enabled.enabled && 1369 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1370 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1371 1372 ompt_lw_taskteam_t lw_taskteam; 1373 __ompt_lw_taskteam_init(&lw_taskteam, 
this_thr, global_tid, 1374 &ompt_parallel_data, codeptr); 1375 1376 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1377 // don't use lw_taskteam after linking. content was swaped 1378 1379 /* OMPT implicit task begin */ 1380 implicit_task_data = OMPT_CUR_TASK_DATA(this_thr); 1381 if (ompt_enabled.ompt_callback_implicit_task) { 1382 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1383 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1384 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1385 OMPT_CUR_TASK_INFO(this_thr) 1386 ->thread_num = __kmp_tid_from_gtid(global_tid); 1387 } 1388 1389 /* OMPT state */ 1390 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1391 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1392 } 1393 #endif 1394 } 1395 1396 /* most of the work for a fork */ 1397 /* return true if we really went parallel, false if serialized */ 1398 int __kmp_fork_call(ident_t *loc, int gtid, 1399 enum fork_context_e call_context, // Intel, GNU, ... 1400 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1401 kmp_va_list ap) { 1402 void **argv; 1403 int i; 1404 int master_tid; 1405 int master_this_cons; 1406 kmp_team_t *team; 1407 kmp_team_t *parent_team; 1408 kmp_info_t *master_th; 1409 kmp_root_t *root; 1410 int nthreads; 1411 int master_active; 1412 int master_set_numthreads; 1413 int level; 1414 int active_level; 1415 int teams_level; 1416 #if KMP_NESTED_HOT_TEAMS 1417 kmp_hot_team_ptr_t **p_hot_teams; 1418 #endif 1419 { // KMP_TIME_BLOCK 1420 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1421 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1422 1423 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1424 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1425 /* Some systems prefer the stack for the root thread(s) to start with */ 1426 /* some gap from the parent stack to prevent false sharing. 
*/ 1427 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1428 /* These 2 lines below are so this does not get optimized out */ 1429 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1430 __kmp_stkpadding += (short)((kmp_int64)dummy); 1431 } 1432 1433 /* initialize if needed */ 1434 KMP_DEBUG_ASSERT( 1435 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1436 if (!TCR_4(__kmp_init_parallel)) 1437 __kmp_parallel_initialize(); 1438 __kmp_resume_if_soft_paused(); 1439 1440 /* setup current data */ 1441 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1442 // shutdown 1443 parent_team = master_th->th.th_team; 1444 master_tid = master_th->th.th_info.ds.ds_tid; 1445 master_this_cons = master_th->th.th_local.this_construct; 1446 root = master_th->th.th_root; 1447 master_active = root->r.r_active; 1448 master_set_numthreads = master_th->th.th_set_nproc; 1449 1450 #if OMPT_SUPPORT 1451 ompt_data_t ompt_parallel_data = ompt_data_none; 1452 ompt_data_t *parent_task_data; 1453 ompt_frame_t *ompt_frame; 1454 ompt_data_t *implicit_task_data; 1455 void *return_address = NULL; 1456 1457 if (ompt_enabled.enabled) { 1458 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1459 NULL, NULL); 1460 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1461 } 1462 #endif 1463 1464 // Nested level will be an index in the nested nthreads array 1465 level = parent_team->t.t_level; 1466 // used to launch non-serial teams even if nested is not allowed 1467 active_level = parent_team->t.t_active_level; 1468 // needed to check nesting inside the teams 1469 teams_level = master_th->th.th_teams_level; 1470 #if KMP_NESTED_HOT_TEAMS 1471 p_hot_teams = &master_th->th.th_hot_teams; 1472 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1473 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1474 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1475 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1476 // it is either actual or 
not needed (when active_level > 0) 1477 (*p_hot_teams)[0].hot_team_nth = 1; 1478 } 1479 #endif 1480 1481 #if OMPT_SUPPORT 1482 if (ompt_enabled.enabled) { 1483 if (ompt_enabled.ompt_callback_parallel_begin) { 1484 int team_size = master_set_numthreads 1485 ? master_set_numthreads 1486 : get__nproc_2(parent_team, master_tid); 1487 int flags = OMPT_INVOKER(call_context) | 1488 ((microtask == (microtask_t)__kmp_teams_master) 1489 ? ompt_parallel_league 1490 : ompt_parallel_team); 1491 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1492 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1493 return_address); 1494 } 1495 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1496 } 1497 #endif 1498 1499 master_th->th.th_ident = loc; 1500 1501 if (master_th->th.th_teams_microtask && ap && 1502 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1503 // AC: This is start of parallel that is nested inside teams construct. 1504 // The team is actual (hot), all workers are ready at the fork barrier. 1505 // No lock needed to initialize the team a bit, then free workers. 
1506 parent_team->t.t_ident = loc; 1507 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1508 parent_team->t.t_argc = argc; 1509 argv = (void **)parent_team->t.t_argv; 1510 for (i = argc - 1; i >= 0; --i) 1511 *argv++ = va_arg(kmp_va_deref(ap), void *); 1512 // Increment our nested depth levels, but not increase the serialization 1513 if (parent_team == master_th->th.th_serial_team) { 1514 // AC: we are in serialized parallel 1515 __kmpc_serialized_parallel(loc, gtid); 1516 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1517 1518 if (call_context == fork_context_gnu) { 1519 // AC: need to decrement t_serialized for enquiry functions to work 1520 // correctly, will restore at join time 1521 parent_team->t.t_serialized--; 1522 return TRUE; 1523 } 1524 1525 #if OMPT_SUPPORT 1526 void *dummy; 1527 void **exit_frame_p; 1528 1529 ompt_lw_taskteam_t lw_taskteam; 1530 1531 if (ompt_enabled.enabled) { 1532 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1533 &ompt_parallel_data, return_address); 1534 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1535 1536 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1537 // don't use lw_taskteam after linking. 
content was swaped 1538 1539 /* OMPT implicit task begin */ 1540 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1541 if (ompt_enabled.ompt_callback_implicit_task) { 1542 OMPT_CUR_TASK_INFO(master_th) 1543 ->thread_num = __kmp_tid_from_gtid(gtid); 1544 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1545 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1546 implicit_task_data, 1, 1547 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1548 } 1549 1550 /* OMPT state */ 1551 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1552 } else { 1553 exit_frame_p = &dummy; 1554 } 1555 #endif 1556 // AC: need to decrement t_serialized for enquiry functions to work 1557 // correctly, will restore at join time 1558 parent_team->t.t_serialized--; 1559 1560 { 1561 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1562 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1563 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1564 #if OMPT_SUPPORT 1565 , 1566 exit_frame_p 1567 #endif 1568 ); 1569 } 1570 1571 #if OMPT_SUPPORT 1572 if (ompt_enabled.enabled) { 1573 *exit_frame_p = NULL; 1574 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1575 if (ompt_enabled.ompt_callback_implicit_task) { 1576 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1577 ompt_scope_end, NULL, implicit_task_data, 1, 1578 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1579 } 1580 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1581 __ompt_lw_taskteam_unlink(master_th); 1582 if (ompt_enabled.ompt_callback_parallel_end) { 1583 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1584 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1585 OMPT_INVOKER(call_context) | ompt_parallel_team, 1586 return_address); 1587 } 1588 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1589 } 1590 #endif 1591 return TRUE; 1592 } 1593 1594 parent_team->t.t_pkfn = microtask; 1595 parent_team->t.t_invoke = invoker; 1596 
KMP_ATOMIC_INC(&root->r.r_in_parallel); 1597 parent_team->t.t_active_level++; 1598 parent_team->t.t_level++; 1599 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1600 1601 #if OMPT_SUPPORT 1602 if (ompt_enabled.enabled) { 1603 ompt_lw_taskteam_t lw_taskteam; 1604 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1605 &ompt_parallel_data, return_address); 1606 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1607 } 1608 #endif 1609 1610 /* Change number of threads in the team if requested */ 1611 if (master_set_numthreads) { // The parallel has num_threads clause 1612 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1613 // AC: only can reduce number of threads dynamically, can't increase 1614 kmp_info_t **other_threads = parent_team->t.t_threads; 1615 parent_team->t.t_nproc = master_set_numthreads; 1616 for (i = 0; i < master_set_numthreads; ++i) { 1617 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1618 } 1619 // Keep extra threads hot in the team for possible next parallels 1620 } 1621 master_th->th.th_set_nproc = 0; 1622 } 1623 1624 #if USE_DEBUGGER 1625 if (__kmp_debugging) { // Let debugger override number of threads. 
1626 int nth = __kmp_omp_num_threads(loc); 1627 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1628 master_set_numthreads = nth; 1629 } 1630 } 1631 #endif 1632 1633 #if USE_ITT_BUILD && USE_ITT_NOTIFY 1634 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1635 KMP_ITT_DEBUG) && 1636 __kmp_forkjoin_frames_mode == 3 && 1637 parent_team->t.t_active_level == 1 // only report frames at level 1 1638 && master_th->th.th_teams_size.nteams == 1) { 1639 kmp_uint64 tmp_time = __itt_get_timestamp(); 1640 master_th->th.th_frame_time = tmp_time; 1641 parent_team->t.t_region_time = tmp_time; 1642 } 1643 if (__itt_stack_caller_create_ptr) { 1644 // create new stack stitching id before entering fork barrier 1645 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1646 } 1647 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY */ 1648 1649 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1650 "master_th=%p, gtid=%d\n", 1651 root, parent_team, master_th, gtid)); 1652 __kmp_internal_fork(loc, gtid, parent_team); 1653 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1654 "master_th=%p, gtid=%d\n", 1655 root, parent_team, master_th, gtid)); 1656 1657 if (call_context == fork_context_gnu) 1658 return TRUE; 1659 1660 /* Invoke microtask for MASTER thread */ 1661 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1662 parent_team->t.t_id, parent_team->t.t_pkfn)); 1663 1664 if (!parent_team->t.t_invoke(gtid)) { 1665 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 1666 } 1667 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1668 parent_team->t.t_id, parent_team->t.t_pkfn)); 1669 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 1670 1671 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1672 1673 return TRUE; 1674 } // Parallel closely nested in teams construct 1675 1676 #if KMP_DEBUG 1677 if (__kmp_tasking_mode != tskm_immediate_exec) { 1678 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1679 parent_team->t.t_task_team[master_th->th.th_task_state]); 1680 } 1681 #endif 1682 1683 if (parent_team->t.t_active_level >= 1684 master_th->th.th_current_task->td_icvs.max_active_levels) { 1685 nthreads = 1; 1686 } else { 1687 int enter_teams = ((ap == NULL && active_level == 0) || 1688 (ap && teams_level > 0 && teams_level == level)); 1689 nthreads = 1690 master_set_numthreads 1691 ? master_set_numthreads 1692 : get__nproc_2( 1693 parent_team, 1694 master_tid); // TODO: get nproc directly from current task 1695 1696 // Check if we need to take forkjoin lock? (no need for serialized 1697 // parallel out of teams construct). This code moved here from 1698 // __kmp_reserve_threads() to speedup nested serialized parallels. 1699 if (nthreads > 1) { 1700 if ((get__max_active_levels(master_th) == 1 && 1701 (root->r.r_in_parallel && !enter_teams)) || 1702 (__kmp_library == library_serial)) { 1703 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1704 " threads\n", 1705 gtid, nthreads)); 1706 nthreads = 1; 1707 } 1708 } 1709 if (nthreads > 1) { 1710 /* determine how many new threads we can use */ 1711 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1712 /* AC: If we execute teams from parallel region (on host), then teams 1713 should be created but each can only have 1 thread if nesting is 1714 disabled. If teams called from serial region, then teams and their 1715 threads should be created regardless of the nesting setting. 
*/ 1716 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1717 nthreads, enter_teams); 1718 if (nthreads == 1) { 1719 // Free lock for single thread execution here; for multi-thread 1720 // execution it will be freed later after team of threads created 1721 // and initialized 1722 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1723 } 1724 } 1725 } 1726 KMP_DEBUG_ASSERT(nthreads > 0); 1727 1728 // If we temporarily changed the set number of threads then restore it now 1729 master_th->th.th_set_nproc = 0; 1730 1731 /* create a serialized parallel region? */ 1732 if (nthreads == 1) { 1733 /* josh todo: hypothetical question: what do we do for OS X*? */ 1734 #if KMP_OS_LINUX && \ 1735 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1736 void *args[argc]; 1737 #else 1738 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1739 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1740 KMP_ARCH_AARCH64) */ 1741 1742 KA_TRACE(20, 1743 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1744 1745 __kmpc_serialized_parallel(loc, gtid); 1746 1747 if (call_context == fork_context_intel) { 1748 /* TODO this sucks, use the compiler itself to pass args! :) */ 1749 master_th->th.th_serial_team->t.t_ident = loc; 1750 if (!ap) { 1751 // revert change made in __kmpc_serialized_parallel() 1752 master_th->th.th_serial_team->t.t_level--; 1753 // Get args from parent team for teams construct 1754 1755 #if OMPT_SUPPORT 1756 void *dummy; 1757 void **exit_frame_p; 1758 ompt_task_info_t *task_info; 1759 1760 ompt_lw_taskteam_t lw_taskteam; 1761 1762 if (ompt_enabled.enabled) { 1763 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1764 &ompt_parallel_data, return_address); 1765 1766 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1767 // don't use lw_taskteam after linking. 
content was swaped 1768 1769 task_info = OMPT_CUR_TASK_INFO(master_th); 1770 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1771 if (ompt_enabled.ompt_callback_implicit_task) { 1772 OMPT_CUR_TASK_INFO(master_th) 1773 ->thread_num = __kmp_tid_from_gtid(gtid); 1774 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1775 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1776 &(task_info->task_data), 1, 1777 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1778 ompt_task_implicit); 1779 } 1780 1781 /* OMPT state */ 1782 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1783 } else { 1784 exit_frame_p = &dummy; 1785 } 1786 #endif 1787 1788 { 1789 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1790 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1791 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1792 parent_team->t.t_argv 1793 #if OMPT_SUPPORT 1794 , 1795 exit_frame_p 1796 #endif 1797 ); 1798 } 1799 1800 #if OMPT_SUPPORT 1801 if (ompt_enabled.enabled) { 1802 *exit_frame_p = NULL; 1803 if (ompt_enabled.ompt_callback_implicit_task) { 1804 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1805 ompt_scope_end, NULL, &(task_info->task_data), 1, 1806 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1807 ompt_task_implicit); 1808 } 1809 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1810 __ompt_lw_taskteam_unlink(master_th); 1811 if (ompt_enabled.ompt_callback_parallel_end) { 1812 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1813 &ompt_parallel_data, parent_task_data, 1814 OMPT_INVOKER(call_context) | ompt_parallel_team, 1815 return_address); 1816 } 1817 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1818 } 1819 #endif 1820 } else if (microtask == (microtask_t)__kmp_teams_master) { 1821 KMP_DEBUG_ASSERT(master_th->th.th_team == 1822 master_th->th.th_serial_team); 1823 team = master_th->th.th_team; 1824 // team->t.t_pkfn = microtask; 1825 team->t.t_invoke = invoker; 1826 __kmp_alloc_argv_entries(argc, team, TRUE); 1827 
team->t.t_argc = argc; 1828 argv = (void **)team->t.t_argv; 1829 if (ap) { 1830 for (i = argc - 1; i >= 0; --i) 1831 *argv++ = va_arg(kmp_va_deref(ap), void *); 1832 } else { 1833 for (i = 0; i < argc; ++i) 1834 // Get args from parent team for teams construct 1835 argv[i] = parent_team->t.t_argv[i]; 1836 } 1837 // AC: revert change made in __kmpc_serialized_parallel() 1838 // because initial code in teams should have level=0 1839 team->t.t_level--; 1840 // AC: call special invoker for outer "parallel" of teams construct 1841 invoker(gtid); 1842 #if OMPT_SUPPORT 1843 if (ompt_enabled.enabled) { 1844 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1845 if (ompt_enabled.ompt_callback_implicit_task) { 1846 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1847 ompt_scope_end, NULL, &(task_info->task_data), 0, 1848 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1849 } 1850 if (ompt_enabled.ompt_callback_parallel_end) { 1851 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1852 &ompt_parallel_data, parent_task_data, 1853 OMPT_INVOKER(call_context) | ompt_parallel_league, 1854 return_address); 1855 } 1856 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1857 } 1858 #endif 1859 } else { 1860 argv = args; 1861 for (i = argc - 1; i >= 0; --i) 1862 *argv++ = va_arg(kmp_va_deref(ap), void *); 1863 KMP_MB(); 1864 1865 #if OMPT_SUPPORT 1866 void *dummy; 1867 void **exit_frame_p; 1868 ompt_task_info_t *task_info; 1869 1870 ompt_lw_taskteam_t lw_taskteam; 1871 1872 if (ompt_enabled.enabled) { 1873 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1874 &ompt_parallel_data, return_address); 1875 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1876 // don't use lw_taskteam after linking. 
content was swaped 1877 task_info = OMPT_CUR_TASK_INFO(master_th); 1878 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1879 1880 /* OMPT implicit task begin */ 1881 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1882 if (ompt_enabled.ompt_callback_implicit_task) { 1883 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1884 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1885 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1886 ompt_task_implicit); 1887 OMPT_CUR_TASK_INFO(master_th) 1888 ->thread_num = __kmp_tid_from_gtid(gtid); 1889 } 1890 1891 /* OMPT state */ 1892 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1893 } else { 1894 exit_frame_p = &dummy; 1895 } 1896 #endif 1897 1898 { 1899 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1900 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1901 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1902 #if OMPT_SUPPORT 1903 , 1904 exit_frame_p 1905 #endif 1906 ); 1907 } 1908 1909 #if OMPT_SUPPORT 1910 if (ompt_enabled.enabled) { 1911 *exit_frame_p = NULL; 1912 if (ompt_enabled.ompt_callback_implicit_task) { 1913 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1914 ompt_scope_end, NULL, &(task_info->task_data), 1, 1915 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1916 ompt_task_implicit); 1917 } 1918 1919 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1920 __ompt_lw_taskteam_unlink(master_th); 1921 if (ompt_enabled.ompt_callback_parallel_end) { 1922 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1923 &ompt_parallel_data, parent_task_data, 1924 OMPT_INVOKER(call_context) | ompt_parallel_team, 1925 return_address); 1926 } 1927 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1928 } 1929 #endif 1930 } 1931 } else if (call_context == fork_context_gnu) { 1932 #if OMPT_SUPPORT 1933 ompt_lw_taskteam_t lwt; 1934 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1935 return_address); 1936 1937 lwt.ompt_task_info.frame.exit_frame = 
ompt_data_none; 1938 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1939 // don't use lw_taskteam after linking. content was swaped 1940 #endif 1941 1942 // we were called from GNU native code 1943 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1944 return FALSE; 1945 } else { 1946 KMP_ASSERT2(call_context < fork_context_last, 1947 "__kmp_fork_call: unknown fork_context parameter"); 1948 } 1949 1950 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1951 KMP_MB(); 1952 return FALSE; 1953 } // if (nthreads == 1) 1954 1955 // GEH: only modify the executing flag in the case when not serialized 1956 // serialized case is handled in kmpc_serialized_parallel 1957 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1958 "curtask=%p, curtask_max_aclevel=%d\n", 1959 parent_team->t.t_active_level, master_th, 1960 master_th->th.th_current_task, 1961 master_th->th.th_current_task->td_icvs.max_active_levels)); 1962 // TODO: GEH - cannot do this assertion because root thread not set up as 1963 // executing 1964 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1965 master_th->th.th_current_task->td_flags.executing = 0; 1966 1967 if (!master_th->th.th_teams_microtask || level > teams_level) { 1968 /* Increment our nested depth level */ 1969 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1970 } 1971 1972 // See if we need to make a copy of the ICVs. 1973 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1974 if ((level + 1 < __kmp_nested_nth.used) && 1975 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1976 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1977 } else { 1978 nthreads_icv = 0; // don't update 1979 } 1980 1981 // Figure out the proc_bind_policy for the new team. 
1982 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1983 kmp_proc_bind_t proc_bind_icv = 1984 proc_bind_default; // proc_bind_default means don't update 1985 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1986 proc_bind = proc_bind_false; 1987 } else { 1988 if (proc_bind == proc_bind_default) { 1989 // No proc_bind clause specified; use current proc-bind-var for this 1990 // parallel region 1991 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1992 } 1993 /* else: The proc_bind policy was specified explicitly on parallel clause. 1994 This overrides proc-bind-var for this parallel region, but does not 1995 change proc-bind-var. */ 1996 // Figure the value of proc-bind-var for the child threads. 1997 if ((level + 1 < __kmp_nested_proc_bind.used) && 1998 (__kmp_nested_proc_bind.bind_types[level + 1] != 1999 master_th->th.th_current_task->td_icvs.proc_bind)) { 2000 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 2001 } 2002 } 2003 2004 // Reset for next parallel region 2005 master_th->th.th_set_proc_bind = proc_bind_default; 2006 2007 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 2008 kmp_internal_control_t new_icvs; 2009 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2010 new_icvs.next = NULL; 2011 if (nthreads_icv > 0) { 2012 new_icvs.nproc = nthreads_icv; 2013 } 2014 if (proc_bind_icv != proc_bind_default) { 2015 new_icvs.proc_bind = proc_bind_icv; 2016 } 2017 2018 /* allocate a new parallel team */ 2019 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2020 team = __kmp_allocate_team(root, nthreads, nthreads, 2021 #if OMPT_SUPPORT 2022 ompt_parallel_data, 2023 #endif 2024 proc_bind, &new_icvs, 2025 argc USE_NESTED_HOT_ARG(master_th)); 2026 } else { 2027 /* allocate a new parallel team */ 2028 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2029 team = __kmp_allocate_team(root, nthreads, nthreads, 2030 #if OMPT_SUPPORT 2031 
ompt_parallel_data, 2032 #endif 2033 proc_bind, 2034 &master_th->th.th_current_task->td_icvs, 2035 argc USE_NESTED_HOT_ARG(master_th)); 2036 } 2037 KF_TRACE( 2038 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2039 2040 /* setup the new team */ 2041 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2042 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2043 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2044 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2045 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2046 #if OMPT_SUPPORT 2047 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2048 return_address); 2049 #endif 2050 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2051 // TODO: parent_team->t.t_level == INT_MAX ??? 2052 if (!master_th->th.th_teams_microtask || level > teams_level) { 2053 int new_level = parent_team->t.t_level + 1; 2054 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2055 new_level = parent_team->t.t_active_level + 1; 2056 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2057 } else { 2058 // AC: Do not increase parallel level at start of the teams construct 2059 int new_level = parent_team->t.t_level; 2060 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2061 new_level = parent_team->t.t_active_level; 2062 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2063 } 2064 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2065 // set master's schedule as new run-time schedule 2066 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2067 2068 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2069 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2070 2071 // Update the floating point rounding in the team if required. 2072 propagateFPControl(team); 2073 2074 if (__kmp_tasking_mode != tskm_immediate_exec) { 2075 // Set master's task team to team's task team. Unless this is hot team, it 2076 // should be NULL. 
2077 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2078 parent_team->t.t_task_team[master_th->th.th_task_state]); 2079 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2080 "%p, new task_team %p / team %p\n", 2081 __kmp_gtid_from_thread(master_th), 2082 master_th->th.th_task_team, parent_team, 2083 team->t.t_task_team[master_th->th.th_task_state], team)); 2084 2085 if (active_level || master_th->th.th_task_team) { 2086 // Take a memo of master's task_state 2087 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2088 if (master_th->th.th_task_state_top >= 2089 master_th->th.th_task_state_stack_sz) { // increase size 2090 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2091 kmp_uint8 *old_stack, *new_stack; 2092 kmp_uint32 i; 2093 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2094 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2095 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2096 } 2097 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2098 ++i) { // zero-init rest of stack 2099 new_stack[i] = 0; 2100 } 2101 old_stack = master_th->th.th_task_state_memo_stack; 2102 master_th->th.th_task_state_memo_stack = new_stack; 2103 master_th->th.th_task_state_stack_sz = new_size; 2104 __kmp_free(old_stack); 2105 } 2106 // Store master's task_state on stack 2107 master_th->th 2108 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2109 master_th->th.th_task_state; 2110 master_th->th.th_task_state_top++; 2111 #if KMP_NESTED_HOT_TEAMS 2112 if (master_th->th.th_hot_teams && 2113 active_level < __kmp_hot_teams_max_level && 2114 team == master_th->th.th_hot_teams[active_level].hot_team) { 2115 // Restore master's nested state if nested hot team 2116 master_th->th.th_task_state = 2117 master_th->th 2118 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2119 } else { 2120 #endif 2121 master_th->th.th_task_state = 0; 2122 #if KMP_NESTED_HOT_TEAMS 2123 } 2124 #endif 2125 } 2126 
#if !KMP_NESTED_HOT_TEAMS 2127 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2128 (team == root->r.r_hot_team)); 2129 #endif 2130 } 2131 2132 KA_TRACE( 2133 20, 2134 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2135 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2136 team->t.t_nproc)); 2137 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2138 (team->t.t_master_tid == 0 && 2139 (team->t.t_parent == root->r.r_root_team || 2140 team->t.t_parent->t.t_serialized))); 2141 KMP_MB(); 2142 2143 /* now, setup the arguments */ 2144 argv = (void **)team->t.t_argv; 2145 if (ap) { 2146 for (i = argc - 1; i >= 0; --i) { 2147 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2148 KMP_CHECK_UPDATE(*argv, new_argv); 2149 argv++; 2150 } 2151 } else { 2152 for (i = 0; i < argc; ++i) { 2153 // Get args from parent team for teams construct 2154 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2155 } 2156 } 2157 2158 /* now actually fork the threads */ 2159 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2160 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2161 root->r.r_active = TRUE; 2162 2163 __kmp_fork_team_threads(root, team, master_th, gtid); 2164 __kmp_setup_icv_copy(team, nthreads, 2165 &master_th->th.th_current_task->td_icvs, loc); 2166 2167 #if OMPT_SUPPORT 2168 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2169 #endif 2170 2171 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2172 2173 #if USE_ITT_BUILD 2174 if (team->t.t_active_level == 1 // only report frames at level 1 2175 && !master_th->th.th_teams_microtask) { // not in teams construct 2176 #if USE_ITT_NOTIFY 2177 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2178 (__kmp_forkjoin_frames_mode == 3 || 2179 __kmp_forkjoin_frames_mode == 1)) { 2180 kmp_uint64 tmp_time = 0; 2181 if (__itt_get_timestamp_ptr) 2182 tmp_time = __itt_get_timestamp(); 2183 // Internal fork - report frame begin 2184 
master_th->th.th_frame_time = tmp_time; 2185 if (__kmp_forkjoin_frames_mode == 3) 2186 team->t.t_region_time = tmp_time; 2187 } else 2188 // only one notification scheme (either "submit" or "forking/joined", not both) 2189 #endif /* USE_ITT_NOTIFY */ 2190 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2191 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2192 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2193 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2194 } 2195 } 2196 #endif /* USE_ITT_BUILD */ 2197 2198 /* now go on and do the work */ 2199 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2200 KMP_MB(); 2201 KF_TRACE(10, 2202 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2203 root, team, master_th, gtid)); 2204 2205 #if USE_ITT_BUILD 2206 if (__itt_stack_caller_create_ptr) { 2207 team->t.t_stack_id = 2208 __kmp_itt_stack_caller_create(); // create new stack stitching id 2209 // before entering fork barrier 2210 } 2211 #endif /* USE_ITT_BUILD */ 2212 2213 // AC: skip __kmp_internal_fork at teams construct, let only master 2214 // threads execute 2215 if (ap) { 2216 __kmp_internal_fork(loc, gtid, team); 2217 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2218 "master_th=%p, gtid=%d\n", 2219 root, team, master_th, gtid)); 2220 } 2221 2222 if (call_context == fork_context_gnu) { 2223 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2224 return TRUE; 2225 } 2226 2227 /* Invoke microtask for MASTER thread */ 2228 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2229 team->t.t_id, team->t.t_pkfn)); 2230 } // END of timer KMP_fork_call block 2231 2232 #if KMP_STATS_ENABLED 2233 // If beginning a teams construct, then change thread state 2234 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2235 if (!ap) { 2236 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2237 } 2238 #endif 2239 2240 if (!team->t.t_invoke(gtid)) { 2241 
KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2242 } 2243 2244 #if KMP_STATS_ENABLED 2245 // If was beginning of a teams construct, then reset thread state 2246 if (!ap) { 2247 KMP_SET_THREAD_STATE(previous_state); 2248 } 2249 #endif 2250 2251 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2252 team->t.t_id, team->t.t_pkfn)); 2253 KMP_MB(); /* Flush all pending memory write invalidates. */ 2254 2255 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2256 2257 #if OMPT_SUPPORT 2258 if (ompt_enabled.enabled) { 2259 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2260 } 2261 #endif 2262 2263 return TRUE; 2264 } 2265 2266 #if OMPT_SUPPORT 2267 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2268 kmp_team_t *team) { 2269 // restore state outside the region 2270 thread->th.ompt_thread_info.state = 2271 ((team->t.t_serialized) ? ompt_state_work_serial 2272 : ompt_state_work_parallel); 2273 } 2274 2275 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2276 kmp_team_t *team, ompt_data_t *parallel_data, 2277 int flags, void *codeptr) { 2278 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2279 if (ompt_enabled.ompt_callback_parallel_end) { 2280 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2281 parallel_data, &(task_info->task_data), flags, codeptr); 2282 } 2283 2284 task_info->frame.enter_frame = ompt_data_none; 2285 __kmp_join_restore_state(thread, team); 2286 } 2287 #endif 2288 2289 void __kmp_join_call(ident_t *loc, int gtid 2290 #if OMPT_SUPPORT 2291 , 2292 enum fork_context_e fork_context 2293 #endif 2294 , 2295 int exit_teams) { 2296 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2297 kmp_team_t *team; 2298 kmp_team_t *parent_team; 2299 kmp_info_t *master_th; 2300 kmp_root_t *root; 2301 int master_active; 2302 2303 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2304 2305 /* setup current data */ 2306 master_th = __kmp_threads[gtid]; 
2307 root = master_th->th.th_root; 2308 team = master_th->th.th_team; 2309 parent_team = team->t.t_parent; 2310 2311 master_th->th.th_ident = loc; 2312 2313 #if OMPT_SUPPORT 2314 void *team_microtask = (void *)team->t.t_pkfn; 2315 // For GOMP interface with serialized parallel, need the 2316 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2317 // and end-parallel events. 2318 if (ompt_enabled.enabled && 2319 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2320 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2321 } 2322 #endif 2323 2324 #if KMP_DEBUG 2325 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2326 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2327 "th_task_team = %p\n", 2328 __kmp_gtid_from_thread(master_th), team, 2329 team->t.t_task_team[master_th->th.th_task_state], 2330 master_th->th.th_task_team)); 2331 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2332 team->t.t_task_team[master_th->th.th_task_state]); 2333 } 2334 #endif 2335 2336 if (team->t.t_serialized) { 2337 if (master_th->th.th_teams_microtask) { 2338 // We are in teams construct 2339 int level = team->t.t_level; 2340 int tlevel = master_th->th.th_teams_level; 2341 if (level == tlevel) { 2342 // AC: we haven't incremented it earlier at start of teams construct, 2343 // so do it here - at the end of teams construct 2344 team->t.t_level++; 2345 } else if (level == tlevel + 1) { 2346 // AC: we are exiting parallel inside teams, need to increment 2347 // serialization in order to restore it in the next call to 2348 // __kmpc_end_serialized_parallel 2349 team->t.t_serialized++; 2350 } 2351 } 2352 __kmpc_end_serialized_parallel(loc, gtid); 2353 2354 #if OMPT_SUPPORT 2355 if (ompt_enabled.enabled) { 2356 __kmp_join_restore_state(master_th, parent_team); 2357 } 2358 #endif 2359 2360 return; 2361 } 2362 2363 master_active = team->t.t_master_active; 2364 2365 if (!exit_teams) { 2366 // AC: No barrier for 
internal teams at exit from teams construct. 2367 // But there is barrier for external team (league). 2368 __kmp_internal_join(loc, gtid, team); 2369 } else { 2370 master_th->th.th_task_state = 2371 0; // AC: no tasking in teams (out of any parallel) 2372 } 2373 2374 KMP_MB(); 2375 2376 #if OMPT_SUPPORT 2377 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2378 void *codeptr = team->t.ompt_team_info.master_return_address; 2379 #endif 2380 2381 #if USE_ITT_BUILD 2382 if (__itt_stack_caller_create_ptr) { 2383 // destroy the stack stitching id after join barrier 2384 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2385 } 2386 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2387 if (team->t.t_active_level == 1 && 2388 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2389 master_th->th.th_teams_size.nteams == 1)) { 2390 master_th->th.th_ident = loc; 2391 // only one notification scheme (either "submit" or "forking/joined", not 2392 // both) 2393 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2394 __kmp_forkjoin_frames_mode == 3) 2395 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2396 master_th->th.th_frame_time, 0, loc, 2397 master_th->th.th_team_nproc, 1); 2398 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2399 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2400 __kmp_itt_region_joined(gtid); 2401 } // active_level == 1 2402 #endif /* USE_ITT_BUILD */ 2403 2404 if (master_th->th.th_teams_microtask && !exit_teams && 2405 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2406 team->t.t_level == master_th->th.th_teams_level + 1) { 2407 // AC: We need to leave the team structure intact at the end of parallel 2408 // inside the teams construct, so that at the next parallel same (hot) team 2409 // works, only adjust nesting levels 2410 #if OMPT_SUPPORT 2411 ompt_data_t ompt_parallel_data = ompt_data_none; 2412 if (ompt_enabled.enabled) { 2413 ompt_task_info_t *task_info = 
__ompt_get_task_info_object(0); 2414 if (ompt_enabled.ompt_callback_implicit_task) { 2415 int ompt_team_size = team->t.t_nproc; 2416 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2417 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2418 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2419 } 2420 task_info->frame.exit_frame = ompt_data_none; 2421 task_info->task_data = ompt_data_none; 2422 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2423 __ompt_lw_taskteam_unlink(master_th); 2424 } 2425 #endif 2426 /* Decrement our nested depth level */ 2427 team->t.t_level--; 2428 team->t.t_active_level--; 2429 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2430 2431 // Restore number of threads in the team if needed. This code relies on 2432 // the proper adjustment of th_teams_size.nth after the fork in 2433 // __kmp_teams_master on each teams master in the case that 2434 // __kmp_reserve_threads reduced it. 2435 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2436 int old_num = master_th->th.th_team_nproc; 2437 int new_num = master_th->th.th_teams_size.nth; 2438 kmp_info_t **other_threads = team->t.t_threads; 2439 team->t.t_nproc = new_num; 2440 for (int i = 0; i < old_num; ++i) { 2441 other_threads[i]->th.th_team_nproc = new_num; 2442 } 2443 // Adjust states of non-used threads of the team 2444 for (int i = old_num; i < new_num; ++i) { 2445 // Re-initialize thread's barrier data. 
2446 KMP_DEBUG_ASSERT(other_threads[i]); 2447 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2448 for (int b = 0; b < bs_last_barrier; ++b) { 2449 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2450 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2451 #if USE_DEBUGGER 2452 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2453 #endif 2454 } 2455 if (__kmp_tasking_mode != tskm_immediate_exec) { 2456 // Synchronize thread's task state 2457 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2458 } 2459 } 2460 } 2461 2462 #if OMPT_SUPPORT 2463 if (ompt_enabled.enabled) { 2464 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2465 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2466 } 2467 #endif 2468 2469 return; 2470 } 2471 2472 /* do cleanup and restore the parent team */ 2473 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2474 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2475 2476 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2477 2478 /* jc: The following lock has instructions with REL and ACQ semantics, 2479 separating the parallel user code called in this parallel region 2480 from the serial user code called after this function returns. */ 2481 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2482 2483 if (!master_th->th.th_teams_microtask || 2484 team->t.t_level > master_th->th.th_teams_level) { 2485 /* Decrement our nested depth level */ 2486 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2487 } 2488 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2489 2490 #if OMPT_SUPPORT 2491 if (ompt_enabled.enabled) { 2492 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2493 if (ompt_enabled.ompt_callback_implicit_task) { 2494 int flags = (team_microtask == (void *)__kmp_teams_master) 2495 ? ompt_task_initial 2496 : ompt_task_implicit; 2497 int ompt_team_size = (flags == ompt_task_initial) ? 
0 : team->t.t_nproc; 2498 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2499 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2500 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2501 } 2502 task_info->frame.exit_frame = ompt_data_none; 2503 task_info->task_data = ompt_data_none; 2504 } 2505 #endif 2506 2507 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2508 master_th, team)); 2509 __kmp_pop_current_task_from_thread(master_th); 2510 2511 #if KMP_AFFINITY_SUPPORTED 2512 // Restore master thread's partition. 2513 master_th->th.th_first_place = team->t.t_first_place; 2514 master_th->th.th_last_place = team->t.t_last_place; 2515 #endif // KMP_AFFINITY_SUPPORTED 2516 master_th->th.th_def_allocator = team->t.t_def_allocator; 2517 2518 updateHWFPControl(team); 2519 2520 if (root->r.r_active != master_active) 2521 root->r.r_active = master_active; 2522 2523 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2524 master_th)); // this will free worker threads 2525 2526 /* this race was fun to find. make sure the following is in the critical 2527 region otherwise assertions may fail occasionally since the old team may be 2528 reallocated and the hierarchy appears inconsistent. it is actually safe to 2529 run and won't cause any bugs, but will cause those assertion failures. 
it's 2530 only one deref&assign so might as well put this in the critical region */ 2531 master_th->th.th_team = parent_team; 2532 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2533 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2534 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2535 2536 /* restore serialized team, if need be */ 2537 if (parent_team->t.t_serialized && 2538 parent_team != master_th->th.th_serial_team && 2539 parent_team != root->r.r_root_team) { 2540 __kmp_free_team(root, 2541 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2542 master_th->th.th_serial_team = parent_team; 2543 } 2544 2545 if (__kmp_tasking_mode != tskm_immediate_exec) { 2546 if (master_th->th.th_task_state_top > 2547 0) { // Restore task state from memo stack 2548 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2549 // Remember master's state if we re-use this nested hot team 2550 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2551 master_th->th.th_task_state; 2552 --master_th->th.th_task_state_top; // pop 2553 // Now restore state at this level 2554 master_th->th.th_task_state = 2555 master_th->th 2556 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2557 } 2558 // Copy the task team from the parent team to the master thread 2559 master_th->th.th_task_team = 2560 parent_team->t.t_task_team[master_th->th.th_task_state]; 2561 KA_TRACE(20, 2562 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2563 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2564 parent_team)); 2565 } 2566 2567 // TODO: GEH - cannot do this assertion because root thread not set up as 2568 // executing 2569 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2570 master_th->th.th_current_task->td_flags.executing = 1; 2571 2572 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2573 2574 #if OMPT_SUPPORT 2575 int flags = 2576 OMPT_INVOKER(fork_context) | 2577 
((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2578 : ompt_parallel_team); 2579 if (ompt_enabled.enabled) { 2580 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2581 codeptr); 2582 } 2583 #endif 2584 2585 KMP_MB(); 2586 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2587 } 2588 2589 /* Check whether we should push an internal control record onto the 2590 serial team stack. If so, do it. */ 2591 void __kmp_save_internal_controls(kmp_info_t *thread) { 2592 2593 if (thread->th.th_team != thread->th.th_serial_team) { 2594 return; 2595 } 2596 if (thread->th.th_team->t.t_serialized > 1) { 2597 int push = 0; 2598 2599 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2600 push = 1; 2601 } else { 2602 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2603 thread->th.th_team->t.t_serialized) { 2604 push = 1; 2605 } 2606 } 2607 if (push) { /* push a record on the serial team's stack */ 2608 kmp_internal_control_t *control = 2609 (kmp_internal_control_t *)__kmp_allocate( 2610 sizeof(kmp_internal_control_t)); 2611 2612 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2613 2614 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2615 2616 control->next = thread->th.th_team->t.t_control_stack_top; 2617 thread->th.th_team->t.t_control_stack_top = control; 2618 } 2619 } 2620 } 2621 2622 /* Changes set_nproc */ 2623 void __kmp_set_num_threads(int new_nth, int gtid) { 2624 kmp_info_t *thread; 2625 kmp_root_t *root; 2626 2627 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2628 KMP_DEBUG_ASSERT(__kmp_init_serial); 2629 2630 if (new_nth < 1) 2631 new_nth = 1; 2632 else if (new_nth > __kmp_max_nth) 2633 new_nth = __kmp_max_nth; 2634 2635 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2636 thread = __kmp_threads[gtid]; 2637 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2638 return; // nothing to do 2639 2640 __kmp_save_internal_controls(thread); 2641 
2642 set__nproc(thread, new_nth); 2643 2644 // If this omp_set_num_threads() call will cause the hot team size to be 2645 // reduced (in the absence of a num_threads clause), then reduce it now, 2646 // rather than waiting for the next parallel region. 2647 root = thread->th.th_root; 2648 if (__kmp_init_parallel && (!root->r.r_active) && 2649 (root->r.r_hot_team->t.t_nproc > new_nth) 2650 #if KMP_NESTED_HOT_TEAMS 2651 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2652 #endif 2653 ) { 2654 kmp_team_t *hot_team = root->r.r_hot_team; 2655 int f; 2656 2657 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2658 2659 // Release the extra threads we don't need any more. 2660 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2661 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2662 if (__kmp_tasking_mode != tskm_immediate_exec) { 2663 // When decreasing team size, threads no longer in the team should unref 2664 // task team. 2665 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2666 } 2667 __kmp_free_thread(hot_team->t.t_threads[f]); 2668 hot_team->t.t_threads[f] = NULL; 2669 } 2670 hot_team->t.t_nproc = new_nth; 2671 #if KMP_NESTED_HOT_TEAMS 2672 if (thread->th.th_hot_teams) { 2673 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2674 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2675 } 2676 #endif 2677 2678 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2679 2680 // Update the t_nproc field in the threads that are still active. 
2681 for (f = 0; f < new_nth; f++) { 2682 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2683 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2684 } 2685 // Special flag in case omp_set_num_threads() call 2686 hot_team->t.t_size_changed = -1; 2687 } 2688 } 2689 2690 /* Changes max_active_levels */ 2691 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2692 kmp_info_t *thread; 2693 2694 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2695 "%d = (%d)\n", 2696 gtid, max_active_levels)); 2697 KMP_DEBUG_ASSERT(__kmp_init_serial); 2698 2699 // validate max_active_levels 2700 if (max_active_levels < 0) { 2701 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2702 // We ignore this call if the user has specified a negative value. 2703 // The current setting won't be changed. The last valid setting will be 2704 // used. A warning will be issued (if warnings are allowed as controlled by 2705 // the KMP_WARNINGS env var). 2706 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2707 "max_active_levels for thread %d = (%d)\n", 2708 gtid, max_active_levels)); 2709 return; 2710 } 2711 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2712 // it's OK, the max_active_levels is within the valid range: [ 0; 2713 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2714 // We allow a zero value. (implementation defined behavior) 2715 } else { 2716 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2717 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2718 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2719 // Current upper limit is MAX_INT. (implementation defined behavior) 2720 // If the input exceeds the upper limit, we correct the input to be the 2721 // upper limit. (implementation defined behavior) 2722 // Actually, the flow should never get here until we use MAX_INT limit. 
2723 } 2724 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2725 "max_active_levels for thread %d = (%d)\n", 2726 gtid, max_active_levels)); 2727 2728 thread = __kmp_threads[gtid]; 2729 2730 __kmp_save_internal_controls(thread); 2731 2732 set__max_active_levels(thread, max_active_levels); 2733 } 2734 2735 /* Gets max_active_levels */ 2736 int __kmp_get_max_active_levels(int gtid) { 2737 kmp_info_t *thread; 2738 2739 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2740 KMP_DEBUG_ASSERT(__kmp_init_serial); 2741 2742 thread = __kmp_threads[gtid]; 2743 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2744 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2745 "curtask_maxaclevel=%d\n", 2746 gtid, thread->th.th_current_task, 2747 thread->th.th_current_task->td_icvs.max_active_levels)); 2748 return thread->th.th_current_task->td_icvs.max_active_levels; 2749 } 2750 2751 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2752 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2753 2754 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2755 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2756 kmp_info_t *thread; 2757 kmp_sched_t orig_kind; 2758 // kmp_team_t *team; 2759 2760 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2761 gtid, (int)kind, chunk)); 2762 KMP_DEBUG_ASSERT(__kmp_init_serial); 2763 2764 // Check if the kind parameter is valid, correct if needed. 2765 // Valid parameters should fit in one of two intervals - standard or extended: 2766 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2767 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2768 orig_kind = kind; 2769 kind = __kmp_sched_without_mods(kind); 2770 2771 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2772 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2773 // TODO: Hint needs attention in case we change the default schedule. 
2774 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2775 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2776 __kmp_msg_null); 2777 kind = kmp_sched_default; 2778 chunk = 0; // ignore chunk value in case of bad kind 2779 } 2780 2781 thread = __kmp_threads[gtid]; 2782 2783 __kmp_save_internal_controls(thread); 2784 2785 if (kind < kmp_sched_upper_std) { 2786 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2787 // differ static chunked vs. unchunked: chunk should be invalid to 2788 // indicate unchunked schedule (which is the default) 2789 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2790 } else { 2791 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2792 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2793 } 2794 } else { 2795 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2796 // kmp_sched_lower - 2 ]; 2797 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2798 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2799 kmp_sched_lower - 2]; 2800 } 2801 __kmp_sched_apply_mods_intkind( 2802 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2803 if (kind == kmp_sched_auto || chunk < 1) { 2804 // ignore parameter chunk for schedule auto 2805 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2806 } else { 2807 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2808 } 2809 } 2810 2811 /* Gets def_sched_var ICV values */ 2812 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2813 kmp_info_t *thread; 2814 enum sched_type th_type; 2815 2816 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2817 KMP_DEBUG_ASSERT(__kmp_init_serial); 2818 2819 thread = __kmp_threads[gtid]; 2820 2821 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2822 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2823 case kmp_sch_static: 2824 case kmp_sch_static_greedy: 2825 case kmp_sch_static_balanced: 2826 *kind 
= kmp_sched_static; 2827 __kmp_sched_apply_mods_stdkind(kind, th_type); 2828 *chunk = 0; // chunk was not set, try to show this fact via zero value 2829 return; 2830 case kmp_sch_static_chunked: 2831 *kind = kmp_sched_static; 2832 break; 2833 case kmp_sch_dynamic_chunked: 2834 *kind = kmp_sched_dynamic; 2835 break; 2836 case kmp_sch_guided_chunked: 2837 case kmp_sch_guided_iterative_chunked: 2838 case kmp_sch_guided_analytical_chunked: 2839 *kind = kmp_sched_guided; 2840 break; 2841 case kmp_sch_auto: 2842 *kind = kmp_sched_auto; 2843 break; 2844 case kmp_sch_trapezoidal: 2845 *kind = kmp_sched_trapezoidal; 2846 break; 2847 #if KMP_STATIC_STEAL_ENABLED 2848 case kmp_sch_static_steal: 2849 *kind = kmp_sched_static_steal; 2850 break; 2851 #endif 2852 default: 2853 KMP_FATAL(UnknownSchedulingType, th_type); 2854 } 2855 2856 __kmp_sched_apply_mods_stdkind(kind, th_type); 2857 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2858 } 2859 2860 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2861 2862 int ii, dd; 2863 kmp_team_t *team; 2864 kmp_info_t *thr; 2865 2866 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2867 KMP_DEBUG_ASSERT(__kmp_init_serial); 2868 2869 // validate level 2870 if (level == 0) 2871 return 0; 2872 if (level < 0) 2873 return -1; 2874 thr = __kmp_threads[gtid]; 2875 team = thr->th.th_team; 2876 ii = team->t.t_level; 2877 if (level > ii) 2878 return -1; 2879 2880 if (thr->th.th_teams_microtask) { 2881 // AC: we are in teams region where multiple nested teams have same level 2882 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2883 if (level <= 2884 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2885 KMP_DEBUG_ASSERT(ii >= tlevel); 2886 // AC: As we need to pass by the teams league, we need to artificially 2887 // increase ii 2888 if (ii == tlevel) { 2889 ii += 2; // three teams have same level 2890 } else { 2891 ii++; // two teams have same level 2892 } 
2893 } 2894 } 2895 2896 if (ii == level) 2897 return __kmp_tid_from_gtid(gtid); 2898 2899 dd = team->t.t_serialized; 2900 level++; 2901 while (ii > level) { 2902 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2903 } 2904 if ((team->t.t_serialized) && (!dd)) { 2905 team = team->t.t_parent; 2906 continue; 2907 } 2908 if (ii > level) { 2909 team = team->t.t_parent; 2910 dd = team->t.t_serialized; 2911 ii--; 2912 } 2913 } 2914 2915 return (dd > 1) ? (0) : (team->t.t_master_tid); 2916 } 2917 2918 int __kmp_get_team_size(int gtid, int level) { 2919 2920 int ii, dd; 2921 kmp_team_t *team; 2922 kmp_info_t *thr; 2923 2924 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2925 KMP_DEBUG_ASSERT(__kmp_init_serial); 2926 2927 // validate level 2928 if (level == 0) 2929 return 1; 2930 if (level < 0) 2931 return -1; 2932 thr = __kmp_threads[gtid]; 2933 team = thr->th.th_team; 2934 ii = team->t.t_level; 2935 if (level > ii) 2936 return -1; 2937 2938 if (thr->th.th_teams_microtask) { 2939 // AC: we are in teams region where multiple nested teams have same level 2940 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2941 if (level <= 2942 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2943 KMP_DEBUG_ASSERT(ii >= tlevel); 2944 // AC: As we need to pass by the teams league, we need to artificially 2945 // increase ii 2946 if (ii == tlevel) { 2947 ii += 2; // three teams have same level 2948 } else { 2949 ii++; // two teams have same level 2950 } 2951 } 2952 } 2953 2954 while (ii > level) { 2955 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2956 } 2957 if (team->t.t_serialized && (!dd)) { 2958 team = team->t.t_parent; 2959 continue; 2960 } 2961 if (ii > level) { 2962 team = team->t.t_parent; 2963 ii--; 2964 } 2965 } 2966 2967 return team->t.t_nproc; 2968 } 2969 2970 kmp_r_sched_t __kmp_get_schedule_global() { 2971 // This routine created because pairs (__kmp_sched, 
__kmp_chunk) and 2972 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2973 // independently. So one can get the updated schedule here. 2974 2975 kmp_r_sched_t r_sched; 2976 2977 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2978 // __kmp_guided. __kmp_sched should keep original value, so that user can set 2979 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2980 // different roots (even in OMP 2.5) 2981 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2982 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2983 if (s == kmp_sch_static) { 2984 // replace STATIC with more detailed schedule (balanced or greedy) 2985 r_sched.r_sched_type = __kmp_static; 2986 } else if (s == kmp_sch_guided_chunked) { 2987 // replace GUIDED with more detailed schedule (iterative or analytical) 2988 r_sched.r_sched_type = __kmp_guided; 2989 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2990 r_sched.r_sched_type = __kmp_sched; 2991 } 2992 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2993 2994 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2995 // __kmp_chunk may be wrong here (if it was not ever set) 2996 r_sched.chunk = KMP_DEFAULT_CHUNK; 2997 } else { 2998 r_sched.chunk = __kmp_chunk; 2999 } 3000 3001 return r_sched; 3002 } 3003 3004 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3005 at least argc number of *t_argv entries for the requested team. */ 3006 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3007 3008 KMP_DEBUG_ASSERT(team); 3009 if (!realloc || argc > team->t.t_max_argc) { 3010 3011 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3012 "current entries=%d\n", 3013 team->t.t_id, argc, (realloc) ? 
team->t.t_max_argc : 0)); 3014 /* if previously allocated heap space for args, free them */ 3015 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3016 __kmp_free((void *)team->t.t_argv); 3017 3018 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3019 /* use unused space in the cache line for arguments */ 3020 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3021 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3022 "argv entries\n", 3023 team->t.t_id, team->t.t_max_argc)); 3024 team->t.t_argv = &team->t.t_inline_argv[0]; 3025 if (__kmp_storage_map) { 3026 __kmp_print_storage_map_gtid( 3027 -1, &team->t.t_inline_argv[0], 3028 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3029 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3030 team->t.t_id); 3031 } 3032 } else { 3033 /* allocate space for arguments in the heap */ 3034 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3035 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3036 : 2 * argc; 3037 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3038 "argv entries\n", 3039 team->t.t_id, team->t.t_max_argc)); 3040 team->t.t_argv = 3041 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3042 if (__kmp_storage_map) { 3043 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3044 &team->t.t_argv[team->t.t_max_argc], 3045 sizeof(void *) * team->t.t_max_argc, 3046 "team_%d.t_argv", team->t.t_id); 3047 } 3048 } 3049 } 3050 } 3051 3052 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3053 int i; 3054 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3055 team->t.t_threads = 3056 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3057 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3058 sizeof(dispatch_shared_info_t) * num_disp_buff); 3059 team->t.t_dispatch = 3060 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3061 team->t.t_implicit_task_taskdata = 3062 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3063 team->t.t_max_nproc = max_nth; 3064 3065 /* setup dispatch buffers */ 3066 for (i = 0; i < num_disp_buff; ++i) { 3067 team->t.t_disp_buffer[i].buffer_index = i; 3068 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3069 } 3070 } 3071 3072 static void __kmp_free_team_arrays(kmp_team_t *team) { 3073 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3074 int i; 3075 for (i = 0; i < team->t.t_max_nproc; ++i) { 3076 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3077 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3078 team->t.t_dispatch[i].th_disp_buffer = NULL; 3079 } 3080 } 3081 #if KMP_USE_HIER_SCHED 3082 __kmp_dispatch_free_hierarchies(team); 3083 #endif 3084 __kmp_free(team->t.t_threads); 3085 __kmp_free(team->t.t_disp_buffer); 3086 __kmp_free(team->t.t_dispatch); 3087 __kmp_free(team->t.t_implicit_task_taskdata); 3088 team->t.t_threads = NULL; 3089 team->t.t_disp_buffer = NULL; 3090 team->t.t_dispatch = NULL; 3091 team->t.t_implicit_task_taskdata = 0; 3092 } 3093 3094 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3095 kmp_info_t **oldThreads = team->t.t_threads; 3096 3097 __kmp_free(team->t.t_disp_buffer); 3098 __kmp_free(team->t.t_dispatch); 3099 __kmp_free(team->t.t_implicit_task_taskdata); 3100 __kmp_allocate_team_arrays(team, max_nth); 3101 3102 KMP_MEMCPY(team->t.t_threads, oldThreads, 3103 team->t.t_nproc * sizeof(kmp_info_t *)); 3104 3105 __kmp_free(oldThreads); 3106 } 3107 3108 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3109 3110 
kmp_r_sched_t r_sched = 3111 __kmp_get_schedule_global(); // get current state of scheduling globals 3112 3113 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3114 3115 kmp_internal_control_t g_icvs = { 3116 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3117 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3118 // adjustment of threads (per thread) 3119 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3120 // whether blocktime is explicitly set 3121 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3122 #if KMP_USE_MONITOR 3123 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3124 // intervals 3125 #endif 3126 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3127 // next parallel region (per thread) 3128 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3129 __kmp_cg_max_nth, // int thread_limit; 3130 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3131 // for max_active_levels 3132 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3133 // {sched,chunk} pair 3134 __kmp_nested_proc_bind.bind_types[0], 3135 __kmp_default_device, 3136 NULL // struct kmp_internal_control *next; 3137 }; 3138 3139 return g_icvs; 3140 } 3141 3142 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3143 3144 kmp_internal_control_t gx_icvs; 3145 gx_icvs.serial_nesting_level = 3146 0; // probably =team->t.t_serial like in save_inter_controls 3147 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3148 gx_icvs.next = NULL; 3149 3150 return gx_icvs; 3151 } 3152 3153 static void __kmp_initialize_root(kmp_root_t *root) { 3154 int f; 3155 kmp_team_t *root_team; 3156 kmp_team_t *hot_team; 3157 int hot_team_max_nth; 3158 kmp_r_sched_t r_sched = 3159 __kmp_get_schedule_global(); // get current state of scheduling globals 3160 
kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3161 KMP_DEBUG_ASSERT(root); 3162 KMP_ASSERT(!root->r.r_begin); 3163 3164 /* setup the root state structure */ 3165 __kmp_init_lock(&root->r.r_begin_lock); 3166 root->r.r_begin = FALSE; 3167 root->r.r_active = FALSE; 3168 root->r.r_in_parallel = 0; 3169 root->r.r_blocktime = __kmp_dflt_blocktime; 3170 3171 /* setup the root team for this task */ 3172 /* allocate the root team structure */ 3173 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3174 3175 root_team = 3176 __kmp_allocate_team(root, 3177 1, // new_nproc 3178 1, // max_nproc 3179 #if OMPT_SUPPORT 3180 ompt_data_none, // root parallel id 3181 #endif 3182 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3183 0 // argc 3184 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3185 ); 3186 #if USE_DEBUGGER 3187 // Non-NULL value should be assigned to make the debugger display the root 3188 // team. 3189 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3190 #endif 3191 3192 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3193 3194 root->r.r_root_team = root_team; 3195 root_team->t.t_control_stack_top = NULL; 3196 3197 /* initialize root team */ 3198 root_team->t.t_threads[0] = NULL; 3199 root_team->t.t_nproc = 1; 3200 root_team->t.t_serialized = 1; 3201 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3202 root_team->t.t_sched.sched = r_sched.sched; 3203 KA_TRACE( 3204 20, 3205 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3206 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3207 3208 /* setup the hot team for this task */ 3209 /* allocate the hot team structure */ 3210 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3211 3212 hot_team = 3213 __kmp_allocate_team(root, 3214 1, // new_nproc 3215 __kmp_dflt_team_nth_ub * 2, // max_nproc 3216 #if OMPT_SUPPORT 3217 ompt_data_none, // root parallel id 3218 #endif 3219 
__kmp_nested_proc_bind.bind_types[0], &r_icvs, 3220 0 // argc 3221 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3222 ); 3223 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3224 3225 root->r.r_hot_team = hot_team; 3226 root_team->t.t_control_stack_top = NULL; 3227 3228 /* first-time initialization */ 3229 hot_team->t.t_parent = root_team; 3230 3231 /* initialize hot team */ 3232 hot_team_max_nth = hot_team->t.t_max_nproc; 3233 for (f = 0; f < hot_team_max_nth; ++f) { 3234 hot_team->t.t_threads[f] = NULL; 3235 } 3236 hot_team->t.t_nproc = 1; 3237 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3238 hot_team->t.t_sched.sched = r_sched.sched; 3239 hot_team->t.t_size_changed = 0; 3240 } 3241 3242 #ifdef KMP_DEBUG 3243 3244 typedef struct kmp_team_list_item { 3245 kmp_team_p const *entry; 3246 struct kmp_team_list_item *next; 3247 } kmp_team_list_item_t; 3248 typedef kmp_team_list_item_t *kmp_team_list_t; 3249 3250 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3251 kmp_team_list_t list, // List of teams. 3252 kmp_team_p const *team // Team to add. 3253 ) { 3254 3255 // List must terminate with item where both entry and next are NULL. 3256 // Team is added to the list only once. 3257 // List is sorted in ascending order by team id. 3258 // Team id is *not* a key. 3259 3260 kmp_team_list_t l; 3261 3262 KMP_DEBUG_ASSERT(list != NULL); 3263 if (team == NULL) { 3264 return; 3265 } 3266 3267 __kmp_print_structure_team_accum(list, team->t.t_parent); 3268 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3269 3270 // Search list for the team. 3271 l = list; 3272 while (l->next != NULL && l->entry != team) { 3273 l = l->next; 3274 } 3275 if (l->next != NULL) { 3276 return; // Team has been added before, exit. 3277 } 3278 3279 // Team is not found. Search list again for insertion point. 
3280 l = list; 3281 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3282 l = l->next; 3283 } 3284 3285 // Insert team. 3286 { 3287 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3288 sizeof(kmp_team_list_item_t)); 3289 *item = *l; 3290 l->entry = team; 3291 l->next = item; 3292 } 3293 } 3294 3295 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3296 3297 ) { 3298 __kmp_printf("%s", title); 3299 if (team != NULL) { 3300 __kmp_printf("%2x %p\n", team->t.t_id, team); 3301 } else { 3302 __kmp_printf(" - (nil)\n"); 3303 } 3304 } 3305 3306 static void __kmp_print_structure_thread(char const *title, 3307 kmp_info_p const *thread) { 3308 __kmp_printf("%s", title); 3309 if (thread != NULL) { 3310 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3311 } else { 3312 __kmp_printf(" - (nil)\n"); 3313 } 3314 } 3315 3316 void __kmp_print_structure(void) { 3317 3318 kmp_team_list_t list; 3319 3320 // Initialize list of teams. 3321 list = 3322 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3323 list->entry = NULL; 3324 list->next = NULL; 3325 3326 __kmp_printf("\n------------------------------\nGlobal Thread " 3327 "Table\n------------------------------\n"); 3328 { 3329 int gtid; 3330 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3331 __kmp_printf("%2d", gtid); 3332 if (__kmp_threads != NULL) { 3333 __kmp_printf(" %p", __kmp_threads[gtid]); 3334 } 3335 if (__kmp_root != NULL) { 3336 __kmp_printf(" %p", __kmp_root[gtid]); 3337 } 3338 __kmp_printf("\n"); 3339 } 3340 } 3341 3342 // Print out __kmp_threads array. 
3343 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3344 "----------\n"); 3345 if (__kmp_threads != NULL) { 3346 int gtid; 3347 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3348 kmp_info_t const *thread = __kmp_threads[gtid]; 3349 if (thread != NULL) { 3350 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3351 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3352 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3353 __kmp_print_structure_team(" Serial Team: ", 3354 thread->th.th_serial_team); 3355 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3356 __kmp_print_structure_thread(" Master: ", 3357 thread->th.th_team_master); 3358 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3359 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3360 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3361 __kmp_print_structure_thread(" Next in pool: ", 3362 thread->th.th_next_pool); 3363 __kmp_printf("\n"); 3364 __kmp_print_structure_team_accum(list, thread->th.th_team); 3365 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3366 } 3367 } 3368 } else { 3369 __kmp_printf("Threads array is not allocated.\n"); 3370 } 3371 3372 // Print out __kmp_root array. 
3373 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3374 "--------\n"); 3375 if (__kmp_root != NULL) { 3376 int gtid; 3377 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3378 kmp_root_t const *root = __kmp_root[gtid]; 3379 if (root != NULL) { 3380 __kmp_printf("GTID %2d %p:\n", gtid, root); 3381 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3382 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3383 __kmp_print_structure_thread(" Uber Thread: ", 3384 root->r.r_uber_thread); 3385 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3386 __kmp_printf(" In Parallel: %2d\n", 3387 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3388 __kmp_printf("\n"); 3389 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3390 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3391 } 3392 } 3393 } else { 3394 __kmp_printf("Ubers array is not allocated.\n"); 3395 } 3396 3397 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3398 "--------\n"); 3399 while (list->next != NULL) { 3400 kmp_team_p const *team = list->entry; 3401 int i; 3402 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3403 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3404 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3405 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3406 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3407 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3408 for (i = 0; i < team->t.t_nproc; ++i) { 3409 __kmp_printf(" Thread %2d: ", i); 3410 __kmp_print_structure_thread("", team->t.t_threads[i]); 3411 } 3412 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3413 __kmp_printf("\n"); 3414 list = list->next; 3415 } 3416 3417 // Print out __kmp_thread_pool and __kmp_team_pool. 
3418 __kmp_printf("\n------------------------------\nPools\n----------------------" 3419 "--------\n"); 3420 __kmp_print_structure_thread("Thread pool: ", 3421 CCAST(kmp_info_t *, __kmp_thread_pool)); 3422 __kmp_print_structure_team("Team pool: ", 3423 CCAST(kmp_team_t *, __kmp_team_pool)); 3424 __kmp_printf("\n"); 3425 3426 // Free team list. 3427 while (list != NULL) { 3428 kmp_team_list_item_t *item = list; 3429 list = list->next; 3430 KMP_INTERNAL_FREE(item); 3431 } 3432 } 3433 3434 #endif 3435 3436 //--------------------------------------------------------------------------- 3437 // Stuff for per-thread fast random number generator 3438 // Table of primes 3439 static const unsigned __kmp_primes[] = { 3440 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3441 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3442 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3443 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3444 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3445 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3446 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3447 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3448 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3449 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3450 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3451 3452 //--------------------------------------------------------------------------- 3453 // __kmp_get_random: Get a random number using a linear congruential method. 
3454 unsigned short __kmp_get_random(kmp_info_t *thread) { 3455 unsigned x = thread->th.th_x; 3456 unsigned short r = (unsigned short)(x >> 16); 3457 3458 thread->th.th_x = x * thread->th.th_a + 1; 3459 3460 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3461 thread->th.th_info.ds.ds_tid, r)); 3462 3463 return r; 3464 } 3465 //-------------------------------------------------------- 3466 // __kmp_init_random: Initialize a random number generator 3467 void __kmp_init_random(kmp_info_t *thread) { 3468 unsigned seed = thread->th.th_info.ds.ds_tid; 3469 3470 thread->th.th_a = 3471 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3472 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3473 KA_TRACE(30, 3474 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3475 } 3476 3477 #if KMP_OS_WINDOWS 3478 /* reclaim array entries for root threads that are already dead, returns number 3479 * reclaimed */ 3480 static int __kmp_reclaim_dead_roots(void) { 3481 int i, r = 0; 3482 3483 for (i = 0; i < __kmp_threads_capacity; ++i) { 3484 if (KMP_UBER_GTID(i) && 3485 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3486 !__kmp_root[i] 3487 ->r.r_active) { // AC: reclaim only roots died in non-active state 3488 r += __kmp_unregister_root_other_thread(i); 3489 } 3490 } 3491 return r; 3492 } 3493 #endif 3494 3495 /* This function attempts to create free entries in __kmp_threads and 3496 __kmp_root, and returns the number of free entries generated. 3497 3498 For Windows* OS static library, the first mechanism used is to reclaim array 3499 entries for root threads that are already dead. 3500 3501 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3502 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3503 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3504 threadprivate cache array has been created. 
Synchronization with 3505 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 3506 3507 After any dead root reclamation, if the clipping value allows array expansion 3508 to result in the generation of a total of nNeed free slots, the function does 3509 that expansion. If not, nothing is done beyond the possible initial root 3510 thread reclamation. 3511 3512 If any argument is negative, the behavior is undefined. */ 3513 static int __kmp_expand_threads(int nNeed) { 3514 int added = 0; 3515 int minimumRequiredCapacity; 3516 int newCapacity; 3517 kmp_info_t **newThreads; 3518 kmp_root_t **newRoot; 3519 3520 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3521 // resizing __kmp_threads does not need additional protection if foreign 3522 // threads are present 3523 3524 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3525 /* only for Windows static library */ 3526 /* reclaim array entries for root threads that are already dead */ 3527 added = __kmp_reclaim_dead_roots(); 3528 3529 if (nNeed) { 3530 nNeed -= added; 3531 if (nNeed < 0) 3532 nNeed = 0; 3533 } 3534 #endif 3535 if (nNeed <= 0) 3536 return added; 3537 3538 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3539 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3540 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3541 // > __kmp_max_nth in one of two ways: 3542 // 3543 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3544 // may not be reused by another thread, so we may need to increase 3545 // __kmp_threads_capacity to __kmp_max_nth + 1. 3546 // 3547 // 2) New foreign root(s) are encountered. We always register new foreign 3548 // roots. This may cause a smaller # of threads to be allocated at 3549 // subsequent parallel regions, but the worker threads hang around (and 3550 // eventually go to sleep) and need slots in the __kmp_threads[] array. 
3551 // 3552 // Anyway, that is the reason for moving the check to see if 3553 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3554 // instead of having it performed here. -BB 3555 3556 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3557 3558 /* compute expansion headroom to check if we can expand */ 3559 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3560 /* possible expansion too small -- give up */ 3561 return added; 3562 } 3563 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3564 3565 newCapacity = __kmp_threads_capacity; 3566 do { 3567 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? (newCapacity << 1) 3568 : __kmp_sys_max_nth; 3569 } while (newCapacity < minimumRequiredCapacity); 3570 newThreads = (kmp_info_t **)__kmp_allocate( 3571 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3572 newRoot = 3573 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3574 KMP_MEMCPY(newThreads, __kmp_threads, 3575 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3576 KMP_MEMCPY(newRoot, __kmp_root, 3577 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3578 3579 kmp_info_t **temp_threads = __kmp_threads; 3580 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3581 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3582 __kmp_free(temp_threads); 3583 added += newCapacity - __kmp_threads_capacity; 3584 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3585 3586 if (newCapacity > __kmp_tp_capacity) { 3587 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3588 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3589 __kmp_threadprivate_resize_cache(newCapacity); 3590 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3591 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3592 } 3593 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3594 } 3595 3596 return added; 3597 } 3598 3599 /* Register the current thread as a root thread and 
obtain our gtid. We must 3600 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3601 thread that calls from __kmp_do_serial_initialize() */ 3602 int __kmp_register_root(int initial_thread) { 3603 kmp_info_t *root_thread; 3604 kmp_root_t *root; 3605 int gtid; 3606 int capacity; 3607 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3608 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3609 KMP_MB(); 3610 3611 /* 2007-03-02: 3612 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3613 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3614 work as expected -- it may return false (that means there is at least one 3615 empty slot in __kmp_threads array), but it is possible the only free slot 3616 is #0, which is reserved for initial thread and so cannot be used for this 3617 one. Following code workarounds this bug. 3618 3619 However, right solution seems to be not reserving slot #0 for initial 3620 thread because: 3621 (1) there is no magic in slot #0, 3622 (2) we cannot detect initial thread reliably (the first thread which does 3623 serial initialization may be not a real initial thread). 3624 */ 3625 capacity = __kmp_threads_capacity; 3626 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3627 --capacity; 3628 } 3629 3630 /* see if there are too many threads */ 3631 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3632 if (__kmp_tp_cached) { 3633 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3634 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3635 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3636 } else { 3637 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3638 __kmp_msg_null); 3639 } 3640 } 3641 3642 // When hidden helper task is enabled, __kmp_threads is organized as follows: 3643 // 0: initial thread, also a regular OpenMP thread. 3644 // [1, __kmp_hidden_helper_threads_num]: slots for hidden helper threads. 
3645 // [__kmp_hidden_helper_threads_num + 1, __kmp_threads_capacity): slots for 3646 // regular OpenMP threads. 3647 if (TCR_4(__kmp_init_hidden_helper_threads)) { 3648 // Find an available thread slot for hidden helper thread. Slots for hidden 3649 // helper threads start from 1 to __kmp_hidden_helper_threads_num. 3650 for (gtid = 1; TCR_PTR(__kmp_threads[gtid]) != NULL && 3651 gtid <= __kmp_hidden_helper_threads_num; 3652 gtid++) 3653 ; 3654 KMP_ASSERT(gtid <= __kmp_hidden_helper_threads_num); 3655 KA_TRACE(1, ("__kmp_register_root: found slot in threads array for " 3656 "hidden helper thread: T#%d\n", 3657 gtid)); 3658 } else { 3659 /* find an available thread slot */ 3660 // Don't reassign the zero slot since we need that to only be used by 3661 // initial thread. Slots for hidden helper threads should also be skipped. 3662 if (initial_thread && __kmp_threads[0] == NULL) { 3663 gtid = 0; 3664 } else { 3665 for (gtid = __kmp_hidden_helper_threads_num + 1; 3666 TCR_PTR(__kmp_threads[gtid]) != NULL; gtid++) 3667 ; 3668 } 3669 KA_TRACE( 3670 1, ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3671 KMP_ASSERT(gtid < __kmp_threads_capacity); 3672 } 3673 3674 /* update global accounting */ 3675 __kmp_all_nth++; 3676 TCW_4(__kmp_nth, __kmp_nth + 1); 3677 3678 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3679 // numbers of procs, and method #2 (keyed API call) for higher numbers. 
3680 if (__kmp_adjust_gtid_mode) { 3681 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3682 if (TCR_4(__kmp_gtid_mode) != 2) { 3683 TCW_4(__kmp_gtid_mode, 2); 3684 } 3685 } else { 3686 if (TCR_4(__kmp_gtid_mode) != 1) { 3687 TCW_4(__kmp_gtid_mode, 1); 3688 } 3689 } 3690 } 3691 3692 #ifdef KMP_ADJUST_BLOCKTIME 3693 /* Adjust blocktime to zero if necessary */ 3694 /* Middle initialization might not have occurred yet */ 3695 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3696 if (__kmp_nth > __kmp_avail_proc) { 3697 __kmp_zero_bt = TRUE; 3698 } 3699 } 3700 #endif /* KMP_ADJUST_BLOCKTIME */ 3701 3702 /* setup this new hierarchy */ 3703 if (!(root = __kmp_root[gtid])) { 3704 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3705 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3706 } 3707 3708 #if KMP_STATS_ENABLED 3709 // Initialize stats as soon as possible (right after gtid assignment). 3710 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3711 __kmp_stats_thread_ptr->startLife(); 3712 KMP_SET_THREAD_STATE(SERIAL_REGION); 3713 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3714 #endif 3715 __kmp_initialize_root(root); 3716 3717 /* setup new root thread structure */ 3718 if (root->r.r_uber_thread) { 3719 root_thread = root->r.r_uber_thread; 3720 } else { 3721 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3722 if (__kmp_storage_map) { 3723 __kmp_print_thread_storage_map(root_thread, gtid); 3724 } 3725 root_thread->th.th_info.ds.ds_gtid = gtid; 3726 #if OMPT_SUPPORT 3727 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3728 #endif 3729 root_thread->th.th_root = root; 3730 if (__kmp_env_consistency_check) { 3731 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3732 } 3733 #if USE_FAST_MEMORY 3734 __kmp_initialize_fast_memory(root_thread); 3735 #endif /* USE_FAST_MEMORY */ 3736 3737 #if KMP_USE_BGET 3738 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3739 __kmp_initialize_bget(root_thread); 
3740 #endif 3741 __kmp_init_random(root_thread); // Initialize random number generator 3742 } 3743 3744 /* setup the serial team held in reserve by the root thread */ 3745 if (!root_thread->th.th_serial_team) { 3746 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3747 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3748 root_thread->th.th_serial_team = __kmp_allocate_team( 3749 root, 1, 1, 3750 #if OMPT_SUPPORT 3751 ompt_data_none, // root parallel id 3752 #endif 3753 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3754 } 3755 KMP_ASSERT(root_thread->th.th_serial_team); 3756 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3757 root_thread->th.th_serial_team)); 3758 3759 /* drop root_thread into place */ 3760 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3761 3762 root->r.r_root_team->t.t_threads[0] = root_thread; 3763 root->r.r_hot_team->t.t_threads[0] = root_thread; 3764 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3765 // AC: the team created in reserve, not for execution (it is unused for now). 
3766 root_thread->th.th_serial_team->t.t_serialized = 0; 3767 root->r.r_uber_thread = root_thread; 3768 3769 /* initialize the thread, get it ready to go */ 3770 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3771 TCW_4(__kmp_init_gtid, TRUE); 3772 3773 /* prepare the master thread for get_gtid() */ 3774 __kmp_gtid_set_specific(gtid); 3775 3776 #if USE_ITT_BUILD 3777 __kmp_itt_thread_name(gtid); 3778 #endif /* USE_ITT_BUILD */ 3779 3780 #ifdef KMP_TDATA_GTID 3781 __kmp_gtid = gtid; 3782 #endif 3783 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3784 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3785 3786 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3787 "plain=%u\n", 3788 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3789 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3790 KMP_INIT_BARRIER_STATE)); 3791 { // Initialize barrier data. 3792 int b; 3793 for (b = 0; b < bs_last_barrier; ++b) { 3794 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3795 #if USE_DEBUGGER 3796 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3797 #endif 3798 } 3799 } 3800 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3801 KMP_INIT_BARRIER_STATE); 3802 3803 #if KMP_AFFINITY_SUPPORTED 3804 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3805 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3806 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3807 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3808 if (TCR_4(__kmp_init_middle)) { 3809 __kmp_affinity_set_init_mask(gtid, TRUE); 3810 } 3811 #endif /* KMP_AFFINITY_SUPPORTED */ 3812 root_thread->th.th_def_allocator = __kmp_def_allocator; 3813 root_thread->th.th_prev_level = 0; 3814 root_thread->th.th_prev_num_threads = 1; 3815 3816 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3817 tmp->cg_root = root_thread; 3818 tmp->cg_thread_limit = __kmp_cg_max_nth; 3819 
tmp->cg_nthreads = 1; 3820 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3821 " cg_nthreads init to 1\n", 3822 root_thread, tmp)); 3823 tmp->up = NULL; 3824 root_thread->th.th_cg_roots = tmp; 3825 3826 __kmp_root_counter++; 3827 3828 #if OMPT_SUPPORT 3829 if (!initial_thread && ompt_enabled.enabled) { 3830 3831 kmp_info_t *root_thread = ompt_get_thread(); 3832 3833 ompt_set_thread_state(root_thread, ompt_state_overhead); 3834 3835 if (ompt_enabled.ompt_callback_thread_begin) { 3836 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3837 ompt_thread_initial, __ompt_get_thread_data_internal()); 3838 } 3839 ompt_data_t *task_data; 3840 ompt_data_t *parallel_data; 3841 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL); 3842 if (ompt_enabled.ompt_callback_implicit_task) { 3843 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3844 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3845 } 3846 3847 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3848 } 3849 #endif 3850 3851 KMP_MB(); 3852 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3853 3854 return gtid; 3855 } 3856 3857 #if KMP_NESTED_HOT_TEAMS 3858 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3859 const int max_level) { 3860 int i, n, nth; 3861 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3862 if (!hot_teams || !hot_teams[level].hot_team) { 3863 return 0; 3864 } 3865 KMP_DEBUG_ASSERT(level < max_level); 3866 kmp_team_t *team = hot_teams[level].hot_team; 3867 nth = hot_teams[level].hot_team_nth; 3868 n = nth - 1; // master is not freed 3869 if (level < max_level - 1) { 3870 for (i = 0; i < nth; ++i) { 3871 kmp_info_t *th = team->t.t_threads[i]; 3872 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3873 if (i > 0 && th->th.th_hot_teams) { 3874 __kmp_free(th->th.th_hot_teams); 3875 th->th.th_hot_teams = NULL; 3876 } 3877 } 3878 } 3879 __kmp_free_team(root, team, NULL); 
3880 return n; 3881 } 3882 #endif 3883 3884 // Resets a root thread and clear its root and hot teams. 3885 // Returns the number of __kmp_threads entries directly and indirectly freed. 3886 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3887 kmp_team_t *root_team = root->r.r_root_team; 3888 kmp_team_t *hot_team = root->r.r_hot_team; 3889 int n = hot_team->t.t_nproc; 3890 int i; 3891 3892 KMP_DEBUG_ASSERT(!root->r.r_active); 3893 3894 root->r.r_root_team = NULL; 3895 root->r.r_hot_team = NULL; 3896 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3897 // before call to __kmp_free_team(). 3898 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3899 #if KMP_NESTED_HOT_TEAMS 3900 if (__kmp_hot_teams_max_level > 3901 0) { // need to free nested hot teams and their threads if any 3902 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3903 kmp_info_t *th = hot_team->t.t_threads[i]; 3904 if (__kmp_hot_teams_max_level > 1) { 3905 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3906 } 3907 if (th->th.th_hot_teams) { 3908 __kmp_free(th->th.th_hot_teams); 3909 th->th.th_hot_teams = NULL; 3910 } 3911 } 3912 } 3913 #endif 3914 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3915 3916 // Before we can reap the thread, we need to make certain that all other 3917 // threads in the teams that had this root as ancestor have stopped trying to 3918 // steal tasks. 
3919 if (__kmp_tasking_mode != tskm_immediate_exec) { 3920 __kmp_wait_to_unref_task_teams(); 3921 } 3922 3923 #if KMP_OS_WINDOWS 3924 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3925 KA_TRACE( 3926 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3927 "\n", 3928 (LPVOID) & (root->r.r_uber_thread->th), 3929 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3930 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3931 #endif /* KMP_OS_WINDOWS */ 3932 3933 #if OMPT_SUPPORT 3934 ompt_data_t *task_data; 3935 ompt_data_t *parallel_data; 3936 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, ¶llel_data, NULL); 3937 if (ompt_enabled.ompt_callback_implicit_task) { 3938 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3939 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3940 } 3941 if (ompt_enabled.ompt_callback_thread_end) { 3942 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3943 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3944 } 3945 #endif 3946 3947 TCW_4(__kmp_nth, 3948 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3949 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3950 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3951 " to %d\n", 3952 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3953 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3954 if (i == 1) { 3955 // need to free contention group structure 3956 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3957 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3958 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3959 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3960 root->r.r_uber_thread->th.th_cg_roots = NULL; 3961 } 3962 __kmp_reap_thread(root->r.r_uber_thread, 1); 3963 3964 // We canot put root thread to __kmp_thread_pool, so we have to reap it 3965 // instead of freeing. 
3966 root->r.r_uber_thread = NULL; 3967 /* mark root as no longer in use */ 3968 root->r.r_begin = FALSE; 3969 3970 return n; 3971 } 3972 3973 void __kmp_unregister_root_current_thread(int gtid) { 3974 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3975 /* this lock should be ok, since unregister_root_current_thread is never 3976 called during an abort, only during a normal close. furthermore, if you 3977 have the forkjoin lock, you should never try to get the initz lock */ 3978 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3979 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3980 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3981 "exiting T#%d\n", 3982 gtid)); 3983 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3984 return; 3985 } 3986 kmp_root_t *root = __kmp_root[gtid]; 3987 3988 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3989 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3990 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3991 KMP_ASSERT(root->r.r_active == FALSE); 3992 3993 KMP_MB(); 3994 3995 kmp_info_t *thread = __kmp_threads[gtid]; 3996 kmp_team_t *team = thread->th.th_team; 3997 kmp_task_team_t *task_team = thread->th.th_task_team; 3998 3999 // we need to wait for the proxy tasks before finishing the thread 4000 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4001 #if OMPT_SUPPORT 4002 // the runtime is shutting down so we won't report any events 4003 thread->th.ompt_thread_info.state = ompt_state_undefined; 4004 #endif 4005 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4006 } 4007 4008 __kmp_reset_root(gtid, root); 4009 4010 KMP_MB(); 4011 KC_TRACE(10, 4012 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4013 4014 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4015 } 4016 4017 #if KMP_OS_WINDOWS 4018 /* __kmp_forkjoin_lock must be already held 4019 Unregisters a root thread that is not the current thread. 
Returns the number 4020 of __kmp_threads entries freed as a result. */ 4021 static int __kmp_unregister_root_other_thread(int gtid) { 4022 kmp_root_t *root = __kmp_root[gtid]; 4023 int r; 4024 4025 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4026 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4027 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4028 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4029 KMP_ASSERT(root->r.r_active == FALSE); 4030 4031 r = __kmp_reset_root(gtid, root); 4032 KC_TRACE(10, 4033 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4034 return r; 4035 } 4036 #endif 4037 4038 #if KMP_DEBUG 4039 void __kmp_task_info() { 4040 4041 kmp_int32 gtid = __kmp_entry_gtid(); 4042 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4043 kmp_info_t *this_thr = __kmp_threads[gtid]; 4044 kmp_team_t *steam = this_thr->th.th_serial_team; 4045 kmp_team_t *team = this_thr->th.th_team; 4046 4047 __kmp_printf( 4048 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4049 "ptask=%p\n", 4050 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4051 team->t.t_implicit_task_taskdata[tid].td_parent); 4052 } 4053 #endif // KMP_DEBUG 4054 4055 /* TODO optimize with one big memclr, take out what isn't needed, split 4056 responsibility to workers as much as possible, and delay initialization of 4057 features as much as possible */ 4058 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4059 int tid, int gtid) { 4060 /* this_thr->th.th_info.ds.ds_gtid is setup in 4061 kmp_allocate_thread/create_worker. 
4062 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4063 kmp_info_t *master = team->t.t_threads[0]; 4064 KMP_DEBUG_ASSERT(this_thr != NULL); 4065 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4066 KMP_DEBUG_ASSERT(team); 4067 KMP_DEBUG_ASSERT(team->t.t_threads); 4068 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4069 KMP_DEBUG_ASSERT(master); 4070 KMP_DEBUG_ASSERT(master->th.th_root); 4071 4072 KMP_MB(); 4073 4074 TCW_SYNC_PTR(this_thr->th.th_team, team); 4075 4076 this_thr->th.th_info.ds.ds_tid = tid; 4077 this_thr->th.th_set_nproc = 0; 4078 if (__kmp_tasking_mode != tskm_immediate_exec) 4079 // When tasking is possible, threads are not safe to reap until they are 4080 // done tasking; this will be set when tasking code is exited in wait 4081 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4082 else // no tasking --> always safe to reap 4083 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4084 this_thr->th.th_set_proc_bind = proc_bind_default; 4085 #if KMP_AFFINITY_SUPPORTED 4086 this_thr->th.th_new_place = this_thr->th.th_current_place; 4087 #endif 4088 this_thr->th.th_root = master->th.th_root; 4089 4090 /* setup the thread's cache of the team structure */ 4091 this_thr->th.th_team_nproc = team->t.t_nproc; 4092 this_thr->th.th_team_master = master; 4093 this_thr->th.th_team_serialized = team->t.t_serialized; 4094 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4095 4096 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4097 4098 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4099 tid, gtid, this_thr, this_thr->th.th_current_task)); 4100 4101 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4102 team, tid, TRUE); 4103 4104 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4105 tid, gtid, this_thr, this_thr->th.th_current_task)); 4106 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4107 // __kmp_initialize_team()? 
4108 4109 /* TODO no worksharing in speculative threads */ 4110 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4111 4112 this_thr->th.th_local.this_construct = 0; 4113 4114 if (!this_thr->th.th_pri_common) { 4115 this_thr->th.th_pri_common = 4116 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4117 if (__kmp_storage_map) { 4118 __kmp_print_storage_map_gtid( 4119 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4120 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4121 } 4122 this_thr->th.th_pri_head = NULL; 4123 } 4124 4125 if (this_thr != master && // Master's CG root is initialized elsewhere 4126 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4127 // Make new thread's CG root same as master's 4128 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4129 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4130 if (tmp) { 4131 // worker changes CG, need to check if old CG should be freed 4132 int i = tmp->cg_nthreads--; 4133 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4134 " on node %p of thread %p to %d\n", 4135 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4136 if (i == 1) { 4137 __kmp_free(tmp); // last thread left CG --> free it 4138 } 4139 } 4140 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4141 // Increment new thread's CG root's counter to add the new thread 4142 this_thr->th.th_cg_roots->cg_nthreads++; 4143 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4144 " node %p of thread %p to %d\n", 4145 this_thr, this_thr->th.th_cg_roots, 4146 this_thr->th.th_cg_roots->cg_root, 4147 this_thr->th.th_cg_roots->cg_nthreads)); 4148 this_thr->th.th_current_task->td_icvs.thread_limit = 4149 this_thr->th.th_cg_roots->cg_thread_limit; 4150 } 4151 4152 /* Initialize dynamic dispatch */ 4153 { 4154 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4155 // Use team max_nproc since this will never change for the team. 
4156 size_t disp_size = 4157 sizeof(dispatch_private_info_t) * 4158 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4159 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4160 team->t.t_max_nproc)); 4161 KMP_ASSERT(dispatch); 4162 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4163 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4164 4165 dispatch->th_disp_index = 0; 4166 dispatch->th_doacross_buf_idx = 0; 4167 if (!dispatch->th_disp_buffer) { 4168 dispatch->th_disp_buffer = 4169 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4170 4171 if (__kmp_storage_map) { 4172 __kmp_print_storage_map_gtid( 4173 gtid, &dispatch->th_disp_buffer[0], 4174 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4175 ? 1 4176 : __kmp_dispatch_num_buffers], 4177 disp_size, "th_%d.th_dispatch.th_disp_buffer " 4178 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4179 gtid, team->t.t_id, gtid); 4180 } 4181 } else { 4182 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4183 } 4184 4185 dispatch->th_dispatch_pr_current = 0; 4186 dispatch->th_dispatch_sh_current = 0; 4187 4188 dispatch->th_deo_fcn = 0; /* ORDERED */ 4189 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4190 } 4191 4192 this_thr->th.th_next_pool = NULL; 4193 4194 if (!this_thr->th.th_task_state_memo_stack) { 4195 size_t i; 4196 this_thr->th.th_task_state_memo_stack = 4197 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4198 this_thr->th.th_task_state_top = 0; 4199 this_thr->th.th_task_state_stack_sz = 4; 4200 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4201 ++i) // zero init the stack 4202 this_thr->th.th_task_state_memo_stack[i] = 0; 4203 } 4204 4205 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4206 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4207 4208 KMP_MB(); 4209 } 4210 4211 /* allocate a new thread for the requesting team. this is only called from 4212 within a forkjoin critical section. we will first try to get an available 4213 thread from the thread pool. 
if none is available, we will fork a new one
   assuming we are able to create a new one. this should be assured, as the
   caller should check on this first.

   root    - root the new thread's reserve serial team is allocated under.
   team    - team the thread is being allocated for.
   new_tid - index the thread will have inside `team`.
   Returns the kmp_info_t of the thread, either recycled from
   __kmp_thread_pool or freshly allocated and started with
   __kmp_create_worker(). */
kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team,
                                  int new_tid) {
  kmp_team_t *serial_team;
  kmp_info_t *new_thr;
  int new_gtid;

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid()));
  KMP_DEBUG_ASSERT(root && team);
#if !KMP_NESTED_HOT_TEAMS
  KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid()));
#endif
  KMP_MB();

  /* first, try to get one from the thread pool */
  if (__kmp_thread_pool) {
    // Pop the head of the pool list.
    new_thr = CCAST(kmp_info_t *, __kmp_thread_pool);
    __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool;
    // The cached insertion point must not keep referring to a thread that has
    // just left the pool.
    if (new_thr == __kmp_thread_pool_insert_pt) {
      __kmp_thread_pool_insert_pt = NULL;
    }
    TCW_4(new_thr->th.th_in_pool, FALSE);
    __kmp_suspend_initialize_thread(new_thr);
    // The active-in-pool flag and the global active counter are updated
    // together while holding the thread's suspend mutex.
    __kmp_lock_suspend_mx(new_thr);
    if (new_thr->th.th_active_in_pool == TRUE) {
      KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE);
      KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth);
      new_thr->th.th_active_in_pool = FALSE;
    }
    __kmp_unlock_suspend_mx(new_thr);

    KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n",
                  __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid));
    KMP_ASSERT(!new_thr->th.th_team);
    KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity);

    /* setup the thread structure */
    __kmp_initialize_info(new_thr, team, new_tid,
                          new_thr->th.th_info.ds.ds_gtid);
    KMP_DEBUG_ASSERT(new_thr->th.th_serial_team);

    // Only __kmp_nth changes here; __kmp_all_nth is not touched on this path.
    TCW_4(__kmp_nth, __kmp_nth + 1);

    new_thr->th.th_task_state = 0;
    new_thr->th.th_task_state_top = 0;
    new_thr->th.th_task_state_stack_sz = 4;

#ifdef KMP_ADJUST_BLOCKTIME
    /* Adjust blocktime back to zero if necessary */
    /* Middle initialization might not have occurred yet */
    if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
      if (__kmp_nth > __kmp_avail_proc) {
        __kmp_zero_bt = TRUE;
      }
    }
#endif /* KMP_ADJUST_BLOCKTIME */

#if KMP_DEBUG
    // If thread entered pool via __kmp_free_thread, wait_flag should !=
    // KMP_BARRIER_PARENT_FLAG.
    int b;
    kmp_balign_t *balign = new_thr->th.th_bar;
    for (b = 0; b < bs_last_barrier; ++b)
      KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#endif

    KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n",
                  __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid));

    KMP_MB();
    return new_thr;
  }

  /* no, well fork a new one */
  // With an empty pool every existing thread is expected to be in use, and the
  // caller is expected to have made room in __kmp_threads already.
  KMP_ASSERT(__kmp_nth == __kmp_all_nth);
  KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity);

#if KMP_USE_MONITOR
  // If this is the first worker thread the RTL is creating, then also
  // launch the monitor thread.  We try to do this as early as possible.
  // __kmp_init_monitor is tested once without the lock and again after taking
  // __kmp_monitor_lock before the monitor is created.
  if (!TCR_4(__kmp_init_monitor)) {
    __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock);
    if (!TCR_4(__kmp_init_monitor)) {
      KF_TRACE(10, ("before __kmp_create_monitor\n"));
      TCW_4(__kmp_init_monitor, 1);
      __kmp_create_monitor(&__kmp_monitor);
      KF_TRACE(10, ("after __kmp_create_monitor\n"));
#if KMP_OS_WINDOWS
      // AC: wait until monitor has started. This is a fix for CQ232808.
      // The reason is that if the library is loaded/unloaded in a loop with
      // small (parallel) work in between, then there is high probability that
      // monitor thread started after the library shutdown. At shutdown it is
      // too late to cope with the problem, because when the master is in
      // DllMain (process detach) the monitor has no chances to start (it is
      // blocked), and master has no means to inform the monitor that the
      // library has gone, because all the memory which the monitor can access
      // is going to be released/reset.
      while (TCR_4(__kmp_init_monitor) < 2) {
        KMP_YIELD(TRUE);
      }
      KF_TRACE(10, ("after monitor thread has started\n"));
#endif
    }
    __kmp_release_bootstrap_lock(&__kmp_monitor_lock);
  }
#endif

  KMP_MB();

  {
    // While hidden helper threads are being initialized the search starts at
    // slot 1; otherwise it starts just past the hidden helper slots.
    int new_start_gtid = TCR_4(__kmp_init_hidden_helper_threads)
                             ? 1
                             : __kmp_hidden_helper_threads_num + 1;

    // Linear scan for the first empty slot. The bound is only checked by a
    // debug assertion; the capacity assertion above is what the scan relies on.
    for (new_gtid = new_start_gtid; TCR_PTR(__kmp_threads[new_gtid]) != NULL;
         ++new_gtid) {
      KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity);
    }

    if (TCR_4(__kmp_init_hidden_helper_threads)) {
      KMP_DEBUG_ASSERT(new_gtid <= __kmp_hidden_helper_threads_num);
    }
  }

  /* allocate space for it. */
  new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t));

  // Publish the new descriptor in the global threads array.
  TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr);

#if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG
  // suppress race conditions detection on synchronization flags in debug mode
  // this helps to analyze library internals eliminating false positives
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc));
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state));
#if KMP_OS_WINDOWS
  __itt_suppress_mark_range(
      __itt_suppress_range, __itt_suppress_threading_errors,
      &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init));
#else
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            &new_thr->th.th_suspend_init_count,
                            sizeof(new_thr->th.th_suspend_init_count));
#endif
  // TODO: check if we need to also suppress b_arrived flags
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go),
                            sizeof(new_thr->th.th_bar[0].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go),
                            sizeof(new_thr->th.th_bar[1].bb.b_go));
  __itt_suppress_mark_range(__itt_suppress_range,
                            __itt_suppress_threading_errors,
                            CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go),
                            sizeof(new_thr->th.th_bar[2].bb.b_go));
#endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */
  if (__kmp_storage_map) {
    __kmp_print_thread_storage_map(new_thr, new_gtid);
  }

  // add the reserve serialized team, initialized from the team's master thread
  {
    kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team);
    KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n"));
    new_thr->th.th_serial_team = serial_team =
        (kmp_team_t *)__kmp_allocate_team(root, 1, 1,
#if OMPT_SUPPORT
                                          ompt_data_none, // root parallel id
#endif
                                          proc_bind_default, &r_icvs,
                                          0 USE_NESTED_HOT_ARG(NULL));
  }
  KMP_ASSERT(serial_team);
  serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for
  // execution (it is unused for now).
  serial_team->t.t_threads[0] = new_thr;
  KF_TRACE(10,
           ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n",
            new_thr));

  /* setup the thread structures */
  __kmp_initialize_info(new_thr, team, new_tid, new_gtid);

#if USE_FAST_MEMORY
  __kmp_initialize_fast_memory(new_thr);
#endif /* USE_FAST_MEMORY */

#if KMP_USE_BGET
  KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL);
  __kmp_initialize_bget(new_thr);
#endif

  __kmp_init_random(new_thr); // Initialize random number generator

  /* Initialize these only once when thread is grabbed for a team allocation */
  KA_TRACE(20,
           ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n",
            __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE));

  // Reset the per-barrier state of the new thread for every barrier type.
  int b;
  kmp_balign_t *balign = new_thr->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    balign[b].bb.b_go = KMP_INIT_BARRIER_STATE;
    balign[b].bb.team = NULL;
    balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING;
    balign[b].bb.use_oncore_barrier = 0;
  }

  new_thr->th.th_spin_here = FALSE;
  new_thr->th.th_next_waiting = 0;
#if KMP_OS_UNIX
  new_thr->th.th_blocking = false;
#endif

#if KMP_AFFINITY_SUPPORTED
  new_thr->th.th_current_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_new_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_first_place = KMP_PLACE_UNDEFINED;
  new_thr->th.th_last_place = KMP_PLACE_UNDEFINED;
#endif
  new_thr->th.th_def_allocator = __kmp_def_allocator;
  new_thr->th.th_prev_level = 0;
  new_thr->th.th_prev_num_threads = 1;

  TCW_4(new_thr->th.th_in_pool, FALSE);
  new_thr->th.th_active_in_pool = FALSE;
  TCW_4(new_thr->th.th_active, TRUE);

  /* adjust the global counters */
  __kmp_all_nth++;
  __kmp_nth++;

  // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low
  // numbers of procs, and method #2 (keyed API call) for higher numbers.
  if (__kmp_adjust_gtid_mode) {
    if (__kmp_all_nth >= __kmp_tls_gtid_min) {
      if (TCR_4(__kmp_gtid_mode) != 2) {
        TCW_4(__kmp_gtid_mode, 2);
      }
    } else {
      if (TCR_4(__kmp_gtid_mode) != 1) {
        TCW_4(__kmp_gtid_mode, 1);
      }
    }
  }

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to zero if necessary */
  /* Middle initialization might not have occurred yet */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* actually fork it and create the new worker thread */
  KF_TRACE(
      10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr));
  __kmp_create_worker(new_gtid, new_thr, __kmp_stksize);
  KF_TRACE(10,
           ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr));

  KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(),
                new_gtid));
  KMP_MB();
  return new_thr;
}

/* Reinitialize team for reuse.
   The hot team code calls this case at every fork barrier, so EPCC barrier
   test are extremely sensitive to changes in it, esp. writes to the team
   struct, which cause a cache invalidation in all threads.
   IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!!
*/ 4495 static void __kmp_reinitialize_team(kmp_team_t *team, 4496 kmp_internal_control_t *new_icvs, 4497 ident_t *loc) { 4498 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4499 team->t.t_threads[0], team)); 4500 KMP_DEBUG_ASSERT(team && new_icvs); 4501 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4502 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4503 4504 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4505 // Copy ICVs to the master thread's implicit taskdata 4506 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4507 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4508 4509 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4510 team->t.t_threads[0], team)); 4511 } 4512 4513 /* Initialize the team data structure. 4514 This assumes the t_threads and t_max_nproc are already set. 4515 Also, we don't touch the arguments */ 4516 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4517 kmp_internal_control_t *new_icvs, 4518 ident_t *loc) { 4519 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4520 4521 /* verify */ 4522 KMP_DEBUG_ASSERT(team); 4523 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4524 KMP_DEBUG_ASSERT(team->t.t_threads); 4525 KMP_MB(); 4526 4527 team->t.t_master_tid = 0; /* not needed */ 4528 /* team->t.t_master_bar; not needed */ 4529 team->t.t_serialized = new_nproc > 1 ? 
0 : 1; 4530 team->t.t_nproc = new_nproc; 4531 4532 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4533 team->t.t_next_pool = NULL; 4534 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4535 * up hot team */ 4536 4537 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4538 team->t.t_invoke = NULL; /* not needed */ 4539 4540 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4541 team->t.t_sched.sched = new_icvs->sched.sched; 4542 4543 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4544 team->t.t_fp_control_saved = FALSE; /* not needed */ 4545 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4546 team->t.t_mxcsr = 0; /* not needed */ 4547 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4548 4549 team->t.t_construct = 0; 4550 4551 team->t.t_ordered.dt.t_value = 0; 4552 team->t.t_master_active = FALSE; 4553 4554 #ifdef KMP_DEBUG 4555 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4556 #endif 4557 #if KMP_OS_WINDOWS 4558 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4559 #endif 4560 4561 team->t.t_control_stack_top = NULL; 4562 4563 __kmp_reinitialize_team(team, new_icvs, loc); 4564 4565 KMP_MB(); 4566 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4567 } 4568 4569 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4570 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/
// old_mask: optional out-parameter. When non-NULL (and affinity is capable)
// the calling thread's current system affinity is stored into it before the
// full mask is applied; a failure to read it is fatal, reported with the
// errno captured right after the call. When affinity is not capable the
// function does nothing. The caller keeps ownership of old_mask.
static void
__kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) {
  if (KMP_AFFINITY_CAPABLE()) {
    int status;
    if (old_mask != NULL) {
      status = __kmp_get_system_affinity(old_mask, TRUE);
      // Capture errno immediately, before any other call can overwrite it.
      int error = errno;
      if (status != 0) {
        __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error),
                    __kmp_msg_null);
      }
    }
    __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE);
  }
}
#endif

#if KMP_AFFINITY_SUPPORTED

// __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism.
// It calculates the worker + master thread's partition based upon the parent
// thread's partition, and binds each worker to a thread in their partition.
// The master thread's partition should already include its current binding.
//
// team               - team whose threads get th_first_place / th_last_place /
//                      th_new_place assigned; t_threads[0] is the master.
// update_master_only - only consulted in the proc_bind_spread case: when it
//                      is 1 the loops stop after thread 0.
//
// This routine only records the desired places in the thread structures; no
// system affinity call is made here. Whenever a thread's newly assigned place
// differs from its th_current_place and __kmp_display_affinity is set,
// team->t.t_display_affinity is raised to 1.
//
// Throughout, "advance place" means: step to the next place of the partition,
// wrapping from last_place back to first_place, and from the highest place
// index (__kmp_affinity_num_masks - 1) back to 0.
static void __kmp_partition_places(kmp_team_t *team, int update_master_only) {
  // Copy the master thread's place partition to the team struct
  kmp_info_t *master_th = team->t.t_threads[0];
  KMP_DEBUG_ASSERT(master_th != NULL);
  kmp_proc_bind_t proc_bind = team->t.t_proc_bind;
  int first_place = master_th->th.th_first_place;
  int last_place = master_th->th.th_last_place;
  int masters_place = master_th->th.th_current_place;
  team->t.t_first_place = first_place;
  team->t.t_last_place = last_place;

  KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) "
                "bound to place %d partition = [%d,%d]\n",
                proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]),
                team->t.t_id, masters_place, first_place, last_place));

  switch (proc_bind) {

  case proc_bind_default:
    // serial teams might have the proc_bind policy set to proc_bind_default. It
    // doesn't matter, as we don't rebind master thread for any proc_bind policy
    KMP_DEBUG_ASSERT(team->t.t_nproc == 1);
    break;

  case proc_bind_master: {
    // Every worker inherits the master's partition and is placed on the
    // master's current place.
    int f;
    int n_th = team->t.t_nproc;
    for (f = 1; f < n_th; f++) {
      kmp_info_t *th = team->t.t_threads[f];
      KMP_DEBUG_ASSERT(th != NULL);
      th->th.th_first_place = first_place;
      th->th.th_last_place = last_place;
      th->th.th_new_place = masters_place;
      if (__kmp_display_affinity && masters_place != th->th.th_current_place &&
          team->t.t_display_affinity != 1) {
        team->t.t_display_affinity = 1;
      }

      KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d "
                     "partition = [%d,%d]\n",
                     __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                     f, masters_place, first_place, last_place));
    }
  } break;

  case proc_bind_close: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    // Size of the partition; the second form handles a partition that wraps
    // around the end of the place list (first_place > last_place).
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      // Enough places for one thread each: workers take the consecutive
      // places following the master's; all keep the full partition.
      int place = masters_place;
      for (f = 1; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        // advance place
        if (place == last_place) {
          place = first_place;
        } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
          place = 0;
        } else {
          place++;
        }
        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }

        KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, place, first_place, last_place));
      }
    } else {
      // More threads than places: S threads per place, with the rem leftover
      // threads handed out one extra per place, every gap-th place.
      // s_count counts threads put on the current place, gap_ct counts places
      // since the last one that received an extra thread.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      // Starts at f = 0: the master's fields are (re)assigned too.
      for (f = 0; f < n_th; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = first_place;
        th->th.th_last_place = last_place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place full; don't add extra
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100,
                 ("__kmp_partition_places: close: T#%d(%d:%d) place %d "
                  "partition = [%d,%d]\n",
                  __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f,
                  th->th.th_new_place, first_place, last_place));
      }
      // The walk is expected to end back on the master's place.
      KMP_DEBUG_ASSERT(place == masters_place);
    }
  } break;

  case proc_bind_spread: {
    int f;
    int n_th = team->t.t_nproc;
    int n_places;
    int thidx; // loop bound; reduced to 1 when update_master_only == 1
    // Partition size, allowing for wrap-around as in the close case.
    if (first_place <= last_place) {
      n_places = last_place - first_place + 1;
    } else {
      n_places = __kmp_affinity_num_masks - first_place + last_place + 1;
    }
    if (n_th <= n_places) {
      int place = -1;

      if (n_places != static_cast<int>(__kmp_affinity_num_masks)) {
        // The partition is a proper subset of all places: carve it into n_th
        // consecutive sub-partitions of S places each, giving one extra place
        // to rem of them (one every gap threads), starting at the master's
        // place. Each thread is bound to the first place of its
        // sub-partition.
        int S = n_places / n_th;
        int s_count, rem, gap, gap_ct;

        place = masters_place;
        rem = n_places - n_th * S;
        gap = rem ? n_th / rem : 1;
        gap_ct = gap;
        thidx = n_th;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          kmp_info_t *th = team->t.t_threads[f];
          KMP_DEBUG_ASSERT(th != NULL);

          th->th.th_first_place = place;
          th->th.th_new_place = place;
          if (__kmp_display_affinity && place != th->th.th_current_place &&
              team->t.t_display_affinity != 1) {
            team->t.t_display_affinity = 1;
          }
          // Walk S - 1 places forward to reach the end of this sub-partition.
          s_count = 1;
          while (s_count < S) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            s_count++;
          }
          // Hand out one of the leftover places to this sub-partition.
          if (rem && (gap_ct == gap)) {
            if (place == last_place) {
              place = first_place;
            } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
              place = 0;
            } else {
              place++;
            }
            rem--;
            gap_ct = 0;
          }
          th->th.th_last_place = place;
          gap_ct++;

          // Step to the first place of the next thread's sub-partition.
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }

          KA_TRACE(100,
                   ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                    "partition = [%d,%d], __kmp_affinity_num_masks: %u\n",
                    __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id,
                    f, th->th.th_new_place, th->th.th_first_place,
                    th->th.th_last_place, __kmp_affinity_num_masks));
        }
      } else {
        /* Having uniform space of available computation places I can create
           T partitions of round(P/T) size and put threads into the first
           place of each partition. */
        // Positions are tracked in floating point: current is the running
        // start of the next sub-partition and spacing its nominal width.
        double current = static_cast<double>(masters_place);
        double spacing =
            (static_cast<double>(n_places + 1) / static_cast<double>(n_th));
        int first, last;
        kmp_info_t *th;

        // One iteration more than there are threads: threads are only
        // written while f < n_th; the extra pass just recomputes place so the
        // closing assertion after this branch can check it.
        thidx = n_th + 1;
        if (update_master_only == 1)
          thidx = 1;
        for (f = 0; f < thidx; f++) {
          first = static_cast<int>(current);
          last = static_cast<int>(current + spacing) - 1;
          KMP_DEBUG_ASSERT(last >= first);
          // Ran past the end of the place list: wrap around, adjusting so
          // the range does not run onto the master's place.
          if (first >= n_places) {
            if (masters_place) {
              first -= n_places;
              last -= n_places;
              if (first == (masters_place + 1)) {
                KMP_DEBUG_ASSERT(f == n_th);
                first--;
              }
              if (last == masters_place) {
                KMP_DEBUG_ASSERT(f == (n_th - 1));
                last--;
              }
            } else {
              KMP_DEBUG_ASSERT(f == n_th);
              first = 0;
              last = 0;
            }
          }
          // Clamp the end of the sub-partition to the last valid place.
          if (last >= n_places) {
            last = (n_places - 1);
          }
          place = first;
          current += spacing;
          if (f < n_th) {
            KMP_DEBUG_ASSERT(0 <= first);
            KMP_DEBUG_ASSERT(n_places > first);
            KMP_DEBUG_ASSERT(0 <= last);
            KMP_DEBUG_ASSERT(n_places > last);
            KMP_DEBUG_ASSERT(last_place >= first_place);
            th = team->t.t_threads[f];
            KMP_DEBUG_ASSERT(th);
            th->th.th_first_place = first;
            th->th.th_new_place = place;
            th->th.th_last_place = last;
            if (__kmp_display_affinity && place != th->th.th_current_place &&
                team->t.t_display_affinity != 1) {
              team->t.t_display_affinity = 1;
            }
            KA_TRACE(100,
                     ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                      "partition = [%d,%d], spacing = %.4f\n",
                      __kmp_gtid_from_thread(team->t.t_threads[f]),
                      team->t.t_id, f, th->th.th_new_place,
                      th->th.th_first_place, th->th.th_last_place, spacing));
          }
        }
      }
      // After a full pass the walk is expected to end on the master's place.
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    } else {
      // More threads than places: same S / rem / gap distribution as in the
      // close case, except that each thread's partition is narrowed to the
      // single place it is assigned to.
      int S, rem, gap, s_count;
      S = n_th / n_places;
      s_count = 0;
      rem = n_th - (S * n_places);
      gap = rem > 0 ? n_places / rem : n_places;
      int place = masters_place;
      int gap_ct = gap;
      thidx = n_th;
      if (update_master_only == 1)
        thidx = 1;
      for (f = 0; f < thidx; f++) {
        kmp_info_t *th = team->t.t_threads[f];
        KMP_DEBUG_ASSERT(th != NULL);

        th->th.th_first_place = place;
        th->th.th_last_place = place;
        th->th.th_new_place = place;
        if (__kmp_display_affinity && place != th->th.th_current_place &&
            team->t.t_display_affinity != 1) {
          team->t.t_display_affinity = 1;
        }
        s_count++;

        if ((s_count == S) && rem && (gap_ct == gap)) {
          // do nothing, add an extra thread to place on next iteration
        } else if ((s_count == S + 1) && rem && (gap_ct == gap)) {
          // we added an extra thread to this place; move on to next place
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          s_count = 0;
          gap_ct = 1;
          rem--;
        } else if (s_count == S) { // place is full; don't add extra thread
          if (place == last_place) {
            place = first_place;
          } else if (place == (int)(__kmp_affinity_num_masks - 1)) {
            place = 0;
          } else {
            place++;
          }
          gap_ct++;
          s_count = 0;
        }

        KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d "
                       "partition = [%d,%d]\n",
                       __kmp_gtid_from_thread(team->t.t_threads[f]),
                       team->t.t_id, f, th->th.th_new_place,
                       th->th.th_first_place, th->th.th_last_place));
      }
      KMP_DEBUG_ASSERT(update_master_only || place == masters_place);
    }
  } break;

  default:
    // Any other proc_bind value: nothing to assign.
    break;
  }

  KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id));
}

#endif // KMP_AFFINITY_SUPPORTED

/* allocate a new team data structure to use.
take one off of the free pool if 4940 available */ 4941 kmp_team_t * 4942 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4943 #if OMPT_SUPPORT 4944 ompt_data_t ompt_parallel_data, 4945 #endif 4946 kmp_proc_bind_t new_proc_bind, 4947 kmp_internal_control_t *new_icvs, 4948 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4949 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4950 int f; 4951 kmp_team_t *team; 4952 int use_hot_team = !root->r.r_active; 4953 int level = 0; 4954 4955 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4956 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4957 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4958 KMP_MB(); 4959 4960 #if KMP_NESTED_HOT_TEAMS 4961 kmp_hot_team_ptr_t *hot_teams; 4962 if (master) { 4963 team = master->th.th_team; 4964 level = team->t.t_active_level; 4965 if (master->th.th_teams_microtask) { // in teams construct? 4966 if (master->th.th_teams_size.nteams > 1 && 4967 ( // #teams > 1 4968 team->t.t_pkfn == 4969 (microtask_t)__kmp_teams_master || // inner fork of the teams 4970 master->th.th_teams_level < 4971 team->t.t_level)) { // or nested parallel inside the teams 4972 ++level; // not increment if #teams==1, or for outer fork of the teams; 4973 // increment otherwise 4974 } 4975 } 4976 hot_teams = master->th.th_hot_teams; 4977 if (level < __kmp_hot_teams_max_level && hot_teams && 4978 hot_teams[level].hot_team) { 4979 // hot team has already been allocated for given level 4980 use_hot_team = 1; 4981 } else { 4982 use_hot_team = 0; 4983 } 4984 } else { 4985 // check we won't access uninitialized hot_teams, just in case 4986 KMP_DEBUG_ASSERT(new_nproc == 1); 4987 } 4988 #endif 4989 // Optimization to use a "hot" team 4990 if (use_hot_team && new_nproc > 1) { 4991 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4992 #if KMP_NESTED_HOT_TEAMS 4993 team = hot_teams[level].hot_team; 4994 #else 4995 team = root->r.r_hot_team; 4996 #endif 4997 #if KMP_DEBUG 4998 if (__kmp_tasking_mode != 
tskm_immediate_exec) { 4999 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5000 "task_team[1] = %p before reinit\n", 5001 team->t.t_task_team[0], team->t.t_task_team[1])); 5002 } 5003 #endif 5004 5005 // Has the number of threads changed? 5006 /* Let's assume the most common case is that the number of threads is 5007 unchanged, and put that case first. */ 5008 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 5009 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 5010 // This case can mean that omp_set_num_threads() was called and the hot 5011 // team size was already reduced, so we check the special flag 5012 if (team->t.t_size_changed == -1) { 5013 team->t.t_size_changed = 1; 5014 } else { 5015 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 5016 } 5017 5018 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5019 kmp_r_sched_t new_sched = new_icvs->sched; 5020 // set master's schedule as new run-time schedule 5021 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 5022 5023 __kmp_reinitialize_team(team, new_icvs, 5024 root->r.r_uber_thread->th.th_ident); 5025 5026 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5027 team->t.t_threads[0], team)); 5028 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5029 5030 #if KMP_AFFINITY_SUPPORTED 5031 if ((team->t.t_size_changed == 0) && 5032 (team->t.t_proc_bind == new_proc_bind)) { 5033 if (new_proc_bind == proc_bind_spread) { 5034 __kmp_partition_places( 5035 team, 1); // add flag to update only master for spread 5036 } 5037 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5038 "proc_bind = %d, partition = [%d,%d]\n", 5039 team->t.t_id, new_proc_bind, team->t.t_first_place, 5040 team->t.t_last_place)); 5041 } else { 5042 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5043 __kmp_partition_places(team); 5044 } 5045 #else 5046 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5047 
#endif /* KMP_AFFINITY_SUPPORTED */ 5048 } else if (team->t.t_nproc > new_nproc) { 5049 KA_TRACE(20, 5050 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5051 new_nproc)); 5052 5053 team->t.t_size_changed = 1; 5054 #if KMP_NESTED_HOT_TEAMS 5055 if (__kmp_hot_teams_mode == 0) { 5056 // AC: saved number of threads should correspond to team's value in this 5057 // mode, can be bigger in mode 1, when hot team has threads in reserve 5058 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5059 hot_teams[level].hot_team_nth = new_nproc; 5060 #endif // KMP_NESTED_HOT_TEAMS 5061 /* release the extra threads we don't need any more */ 5062 for (f = new_nproc; f < team->t.t_nproc; f++) { 5063 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5064 if (__kmp_tasking_mode != tskm_immediate_exec) { 5065 // When decreasing team size, threads no longer in the team should 5066 // unref task team. 5067 team->t.t_threads[f]->th.th_task_team = NULL; 5068 } 5069 __kmp_free_thread(team->t.t_threads[f]); 5070 team->t.t_threads[f] = NULL; 5071 } 5072 #if KMP_NESTED_HOT_TEAMS 5073 } // (__kmp_hot_teams_mode == 0) 5074 else { 5075 // When keeping extra threads in team, switch threads to wait on own 5076 // b_go flag 5077 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5078 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5079 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5080 for (int b = 0; b < bs_last_barrier; ++b) { 5081 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5082 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5083 } 5084 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5085 } 5086 } 5087 } 5088 #endif // KMP_NESTED_HOT_TEAMS 5089 team->t.t_nproc = new_nproc; 5090 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5091 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5092 __kmp_reinitialize_team(team, new_icvs, 5093 root->r.r_uber_thread->th.th_ident); 5094 5095 // Update remaining threads 5096 for (f = 0; f < 
new_nproc; ++f) { 5097 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5098 } 5099 5100 // restore the current task state of the master thread: should be the 5101 // implicit task 5102 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5103 team->t.t_threads[0], team)); 5104 5105 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5106 5107 #ifdef KMP_DEBUG 5108 for (f = 0; f < team->t.t_nproc; f++) { 5109 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5110 team->t.t_threads[f]->th.th_team_nproc == 5111 team->t.t_nproc); 5112 } 5113 #endif 5114 5115 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5116 #if KMP_AFFINITY_SUPPORTED 5117 __kmp_partition_places(team); 5118 #endif 5119 } else { // team->t.t_nproc < new_nproc 5120 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5121 kmp_affin_mask_t *old_mask; 5122 if (KMP_AFFINITY_CAPABLE()) { 5123 KMP_CPU_ALLOC(old_mask); 5124 } 5125 #endif 5126 5127 KA_TRACE(20, 5128 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5129 new_nproc)); 5130 5131 team->t.t_size_changed = 1; 5132 5133 #if KMP_NESTED_HOT_TEAMS 5134 int avail_threads = hot_teams[level].hot_team_nth; 5135 if (new_nproc < avail_threads) 5136 avail_threads = new_nproc; 5137 kmp_info_t **other_threads = team->t.t_threads; 5138 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5139 // Adjust barrier data of reserved threads (if any) of the team 5140 // Other data will be set in __kmp_initialize_info() below. 
5141 int b; 5142 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5143 for (b = 0; b < bs_last_barrier; ++b) { 5144 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5145 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5146 #if USE_DEBUGGER 5147 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5148 #endif 5149 } 5150 } 5151 if (hot_teams[level].hot_team_nth >= new_nproc) { 5152 // we have all needed threads in reserve, no need to allocate any 5153 // this only possible in mode 1, cannot have reserved threads in mode 0 5154 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5155 team->t.t_nproc = new_nproc; // just get reserved threads involved 5156 } else { 5157 // we may have some threads in reserve, but not enough 5158 team->t.t_nproc = 5159 hot_teams[level] 5160 .hot_team_nth; // get reserved threads involved if any 5161 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5162 #endif // KMP_NESTED_HOT_TEAMS 5163 if (team->t.t_max_nproc < new_nproc) { 5164 /* reallocate larger arrays */ 5165 __kmp_reallocate_team_arrays(team, new_nproc); 5166 __kmp_reinitialize_team(team, new_icvs, NULL); 5167 } 5168 5169 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5170 /* Temporarily set full mask for master thread before creation of 5171 workers. The reason is that workers inherit the affinity from master, 5172 so if a lot of workers are created on the single core quickly, they 5173 don't get a chance to set their own affinity for a long time. 
*/ 5174 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5175 #endif 5176 5177 /* allocate new threads for the hot team */ 5178 for (f = team->t.t_nproc; f < new_nproc; f++) { 5179 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5180 KMP_DEBUG_ASSERT(new_worker); 5181 team->t.t_threads[f] = new_worker; 5182 5183 KA_TRACE(20, 5184 ("__kmp_allocate_team: team %d init T#%d arrived: " 5185 "join=%llu, plain=%llu\n", 5186 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5187 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5188 team->t.t_bar[bs_plain_barrier].b_arrived)); 5189 5190 { // Initialize barrier data for new threads. 5191 int b; 5192 kmp_balign_t *balign = new_worker->th.th_bar; 5193 for (b = 0; b < bs_last_barrier; ++b) { 5194 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5195 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5196 KMP_BARRIER_PARENT_FLAG); 5197 #if USE_DEBUGGER 5198 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5199 #endif 5200 } 5201 } 5202 } 5203 5204 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5205 if (KMP_AFFINITY_CAPABLE()) { 5206 /* Restore initial master thread's affinity mask */ 5207 __kmp_set_system_affinity(old_mask, TRUE); 5208 KMP_CPU_FREE(old_mask); 5209 } 5210 #endif 5211 #if KMP_NESTED_HOT_TEAMS 5212 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5213 #endif // KMP_NESTED_HOT_TEAMS 5214 /* make sure everyone is syncronized */ 5215 int old_nproc = team->t.t_nproc; // save old value and use to update only 5216 // new threads below 5217 __kmp_initialize_team(team, new_nproc, new_icvs, 5218 root->r.r_uber_thread->th.th_ident); 5219 5220 /* reinitialize the threads */ 5221 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5222 for (f = 0; f < team->t.t_nproc; ++f) 5223 __kmp_initialize_info(team->t.t_threads[f], team, f, 5224 __kmp_gtid_from_tid(f, team)); 5225 5226 if (level) { // set th_task_state for new threads in nested hot team 5227 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5228 // only need to set the th_task_state for the new threads. th_task_state 5229 // for master thread will not be accurate until after this in 5230 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5231 // correct value. 5232 for (f = old_nproc; f < team->t.t_nproc; ++f) 5233 team->t.t_threads[f]->th.th_task_state = 5234 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5235 } else { // set th_task_state for new threads in non-nested hot team 5236 kmp_uint8 old_state = 5237 team->t.t_threads[0]->th.th_task_state; // copy master's state 5238 for (f = old_nproc; f < team->t.t_nproc; ++f) 5239 team->t.t_threads[f]->th.th_task_state = old_state; 5240 } 5241 5242 #ifdef KMP_DEBUG 5243 for (f = 0; f < team->t.t_nproc; ++f) { 5244 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5245 team->t.t_threads[f]->th.th_team_nproc == 5246 team->t.t_nproc); 5247 } 5248 #endif 5249 5250 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5251 #if KMP_AFFINITY_SUPPORTED 5252 __kmp_partition_places(team); 5253 #endif 5254 } // Check changes in number of threads 5255 5256 kmp_info_t *master = team->t.t_threads[0]; 5257 if (master->th.th_teams_microtask) { 5258 for (f = 1; f < new_nproc; ++f) { 5259 // propagate teams construct specific info to workers 5260 kmp_info_t *thr = 
team->t.t_threads[f]; 5261 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5262 thr->th.th_teams_level = master->th.th_teams_level; 5263 thr->th.th_teams_size = master->th.th_teams_size; 5264 } 5265 } 5266 #if KMP_NESTED_HOT_TEAMS 5267 if (level) { 5268 // Sync barrier state for nested hot teams, not needed for outermost hot 5269 // team. 5270 for (f = 1; f < new_nproc; ++f) { 5271 kmp_info_t *thr = team->t.t_threads[f]; 5272 int b; 5273 kmp_balign_t *balign = thr->th.th_bar; 5274 for (b = 0; b < bs_last_barrier; ++b) { 5275 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5276 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5277 #if USE_DEBUGGER 5278 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5279 #endif 5280 } 5281 } 5282 } 5283 #endif // KMP_NESTED_HOT_TEAMS 5284 5285 /* reallocate space for arguments if necessary */ 5286 __kmp_alloc_argv_entries(argc, team, TRUE); 5287 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5288 // The hot team re-uses the previous task team, 5289 // if untouched during the previous release->gather phase. 
5290 5291 KF_TRACE(10, (" hot_team = %p\n", team)); 5292 5293 #if KMP_DEBUG 5294 if (__kmp_tasking_mode != tskm_immediate_exec) { 5295 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5296 "task_team[1] = %p after reinit\n", 5297 team->t.t_task_team[0], team->t.t_task_team[1])); 5298 } 5299 #endif 5300 5301 #if OMPT_SUPPORT 5302 __ompt_team_assign_id(team, ompt_parallel_data); 5303 #endif 5304 5305 KMP_MB(); 5306 5307 return team; 5308 } 5309 5310 /* next, let's try to take one from the team pool */ 5311 KMP_MB(); 5312 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5313 /* TODO: consider resizing undersized teams instead of reaping them, now 5314 that we have a resizing mechanism */ 5315 if (team->t.t_max_nproc >= max_nproc) { 5316 /* take this team from the team pool */ 5317 __kmp_team_pool = team->t.t_next_pool; 5318 5319 /* setup the team for fresh use */ 5320 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5321 5322 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5323 "task_team[1] %p to NULL\n", 5324 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5325 team->t.t_task_team[0] = NULL; 5326 team->t.t_task_team[1] = NULL; 5327 5328 /* reallocate space for arguments if necessary */ 5329 __kmp_alloc_argv_entries(argc, team, TRUE); 5330 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5331 5332 KA_TRACE( 5333 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5334 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5335 { // Initialize barrier data. 
5336 int b; 5337 for (b = 0; b < bs_last_barrier; ++b) { 5338 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5339 #if USE_DEBUGGER 5340 team->t.t_bar[b].b_master_arrived = 0; 5341 team->t.t_bar[b].b_team_arrived = 0; 5342 #endif 5343 } 5344 } 5345 5346 team->t.t_proc_bind = new_proc_bind; 5347 5348 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5349 team->t.t_id)); 5350 5351 #if OMPT_SUPPORT 5352 __ompt_team_assign_id(team, ompt_parallel_data); 5353 #endif 5354 5355 KMP_MB(); 5356 5357 return team; 5358 } 5359 5360 /* reap team if it is too small, then loop back and check the next one */ 5361 // not sure if this is wise, but, will be redone during the hot-teams 5362 // rewrite. 5363 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5364 team = __kmp_reap_team(team); 5365 __kmp_team_pool = team; 5366 } 5367 5368 /* nothing available in the pool, no matter, make a new team! */ 5369 KMP_MB(); 5370 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5371 5372 /* and set it up */ 5373 team->t.t_max_nproc = max_nproc; 5374 /* NOTE well, for some reason allocating one big buffer and dividing it up 5375 seems to really hurt performance a lot on the P4, so, let's not use this */ 5376 __kmp_allocate_team_arrays(team, max_nproc); 5377 5378 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5379 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5380 5381 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5382 "%p to NULL\n", 5383 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5384 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5385 // memory, no need to duplicate 5386 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5387 // memory, no need to duplicate 5388 5389 if (__kmp_storage_map) { 5390 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5391 } 5392 5393 /* allocate space for arguments */ 5394 
__kmp_alloc_argv_entries(argc, team, FALSE); 5395 team->t.t_argc = argc; 5396 5397 KA_TRACE(20, 5398 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5399 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5400 { // Initialize barrier data. 5401 int b; 5402 for (b = 0; b < bs_last_barrier; ++b) { 5403 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5404 #if USE_DEBUGGER 5405 team->t.t_bar[b].b_master_arrived = 0; 5406 team->t.t_bar[b].b_team_arrived = 0; 5407 #endif 5408 } 5409 } 5410 5411 team->t.t_proc_bind = new_proc_bind; 5412 5413 #if OMPT_SUPPORT 5414 __ompt_team_assign_id(team, ompt_parallel_data); 5415 team->t.ompt_serialized_team_info = NULL; 5416 #endif 5417 5418 KMP_MB(); 5419 5420 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5421 team->t.t_id)); 5422 5423 return team; 5424 } 5425 5426 /* TODO implement hot-teams at all levels */ 5427 /* TODO implement lazy thread release on demand (disband request) */ 5428 5429 /* free the team. return it to the team pool. release all the threads 5430 * associated with it */ 5431 void __kmp_free_team(kmp_root_t *root, 5432 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5433 int f; 5434 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5435 team->t.t_id)); 5436 5437 /* verify state */ 5438 KMP_DEBUG_ASSERT(root); 5439 KMP_DEBUG_ASSERT(team); 5440 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5441 KMP_DEBUG_ASSERT(team->t.t_threads); 5442 5443 int use_hot_team = team == root->r.r_hot_team; 5444 #if KMP_NESTED_HOT_TEAMS 5445 int level; 5446 kmp_hot_team_ptr_t *hot_teams; 5447 if (master) { 5448 level = team->t.t_active_level - 1; 5449 if (master->th.th_teams_microtask) { // in teams construct? 
5450 if (master->th.th_teams_size.nteams > 1) { 5451 ++level; // level was not increased in teams construct for 5452 // team_of_masters 5453 } 5454 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5455 master->th.th_teams_level == team->t.t_level) { 5456 ++level; // level was not increased in teams construct for 5457 // team_of_workers before the parallel 5458 } // team->t.t_level will be increased inside parallel 5459 } 5460 hot_teams = master->th.th_hot_teams; 5461 if (level < __kmp_hot_teams_max_level) { 5462 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5463 use_hot_team = 1; 5464 } 5465 } 5466 #endif // KMP_NESTED_HOT_TEAMS 5467 5468 /* team is done working */ 5469 TCW_SYNC_PTR(team->t.t_pkfn, 5470 NULL); // Important for Debugging Support Library. 5471 #if KMP_OS_WINDOWS 5472 team->t.t_copyin_counter = 0; // init counter for possible reuse 5473 #endif 5474 // Do not reset pointer to parent team to NULL for hot teams. 5475 5476 /* if we are non-hot team, release our threads */ 5477 if (!use_hot_team) { 5478 if (__kmp_tasking_mode != tskm_immediate_exec) { 5479 // Wait for threads to reach reapable state 5480 for (f = 1; f < team->t.t_nproc; ++f) { 5481 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5482 kmp_info_t *th = team->t.t_threads[f]; 5483 volatile kmp_uint32 *state = &th->th.th_reap_state; 5484 while (*state != KMP_SAFE_TO_REAP) { 5485 #if KMP_OS_WINDOWS 5486 // On Windows a thread can be killed at any time, check this 5487 DWORD ecode; 5488 if (!__kmp_is_thread_alive(th, &ecode)) { 5489 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5490 break; 5491 } 5492 #endif 5493 // first check if thread is sleeping 5494 kmp_flag_64<> fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5495 if (fl.is_sleeping()) 5496 fl.resume(__kmp_gtid_from_thread(th)); 5497 KMP_CPU_PAUSE(); 5498 } 5499 } 5500 5501 // Delete task teams 5502 int tt_idx; 5503 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5504 kmp_task_team_t *task_team = 
team->t.t_task_team[tt_idx]; 5505 if (task_team != NULL) { 5506 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5507 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5508 team->t.t_threads[f]->th.th_task_team = NULL; 5509 } 5510 KA_TRACE( 5511 20, 5512 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5513 __kmp_get_gtid(), task_team, team->t.t_id)); 5514 #if KMP_NESTED_HOT_TEAMS 5515 __kmp_free_task_team(master, task_team); 5516 #endif 5517 team->t.t_task_team[tt_idx] = NULL; 5518 } 5519 } 5520 } 5521 5522 // Reset pointer to parent team only for non-hot teams. 5523 team->t.t_parent = NULL; 5524 team->t.t_level = 0; 5525 team->t.t_active_level = 0; 5526 5527 /* free the worker threads */ 5528 for (f = 1; f < team->t.t_nproc; ++f) { 5529 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5530 __kmp_free_thread(team->t.t_threads[f]); 5531 team->t.t_threads[f] = NULL; 5532 } 5533 5534 /* put the team back in the team pool */ 5535 /* TODO limit size of team pool, call reap_team if pool too large */ 5536 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5537 __kmp_team_pool = (volatile kmp_team_t *)team; 5538 } else { // Check if team was created for the masters in a teams construct 5539 // See if first worker is a CG root 5540 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5541 team->t.t_threads[1]->th.th_cg_roots); 5542 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5543 // Clean up the CG root nodes on workers so that this team can be re-used 5544 for (f = 1; f < team->t.t_nproc; ++f) { 5545 kmp_info_t *thr = team->t.t_threads[f]; 5546 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5547 thr->th.th_cg_roots->cg_root == thr); 5548 // Pop current CG root off list 5549 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5550 thr->th.th_cg_roots = tmp->up; 5551 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5552 " up to node %p. 
cg_nthreads was %d\n", 5553 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5554 int i = tmp->cg_nthreads--; 5555 if (i == 1) { 5556 __kmp_free(tmp); // free CG if we are the last thread in it 5557 } 5558 // Restore current task's thread_limit from CG root 5559 if (thr->th.th_cg_roots) 5560 thr->th.th_current_task->td_icvs.thread_limit = 5561 thr->th.th_cg_roots->cg_thread_limit; 5562 } 5563 } 5564 } 5565 5566 KMP_MB(); 5567 } 5568 5569 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5570 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5571 kmp_team_t *next_pool = team->t.t_next_pool; 5572 5573 KMP_DEBUG_ASSERT(team); 5574 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5575 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5576 KMP_DEBUG_ASSERT(team->t.t_threads); 5577 KMP_DEBUG_ASSERT(team->t.t_argv); 5578 5579 /* TODO clean the threads that are a part of this? */ 5580 5581 /* free stuff */ 5582 __kmp_free_team_arrays(team); 5583 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5584 __kmp_free((void *)team->t.t_argv); 5585 __kmp_free(team); 5586 5587 KMP_MB(); 5588 return next_pool; 5589 } 5590 5591 // Free the thread. Don't reap it, just place it on the pool of available 5592 // threads. 5593 // 5594 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5595 // binding for the affinity mechanism to be useful. 5596 // 5597 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5598 // However, we want to avoid a potential performance problem by always 5599 // scanning through the list to find the correct point at which to insert 5600 // the thread (potential N**2 behavior). To do this we keep track of the 5601 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5602 // With single-level parallelism, threads will always be added to the tail 5603 // of the list, kept track of by __kmp_thread_pool_insert_pt. 
With nested 5604 // parallelism, all bets are off and we may need to scan through the entire 5605 // free list. 5606 // 5607 // This change also has a potentially large performance benefit, for some 5608 // applications. Previously, as threads were freed from the hot team, they 5609 // would be placed back on the free list in inverse order. If the hot team 5610 // grew back to it's original size, then the freed thread would be placed 5611 // back on the hot team in reverse order. This could cause bad cache 5612 // locality problems on programs where the size of the hot team regularly 5613 // grew and shrunk. 5614 // 5615 // Now, for single-level parallelism, the OMP tid is always == gtid. 5616 void __kmp_free_thread(kmp_info_t *this_th) { 5617 int gtid; 5618 kmp_info_t **scan; 5619 5620 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5621 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5622 5623 KMP_DEBUG_ASSERT(this_th); 5624 5625 // When moving thread to pool, switch thread to wait on own b_go flag, and 5626 // uninitialized (NULL team). 
5627 int b; 5628 kmp_balign_t *balign = this_th->th.th_bar; 5629 for (b = 0; b < bs_last_barrier; ++b) { 5630 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5631 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5632 balign[b].bb.team = NULL; 5633 balign[b].bb.leaf_kids = 0; 5634 } 5635 this_th->th.th_task_state = 0; 5636 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5637 5638 /* put thread back on the free pool */ 5639 TCW_PTR(this_th->th.th_team, NULL); 5640 TCW_PTR(this_th->th.th_root, NULL); 5641 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5642 5643 while (this_th->th.th_cg_roots) { 5644 this_th->th.th_cg_roots->cg_nthreads--; 5645 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5646 " %p of thread %p to %d\n", 5647 this_th, this_th->th.th_cg_roots, 5648 this_th->th.th_cg_roots->cg_root, 5649 this_th->th.th_cg_roots->cg_nthreads)); 5650 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5651 if (tmp->cg_root == this_th) { // Thread is a cg_root 5652 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5653 KA_TRACE( 5654 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5655 this_th->th.th_cg_roots = tmp->up; 5656 __kmp_free(tmp); 5657 } else { // Worker thread 5658 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5659 __kmp_free(tmp); 5660 } 5661 this_th->th.th_cg_roots = NULL; 5662 break; 5663 } 5664 } 5665 5666 /* If the implicit task assigned to this thread can be used by other threads 5667 * -> multiple threads can share the data and try to free the task at 5668 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5669 * with higher probability when hot team is disabled but can occurs even when 5670 * the hot team is enabled */ 5671 __kmp_free_implicit_task(this_th); 5672 this_th->th.th_current_task = NULL; 5673 5674 // If the __kmp_thread_pool_insert_pt is already past the new insert 5675 // point, then we need to re-scan the entire list. 
5676 gtid = this_th->th.th_info.ds.ds_gtid; 5677 if (__kmp_thread_pool_insert_pt != NULL) { 5678 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5679 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5680 __kmp_thread_pool_insert_pt = NULL; 5681 } 5682 } 5683 5684 // Scan down the list to find the place to insert the thread. 5685 // scan is the address of a link in the list, possibly the address of 5686 // __kmp_thread_pool itself. 5687 // 5688 // In the absence of nested parallelism, the for loop will have 0 iterations. 5689 if (__kmp_thread_pool_insert_pt != NULL) { 5690 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5691 } else { 5692 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5693 } 5694 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5695 scan = &((*scan)->th.th_next_pool)) 5696 ; 5697 5698 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5699 // to its address. 5700 TCW_PTR(this_th->th.th_next_pool, *scan); 5701 __kmp_thread_pool_insert_pt = *scan = this_th; 5702 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5703 (this_th->th.th_info.ds.ds_gtid < 5704 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5705 TCW_4(this_th->th.th_in_pool, TRUE); 5706 __kmp_suspend_initialize_thread(this_th); 5707 __kmp_lock_suspend_mx(this_th); 5708 if (this_th->th.th_active == TRUE) { 5709 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5710 this_th->th.th_active_in_pool = TRUE; 5711 } 5712 #if KMP_DEBUG 5713 else { 5714 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5715 } 5716 #endif 5717 __kmp_unlock_suspend_mx(this_th); 5718 5719 TCW_4(__kmp_nth, __kmp_nth - 1); 5720 5721 #ifdef KMP_ADJUST_BLOCKTIME 5722 /* Adjust blocktime back to user setting or default if necessary */ 5723 /* Middle initialization might never have occurred */ 5724 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5725 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5726 if (__kmp_nth <= __kmp_avail_proc) { 5727 
__kmp_zero_bt = FALSE; 5728 } 5729 } 5730 #endif /* KMP_ADJUST_BLOCKTIME */ 5731 5732 KMP_MB(); 5733 } 5734 5735 /* ------------------------------------------------------------------------ */ 5736 5737 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5738 int gtid = this_thr->th.th_info.ds.ds_gtid; 5739 /* void *stack_data;*/ 5740 kmp_team_t **volatile pteam; 5741 5742 KMP_MB(); 5743 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5744 5745 if (__kmp_env_consistency_check) { 5746 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 5747 } 5748 5749 #if OMPT_SUPPORT 5750 ompt_data_t *thread_data; 5751 if (ompt_enabled.enabled) { 5752 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5753 *thread_data = ompt_data_none; 5754 5755 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5756 this_thr->th.ompt_thread_info.wait_id = 0; 5757 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5758 this_thr->th.ompt_thread_info.parallel_flags = 0; 5759 if (ompt_enabled.ompt_callback_thread_begin) { 5760 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5761 ompt_thread_worker, thread_data); 5762 } 5763 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5764 } 5765 #endif 5766 5767 /* This is the place where threads wait for work */ 5768 while (!TCR_4(__kmp_global.g.g_done)) { 5769 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5770 KMP_MB(); 5771 5772 /* wait for work to do */ 5773 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5774 5775 /* No tid yet since not part of a team */ 5776 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5777 5778 #if OMPT_SUPPORT 5779 if (ompt_enabled.enabled) { 5780 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5781 } 5782 #endif 5783 5784 pteam = &this_thr->th.th_team; 5785 5786 /* have we been allocated? 
*/ 5787 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5788 /* we were just woken up, so run our new task */ 5789 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5790 int rc; 5791 KA_TRACE(20, 5792 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5793 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5794 (*pteam)->t.t_pkfn)); 5795 5796 updateHWFPControl(*pteam); 5797 5798 #if OMPT_SUPPORT 5799 if (ompt_enabled.enabled) { 5800 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5801 } 5802 #endif 5803 5804 rc = (*pteam)->t.t_invoke(gtid); 5805 KMP_ASSERT(rc); 5806 5807 KMP_MB(); 5808 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5809 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5810 (*pteam)->t.t_pkfn)); 5811 } 5812 #if OMPT_SUPPORT 5813 if (ompt_enabled.enabled) { 5814 /* no frame set while outside task */ 5815 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5816 5817 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5818 } 5819 #endif 5820 /* join barrier after parallel region */ 5821 __kmp_join_barrier(gtid); 5822 } 5823 } 5824 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5825 5826 #if OMPT_SUPPORT 5827 if (ompt_enabled.ompt_callback_thread_end) { 5828 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5829 } 5830 #endif 5831 5832 this_thr->th.th_task_team = NULL; 5833 /* run the destructors for the threadprivate data for this thread */ 5834 __kmp_common_destroy_gtid(gtid); 5835 5836 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5837 KMP_MB(); 5838 return this_thr; 5839 } 5840 5841 /* ------------------------------------------------------------------------ */ 5842 5843 void __kmp_internal_end_dest(void *specific_gtid) { 5844 // Make sure no significant bits are lost 5845 int gtid; 5846 __kmp_type_convert((kmp_intptr_t)specific_gtid - 1, >id); 5847 5848 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5849 /* NOTE: the gtid is 
stored as gitd+1 in the thread-local-storage 5850 * this is because 0 is reserved for the nothing-stored case */ 5851 5852 __kmp_internal_end_thread(gtid); 5853 } 5854 5855 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5856 5857 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5858 __kmp_internal_end_atexit(); 5859 } 5860 5861 #endif 5862 5863 /* [Windows] josh: when the atexit handler is called, there may still be more 5864 than one thread alive */ 5865 void __kmp_internal_end_atexit(void) { 5866 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5867 /* [Windows] 5868 josh: ideally, we want to completely shutdown the library in this atexit 5869 handler, but stat code that depends on thread specific data for gtid fails 5870 because that data becomes unavailable at some point during the shutdown, so 5871 we call __kmp_internal_end_thread instead. We should eventually remove the 5872 dependency on __kmp_get_specific_gtid in the stat code and use 5873 __kmp_internal_end_library to cleanly shutdown the library. 5874 5875 // TODO: Can some of this comment about GVS be removed? 5876 I suspect that the offending stat code is executed when the calling thread 5877 tries to clean up a dead root thread's data structures, resulting in GVS 5878 code trying to close the GVS structures for that thread, but since the stat 5879 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5880 the calling thread is cleaning up itself instead of another thread, it get 5881 confused. This happens because allowing a thread to unregister and cleanup 5882 another thread is a recent modification for addressing an issue. 5883 Based on the current design (20050722), a thread may end up 5884 trying to unregister another thread only if thread death does not trigger 5885 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5886 thread specific data destructor function to detect thread death. For 5887 Windows dynamic, there is DllMain(THREAD_DETACH). 
For Windows static, there 5888 is nothing. Thus, the workaround is applicable only for Windows static 5889 stat library. */ 5890 __kmp_internal_end_library(-1); 5891 #if KMP_OS_WINDOWS 5892 __kmp_close_console(); 5893 #endif 5894 } 5895 5896 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5897 // It is assumed __kmp_forkjoin_lock is acquired. 5898 5899 int gtid; 5900 5901 KMP_DEBUG_ASSERT(thread != NULL); 5902 5903 gtid = thread->th.th_info.ds.ds_gtid; 5904 5905 if (!is_root) { 5906 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5907 /* Assume the threads are at the fork barrier here */ 5908 KA_TRACE( 5909 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5910 gtid)); 5911 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5912 * (GEH) */ 5913 ANNOTATE_HAPPENS_BEFORE(thread); 5914 kmp_flag_64<> flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 5915 thread); 5916 __kmp_release_64(&flag); 5917 } 5918 5919 // Terminate OS thread. 5920 __kmp_reap_worker(thread); 5921 5922 // The thread was killed asynchronously. If it was actively 5923 // spinning in the thread pool, decrement the global count. 5924 // 5925 // There is a small timing hole here - if the worker thread was just waking 5926 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5927 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5928 // the global counter might not get updated. 5929 // 5930 // Currently, this can only happen as the library is unloaded, 5931 // so there are no harmful side effects. 
5932 if (thread->th.th_active_in_pool) { 5933 thread->th.th_active_in_pool = FALSE; 5934 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5935 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5936 } 5937 } 5938 5939 __kmp_free_implicit_task(thread); 5940 5941 // Free the fast memory for tasking 5942 #if USE_FAST_MEMORY 5943 __kmp_free_fast_memory(thread); 5944 #endif /* USE_FAST_MEMORY */ 5945 5946 __kmp_suspend_uninitialize_thread(thread); 5947 5948 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5949 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5950 5951 --__kmp_all_nth; 5952 // __kmp_nth was decremented when thread is added to the pool. 5953 5954 #ifdef KMP_ADJUST_BLOCKTIME 5955 /* Adjust blocktime back to user setting or default if necessary */ 5956 /* Middle initialization might never have occurred */ 5957 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5958 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5959 if (__kmp_nth <= __kmp_avail_proc) { 5960 __kmp_zero_bt = FALSE; 5961 } 5962 } 5963 #endif /* KMP_ADJUST_BLOCKTIME */ 5964 5965 /* free the memory being used */ 5966 if (__kmp_env_consistency_check) { 5967 if (thread->th.th_cons) { 5968 __kmp_free_cons_stack(thread->th.th_cons); 5969 thread->th.th_cons = NULL; 5970 } 5971 } 5972 5973 if (thread->th.th_pri_common != NULL) { 5974 __kmp_free(thread->th.th_pri_common); 5975 thread->th.th_pri_common = NULL; 5976 } 5977 5978 if (thread->th.th_task_state_memo_stack != NULL) { 5979 __kmp_free(thread->th.th_task_state_memo_stack); 5980 thread->th.th_task_state_memo_stack = NULL; 5981 } 5982 5983 #if KMP_USE_BGET 5984 if (thread->th.th_local.bget_data != NULL) { 5985 __kmp_finalize_bget(thread); 5986 } 5987 #endif 5988 5989 #if KMP_AFFINITY_SUPPORTED 5990 if (thread->th.th_affin_mask != NULL) { 5991 KMP_CPU_FREE(thread->th.th_affin_mask); 5992 thread->th.th_affin_mask = NULL; 5993 } 5994 #endif /* KMP_AFFINITY_SUPPORTED */ 5995 5996 #if KMP_USE_HIER_SCHED 5997 if (thread->th.th_hier_bar_data != NULL) { 5998 
__kmp_free(thread->th.th_hier_bar_data); 5999 thread->th.th_hier_bar_data = NULL; 6000 } 6001 #endif 6002 6003 __kmp_reap_team(thread->th.th_serial_team); 6004 thread->th.th_serial_team = NULL; 6005 __kmp_free(thread); 6006 6007 KMP_MB(); 6008 6009 } // __kmp_reap_thread 6010 6011 static void __kmp_internal_end(void) { 6012 int i; 6013 6014 /* First, unregister the library */ 6015 __kmp_unregister_library(); 6016 6017 #if KMP_OS_WINDOWS 6018 /* In Win static library, we can't tell when a root actually dies, so we 6019 reclaim the data structures for any root threads that have died but not 6020 unregistered themselves, in order to shut down cleanly. 6021 In Win dynamic library we also can't tell when a thread dies. */ 6022 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6023 // dead roots 6024 #endif 6025 6026 for (i = 0; i < __kmp_threads_capacity; i++) 6027 if (__kmp_root[i]) 6028 if (__kmp_root[i]->r.r_active) 6029 break; 6030 KMP_MB(); /* Flush all pending memory write invalidates. */ 6031 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6032 6033 if (i < __kmp_threads_capacity) { 6034 #if KMP_USE_MONITOR 6035 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6036 KMP_MB(); /* Flush all pending memory write invalidates. */ 6037 6038 // Need to check that monitor was initialized before reaping it. If we are 6039 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6040 // __kmp_monitor will appear to contain valid data, but it is only valid in 6041 // the parent process, not the child. 6042 // New behavior (201008): instead of keying off of the flag 6043 // __kmp_init_parallel, the monitor thread creation is keyed off 6044 // of the new flag __kmp_init_monitor. 
6045 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6046 if (TCR_4(__kmp_init_monitor)) { 6047 __kmp_reap_monitor(&__kmp_monitor); 6048 TCW_4(__kmp_init_monitor, 0); 6049 } 6050 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6051 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6052 #endif // KMP_USE_MONITOR 6053 } else { 6054 /* TODO move this to cleanup code */ 6055 #ifdef KMP_DEBUG 6056 /* make sure that everything has properly ended */ 6057 for (i = 0; i < __kmp_threads_capacity; i++) { 6058 if (__kmp_root[i]) { 6059 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6060 // there can be uber threads alive here 6061 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6062 } 6063 } 6064 #endif 6065 6066 KMP_MB(); 6067 6068 // Reap the worker threads. 6069 // This is valid for now, but be careful if threads are reaped sooner. 6070 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6071 // Get the next thread from the pool. 6072 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6073 __kmp_thread_pool = thread->th.th_next_pool; 6074 // Reap it. 6075 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6076 thread->th.th_next_pool = NULL; 6077 thread->th.th_in_pool = FALSE; 6078 __kmp_reap_thread(thread, 0); 6079 } 6080 __kmp_thread_pool_insert_pt = NULL; 6081 6082 // Reap teams. 6083 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6084 // Get the next team from the pool. 6085 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6086 __kmp_team_pool = team->t.t_next_pool; 6087 // Reap it. 
6088 team->t.t_next_pool = NULL; 6089 __kmp_reap_team(team); 6090 } 6091 6092 __kmp_reap_task_teams(); 6093 6094 #if KMP_OS_UNIX 6095 // Threads that are not reaped should not access any resources since they 6096 // are going to be deallocated soon, so the shutdown sequence should wait 6097 // until all threads either exit the final spin-waiting loop or begin 6098 // sleeping after the given blocktime. 6099 for (i = 0; i < __kmp_threads_capacity; i++) { 6100 kmp_info_t *thr = __kmp_threads[i]; 6101 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6102 KMP_CPU_PAUSE(); 6103 } 6104 #endif 6105 6106 for (i = 0; i < __kmp_threads_capacity; ++i) { 6107 // TBD: Add some checking... 6108 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6109 } 6110 6111 /* Make sure all threadprivate destructors get run by joining with all 6112 worker threads before resetting this flag */ 6113 TCW_SYNC_4(__kmp_init_common, FALSE); 6114 6115 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6116 KMP_MB(); 6117 6118 #if KMP_USE_MONITOR 6119 // See note above: One of the possible fixes for CQ138434 / CQ140126 6120 // 6121 // FIXME: push both code fragments down and CSE them? 6122 // push them into __kmp_cleanup() ? 6123 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6124 if (TCR_4(__kmp_init_monitor)) { 6125 __kmp_reap_monitor(&__kmp_monitor); 6126 TCW_4(__kmp_init_monitor, 0); 6127 } 6128 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6129 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6130 #endif 6131 } /* else !__kmp_global.t_active */ 6132 TCW_4(__kmp_init_gtid, FALSE); 6133 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6134 6135 __kmp_cleanup(); 6136 #if OMPT_SUPPORT 6137 ompt_fini(); 6138 #endif 6139 } 6140 6141 void __kmp_internal_end_library(int gtid_req) { 6142 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6143 /* this shouldn't be a race condition because __kmp_internal_end() is the 6144 only place to clear __kmp_serial_init */ 6145 /* we'll check this later too, after we get the lock */ 6146 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6147 // redundant, because the next check will work in any case. 6148 if (__kmp_global.g.g_abort) { 6149 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6150 /* TODO abort? */ 6151 return; 6152 } 6153 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6154 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6155 return; 6156 } 6157 6158 KMP_MB(); /* Flush all pending memory write invalidates. */ 6159 /* find out who we are and what we should do */ 6160 { 6161 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6162 KA_TRACE( 6163 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6164 if (gtid == KMP_GTID_SHUTDOWN) { 6165 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6166 "already shutdown\n")); 6167 return; 6168 } else if (gtid == KMP_GTID_MONITOR) { 6169 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6170 "registered, or system shutdown\n")); 6171 return; 6172 } else if (gtid == KMP_GTID_DNE) { 6173 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6174 "shutdown\n")); 6175 /* we don't know who we are, but we may still shutdown the library */ 6176 } else if (KMP_UBER_GTID(gtid)) { 6177 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6178 if (__kmp_root[gtid]->r.r_active) { 6179 __kmp_global.g.g_abort = -1; 6180 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6181 __kmp_unregister_library(); 6182 KA_TRACE(10, 6183 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6184 gtid)); 6185 return; 6186 } else { 6187 KA_TRACE( 6188 10, 6189 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6190 __kmp_unregister_root_current_thread(gtid); 6191 } 6192 } else { 6193 /* worker threads may call this function through the atexit handler, if they 6194 * call exit() */ 6195 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6196 TODO: do a thorough shutdown instead */ 6197 #ifdef DUMP_DEBUG_ON_EXIT 6198 if (__kmp_debug_buf) 6199 __kmp_dump_debug_buffer(); 6200 #endif 6201 // added unregister library call here when we switch to shm linux 6202 // if we don't, it will leave lots of files in /dev/shm 6203 // cleanup shared memory file before exiting. 6204 __kmp_unregister_library(); 6205 return; 6206 } 6207 } 6208 /* synchronize the termination process */ 6209 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6210 6211 /* have we already finished */ 6212 if (__kmp_global.g.g_abort) { 6213 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6214 /* TODO abort? */ 6215 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6216 return; 6217 } 6218 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6219 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6220 return; 6221 } 6222 6223 /* We need this lock to enforce mutex between this reading of 6224 __kmp_threads_capacity and the writing by __kmp_register_root. 6225 Alternatively, we can use a counter of roots that is atomically updated by 6226 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6227 __kmp_internal_end_*. 
*/ 6228 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6229 6230 /* now we can safely conduct the actual termination */ 6231 __kmp_internal_end(); 6232 6233 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6234 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6235 6236 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6237 6238 #ifdef DUMP_DEBUG_ON_EXIT 6239 if (__kmp_debug_buf) 6240 __kmp_dump_debug_buffer(); 6241 #endif 6242 6243 #if KMP_OS_WINDOWS 6244 __kmp_close_console(); 6245 #endif 6246 6247 __kmp_fini_allocator(); 6248 6249 } // __kmp_internal_end_library 6250 6251 void __kmp_internal_end_thread(int gtid_req) { 6252 int i; 6253 6254 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6255 /* this shouldn't be a race condition because __kmp_internal_end() is the 6256 * only place to clear __kmp_serial_init */ 6257 /* we'll check this later too, after we get the lock */ 6258 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6259 // redundant, because the next check will work in any case. 6260 if (__kmp_global.g.g_abort) { 6261 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6262 /* TODO abort? */ 6263 return; 6264 } 6265 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6266 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6267 return; 6268 } 6269 6270 // If hidden helper team has been initialized, we need to deinit it 6271 if (TCR_4(__kmp_init_hidden_helper)) { 6272 TCW_SYNC_4(__kmp_hidden_helper_team_done, TRUE); 6273 // First release the main thread to let it continue its work 6274 __kmp_hidden_helper_main_thread_release(); 6275 // Wait until the hidden helper team has been destroyed 6276 __kmp_hidden_helper_threads_deinitz_wait(); 6277 } 6278 6279 KMP_MB(); /* Flush all pending memory write invalidates. */ 6280 6281 /* find out who we are and what we should do */ 6282 { 6283 int gtid = (gtid_req >= 0) ? 
gtid_req : __kmp_gtid_get_specific(); 6284 KA_TRACE(10, 6285 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6286 if (gtid == KMP_GTID_SHUTDOWN) { 6287 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6288 "already shutdown\n")); 6289 return; 6290 } else if (gtid == KMP_GTID_MONITOR) { 6291 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6292 "registered, or system shutdown\n")); 6293 return; 6294 } else if (gtid == KMP_GTID_DNE) { 6295 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6296 "shutdown\n")); 6297 return; 6298 /* we don't know who we are */ 6299 } else if (KMP_UBER_GTID(gtid)) { 6300 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6301 if (__kmp_root[gtid]->r.r_active) { 6302 __kmp_global.g.g_abort = -1; 6303 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6304 KA_TRACE(10, 6305 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6306 gtid)); 6307 return; 6308 } else { 6309 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6310 gtid)); 6311 __kmp_unregister_root_current_thread(gtid); 6312 } 6313 } else { 6314 /* just a worker thread, let's leave */ 6315 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6316 6317 if (gtid >= 0) { 6318 __kmp_threads[gtid]->th.th_task_team = NULL; 6319 } 6320 6321 KA_TRACE(10, 6322 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6323 gtid)); 6324 return; 6325 } 6326 } 6327 #if KMP_DYNAMIC_LIB 6328 if (__kmp_pause_status != kmp_hard_paused) 6329 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6330 // because we will better shutdown later in the library destructor. 
6331 { 6332 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6333 return; 6334 } 6335 #endif 6336 /* synchronize the termination process */ 6337 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6338 6339 /* have we already finished */ 6340 if (__kmp_global.g.g_abort) { 6341 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6342 /* TODO abort? */ 6343 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6344 return; 6345 } 6346 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6347 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6348 return; 6349 } 6350 6351 /* We need this lock to enforce mutex between this reading of 6352 __kmp_threads_capacity and the writing by __kmp_register_root. 6353 Alternatively, we can use a counter of roots that is atomically updated by 6354 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6355 __kmp_internal_end_*. */ 6356 6357 /* should we finish the run-time? are all siblings done? */ 6358 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6359 6360 for (i = 0; i < __kmp_threads_capacity; ++i) { 6361 if (KMP_UBER_GTID(i)) { 6362 KA_TRACE( 6363 10, 6364 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6365 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6366 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6367 return; 6368 } 6369 } 6370 6371 /* now we can safely conduct the actual termination */ 6372 6373 __kmp_internal_end(); 6374 6375 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6376 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6377 6378 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6379 6380 #ifdef DUMP_DEBUG_ON_EXIT 6381 if (__kmp_debug_buf) 6382 __kmp_dump_debug_buffer(); 6383 #endif 6384 } // __kmp_internal_end_thread 6385 6386 // ----------------------------------------------------------------------------- 6387 // Library registration stuff. 
6388 6389 static long __kmp_registration_flag = 0; 6390 // Random value used to indicate library initialization. 6391 static char *__kmp_registration_str = NULL; 6392 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6393 6394 static inline char *__kmp_reg_status_name() { 6395 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6396 each thread. If registration and unregistration go in different threads 6397 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6398 env var can not be found, because the name will contain different pid. */ 6399 // macOS* complains about name being too long with additional getuid() 6400 #if KMP_OS_UNIX && !KMP_OS_DARWIN && KMP_DYNAMIC_LIB 6401 return __kmp_str_format("__KMP_REGISTERED_LIB_%d_%d", (int)getpid(), 6402 (int)getuid()); 6403 #else 6404 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6405 #endif 6406 } // __kmp_reg_status_get 6407 6408 void __kmp_register_library_startup(void) { 6409 6410 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6411 int done = 0; 6412 union { 6413 double dtime; 6414 long ltime; 6415 } time; 6416 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6417 __kmp_initialize_system_tick(); 6418 #endif 6419 __kmp_read_system_time(&time.dtime); 6420 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6421 __kmp_registration_str = 6422 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6423 __kmp_registration_flag, KMP_LIBRARY_FILE); 6424 6425 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6426 __kmp_registration_str)); 6427 6428 while (!done) { 6429 6430 char *value = NULL; // Actual value of the environment variable. 
6431 6432 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6433 char *shm_name = __kmp_str_format("/%s", name); 6434 int shm_preexist = 0; 6435 char *data1; 6436 int fd1 = shm_open(shm_name, O_CREAT | O_EXCL | O_RDWR, 0666); 6437 if ((fd1 == -1) && (errno == EEXIST)) { 6438 // file didn't open because it already exists. 6439 // try opening existing file 6440 fd1 = shm_open(shm_name, O_RDWR, 0666); 6441 if (fd1 == -1) { // file didn't open 6442 // error out here 6443 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM"), KMP_ERR(0), 6444 __kmp_msg_null); 6445 } else { 6446 // able to open existing file 6447 shm_preexist = 1; 6448 } 6449 } else if (fd1 == -1) { // SHM didn't open; it was due to error other than 6450 // already exists. 6451 // error out here. 6452 __kmp_fatal(KMP_MSG(FunctionError, "Can't open SHM2"), KMP_ERR(errno), 6453 __kmp_msg_null); 6454 } 6455 if (shm_preexist == 0) { 6456 // we created SHM now set size 6457 if (ftruncate(fd1, SHM_SIZE) == -1) { 6458 // error occured setting size; 6459 __kmp_fatal(KMP_MSG(FunctionError, "Can't set size of SHM"), 6460 KMP_ERR(errno), __kmp_msg_null); 6461 } 6462 } 6463 data1 = 6464 (char *)mmap(0, SHM_SIZE, PROT_READ | PROT_WRITE, MAP_SHARED, fd1, 0); 6465 if (data1 == MAP_FAILED) { 6466 // failed to map shared memory 6467 __kmp_fatal(KMP_MSG(FunctionError, "Can't map SHM"), KMP_ERR(errno), 6468 __kmp_msg_null); 6469 } 6470 if (shm_preexist == 0) { // set data to SHM, set value 6471 KMP_STRCPY_S(data1, SHM_SIZE, __kmp_registration_str); 6472 } 6473 // Read value from either what we just wrote or existing file. 6474 value = __kmp_str_format("%s", data1); // read value from SHM 6475 munmap(data1, SHM_SIZE); 6476 close(fd1); 6477 #else // Windows and unix with static library 6478 // Set environment variable, but do not overwrite if it is exist. 
6479 __kmp_env_set(name, __kmp_registration_str, 0); 6480 // read value to see if it got set 6481 value = __kmp_env_get(name); 6482 #endif 6483 6484 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6485 done = 1; // Ok, environment variable set successfully, exit the loop. 6486 } else { 6487 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6488 // Check whether it alive or dead. 6489 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6490 char *tail = value; 6491 char *flag_addr_str = NULL; 6492 char *flag_val_str = NULL; 6493 char const *file_name = NULL; 6494 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6495 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6496 file_name = tail; 6497 if (tail != NULL) { 6498 long *flag_addr = 0; 6499 long flag_val = 0; 6500 KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr)); 6501 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6502 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6503 // First, check whether environment-encoded address is mapped into 6504 // addr space. 6505 // If so, dereference it to see if it still has the right value. 6506 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6507 neighbor = 1; 6508 } else { 6509 // If not, then we know the other copy of the library is no longer 6510 // running. 6511 neighbor = 2; 6512 } 6513 } 6514 } 6515 switch (neighbor) { 6516 case 0: // Cannot parse environment variable -- neighbor status unknown. 6517 // Assume it is the incompatible format of future version of the 6518 // library. Assume the other library is alive. 6519 // WARN( ... ); // TODO: Issue a warning. 6520 file_name = "unknown library"; 6521 KMP_FALLTHROUGH(); 6522 // Attention! Falling to the next case. That's intentional. 6523 case 1: { // Neighbor is alive. 6524 // Check it is allowed. 
6525 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6526 if (!__kmp_str_match_true(duplicate_ok)) { 6527 // That's not allowed. Issue fatal error. 6528 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6529 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6530 } 6531 KMP_INTERNAL_FREE(duplicate_ok); 6532 __kmp_duplicate_library_ok = 1; 6533 done = 1; // Exit the loop. 6534 } break; 6535 case 2: { // Neighbor is dead. 6536 6537 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6538 // close shared memory. 6539 shm_unlink(shm_name); // this removes file in /dev/shm 6540 #else 6541 // Clear the variable and try to register library again. 6542 __kmp_env_unset(name); 6543 #endif 6544 } break; 6545 default: { KMP_DEBUG_ASSERT(0); } break; 6546 } 6547 } 6548 KMP_INTERNAL_FREE((void *)value); 6549 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6550 KMP_INTERNAL_FREE((void *)shm_name); 6551 #endif 6552 } // while 6553 KMP_INTERNAL_FREE((void *)name); 6554 6555 } // func __kmp_register_library_startup 6556 6557 void __kmp_unregister_library(void) { 6558 6559 char *name = __kmp_reg_status_name(); 6560 char *value = NULL; 6561 6562 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6563 char *shm_name = __kmp_str_format("/%s", name); 6564 int fd1 = shm_open(shm_name, O_RDONLY, 0666); 6565 if (fd1 == -1) { 6566 // file did not open. return. 6567 return; 6568 } 6569 char *data1 = (char *)mmap(0, SHM_SIZE, PROT_READ, MAP_SHARED, fd1, 0); 6570 if (data1 != MAP_FAILED) { 6571 value = __kmp_str_format("%s", data1); // read value from SHM 6572 munmap(data1, SHM_SIZE); 6573 } 6574 close(fd1); 6575 #else 6576 value = __kmp_env_get(name); 6577 #endif 6578 6579 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6580 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6581 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6582 // Ok, this is our variable. Delete it. 
6583 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6584 shm_unlink(shm_name); // this removes file in /dev/shm 6585 #else 6586 __kmp_env_unset(name); 6587 #endif 6588 } 6589 6590 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB // shared memory is with dynamic library 6591 KMP_INTERNAL_FREE(shm_name); 6592 #endif 6593 6594 KMP_INTERNAL_FREE(__kmp_registration_str); 6595 KMP_INTERNAL_FREE(value); 6596 KMP_INTERNAL_FREE(name); 6597 6598 __kmp_registration_flag = 0; 6599 __kmp_registration_str = NULL; 6600 6601 } // __kmp_unregister_library 6602 6603 // End of Library registration stuff. 6604 // ----------------------------------------------------------------------------- 6605 6606 #if KMP_MIC_SUPPORTED 6607 6608 static void __kmp_check_mic_type() { 6609 kmp_cpuid_t cpuid_state = {0}; 6610 kmp_cpuid_t *cs_p = &cpuid_state; 6611 __kmp_x86_cpuid(1, 0, cs_p); 6612 // We don't support mic1 at the moment 6613 if ((cs_p->eax & 0xff0) == 0xB10) { 6614 __kmp_mic_type = mic2; 6615 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6616 __kmp_mic_type = mic3; 6617 } else { 6618 __kmp_mic_type = non_mic; 6619 } 6620 } 6621 6622 #endif /* KMP_MIC_SUPPORTED */ 6623 6624 #if KMP_HAVE_UMWAIT 6625 static void __kmp_user_level_mwait_init() { 6626 struct kmp_cpuid buf; 6627 __kmp_x86_cpuid(7, 0, &buf); 6628 __kmp_umwait_enabled = ((buf.ecx >> 5) & 1) && __kmp_user_level_mwait; 6629 KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_umwait_enabled = %d\n", 6630 __kmp_umwait_enabled)); 6631 } 6632 #elif KMP_HAVE_MWAIT 6633 #ifndef AT_INTELPHIUSERMWAIT 6634 // Spurious, non-existent value that should always fail to return anything. 6635 // Will be replaced with the correct value when we know that. 6636 #define AT_INTELPHIUSERMWAIT 10000 6637 #endif 6638 // getauxval() function is available in RHEL7 and SLES12. If a system with an 6639 // earlier OS is used to build the RTL, we'll use the following internal 6640 // function when the entry is not found. 
unsigned long getauxval(unsigned long) KMP_WEAK_ATTRIBUTE_EXTERNAL;
// Internal fallback definition: always returns 0.
unsigned long getauxval(unsigned long) { return 0; }

// Decides whether user-level mwait is enabled (__kmp_mwait_enabled). The
// setting is only touched when __kmp_mic_type == mic3.
static void __kmp_user_level_mwait_init() {
  // When getauxval() and correct value of AT_INTELPHIUSERMWAIT are available
  // use them to find if the user-level mwait is enabled. Otherwise, forcibly
  // set __kmp_mwait_enabled=TRUE on Intel MIC if the environment variable
  // KMP_USER_LEVEL_MWAIT was set to TRUE.
  if (__kmp_mic_type == mic3) {
    unsigned long res = getauxval(AT_INTELPHIUSERMWAIT);
    if ((res & 0x1) || __kmp_user_level_mwait) {
      __kmp_mwait_enabled = TRUE;
      if (__kmp_user_level_mwait) {
        KMP_INFORM(EnvMwaitWarn);
      }
    } else {
      __kmp_mwait_enabled = FALSE;
    }
  }
  KF_TRACE(30, ("__kmp_user_level_mwait_init: __kmp_mic_type = %d, "
                "__kmp_mwait_enabled = %d\n",
                __kmp_mic_type, __kmp_mwait_enabled));
}
#endif /* KMP_HAVE_UMWAIT */

// Body of the serial (first-stage) initialization. In order, it:
//   - initializes the allocator and registers the library,
//   - initializes the global locks,
//   - assigns defaults to global settings and then reads the environment via
//     __kmp_env_initialize(),
//   - allocates the __kmp_threads / __kmp_root arrays,
//   - registers the calling thread as the initial root,
//   - installs atfork / atexit / signal handlers where configured,
//   - finally sets __kmp_init_serial.
// The callers visible in this file invoke it with __kmp_initz_lock held; it
// does not take that lock itself.
static void __kmp_do_serial_initialize(void) {
  int i, gtid;
  size_t size;

  KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n"));

  // Sanity checks on the fixed-width type aliases.
  KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4);
  KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8);
  KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *));

#if OMPT_SUPPORT
  ompt_pre_init();
#endif

  __kmp_validate_locks();

  /* Initialize internal memory allocator */
  __kmp_init_allocator();

  /* Register the library startup via an environment variable and check to see
     whether another copy of the library is already registered. */

  __kmp_register_library_startup();

  /* TODO reinitialization of library */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n"));
  }

  __kmp_global.g.g_abort = 0;
  TCW_SYNC_4(__kmp_global.g.g_done, FALSE);

/* initialize the locks */
#if KMP_USE_ADAPTIVE_LOCKS
#if KMP_DEBUG_ADAPTIVE_LOCKS
  __kmp_init_speculative_stats();
#endif
#endif
#if KMP_STATS_ENABLED
  __kmp_stats_init();
#endif
  __kmp_init_lock(&__kmp_global_lock);
  __kmp_init_queuing_lock(&__kmp_dispatch_lock);
  __kmp_init_lock(&__kmp_debug_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_1i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_2i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_4r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8i);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_8c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_10r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16r);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_16c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_20c);
  __kmp_init_atomic_lock(&__kmp_atomic_lock_32c);
  __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock);
  __kmp_init_bootstrap_lock(&__kmp_exit_lock);
#if KMP_USE_MONITOR
  __kmp_init_bootstrap_lock(&__kmp_monitor_lock);
#endif
  __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock);

  /* conduct initialization and initial setup of configuration */

  __kmp_runtime_initialize();

#if KMP_MIC_SUPPORTED
  __kmp_check_mic_type();
#endif

// Some global variable initialization moved here from kmp_env_initialize()
#ifdef KMP_DEBUG
  kmp_diag = 0;
#endif
  __kmp_abort_delay = 0;

  // From __kmp_init_dflt_team_nth()
  /* assume the entire machine will be used */
  // Clamp the default team size bound to [KMP_MIN_NTH, __kmp_sys_max_nth].
  __kmp_dflt_team_nth_ub = __kmp_xproc;
  if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) {
    __kmp_dflt_team_nth_ub = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth_ub = __kmp_sys_max_nth;
  }
  __kmp_max_nth = __kmp_sys_max_nth;
  __kmp_cg_max_nth = __kmp_sys_max_nth;
  __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default
  if (__kmp_teams_max_nth > __kmp_sys_max_nth) {
    __kmp_teams_max_nth = __kmp_sys_max_nth;
  }

  // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME"
  // part
  __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME;
#if KMP_USE_MONITOR
  __kmp_monitor_wakeups =
      KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
  __kmp_bt_intervals =
      KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups);
#endif
  // From "KMP_LIBRARY" part of __kmp_env_initialize()
  __kmp_library = library_throughput;
  // From KMP_SCHEDULE initialization
  __kmp_static = kmp_sch_static_balanced;
// AC: do not use analytical here, because it is non-monotonous
//__kmp_guided = kmp_sch_guided_iterative_chunked;
//__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no
// need to repeat assignment
// Barrier initialization. Moved here from __kmp_env_initialize() Barrier branch
// bit control and barrier method control parts
#if KMP_FAST_REDUCTION_BARRIER
#define kmp_reduction_barrier_gather_bb ((int)1)
#define kmp_reduction_barrier_release_bb ((int)1)
#define kmp_reduction_barrier_gather_pat bp_hyper_bar
#define kmp_reduction_barrier_release_pat bp_hyper_bar
#endif // KMP_FAST_REDUCTION_BARRIER
  // Every barrier type starts from the global defaults; the reduction barrier
  // is then overridden with the local settings defined just above.
  for (i = bs_plain_barrier; i < bs_last_barrier; i++) {
    __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt;
    __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt;
    __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt;
    __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt;
#if KMP_FAST_REDUCTION_BARRIER
    if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only (
      // lin_64 ): hyper,1
      __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb;
      __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb;
      __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat;
      __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat;
    }
#endif // KMP_FAST_REDUCTION_BARRIER
  }
#if KMP_FAST_REDUCTION_BARRIER
#undef kmp_reduction_barrier_release_pat
#undef kmp_reduction_barrier_gather_pat
#undef kmp_reduction_barrier_release_bb
#undef kmp_reduction_barrier_gather_bb
#endif // KMP_FAST_REDUCTION_BARRIER
#if KMP_MIC_SUPPORTED
  if (__kmp_mic_type == mic2) { // KNC
    // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC
    __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather
    __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] =
        1; // forkjoin release
    __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar;
  }
#if KMP_FAST_REDUCTION_BARRIER
  if (__kmp_mic_type == mic2) { // KNC
    __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
    __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar;
  }
#endif // KMP_FAST_REDUCTION_BARRIER
#endif // KMP_MIC_SUPPORTED

// From KMP_CHECKS initialization
#ifdef KMP_DEBUG
  __kmp_env_checks = TRUE; /* development versions have the extra checks */
#else
  __kmp_env_checks = FALSE; /* port versions do not have the extra checks */
#endif

  // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization
  __kmp_foreign_tp = TRUE;

  __kmp_global.g.g_dynamic = FALSE;
  __kmp_global.g.g_dynamic_mode = dynamic_default;

  // Runs after the defaults above have been assigned.
  __kmp_env_initialize(NULL);

#if KMP_HAVE_MWAIT || KMP_HAVE_UMWAIT
  __kmp_user_level_mwait_init();
#endif
// Print all messages in message catalog for testing purposes.
#ifdef KMP_DEBUG
  char const *val = __kmp_env_get("KMP_DUMP_CATALOG");
  if (__kmp_str_match_true(val)) {
    kmp_str_buf_t buffer;
    __kmp_str_buf_init(&buffer);
    __kmp_i18n_dump_catalog(&buffer);
    __kmp_printf("%s", buffer.str);
    __kmp_str_buf_free(&buffer);
  }
  __kmp_env_free(&val);
#endif

  __kmp_threads_capacity =
      __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub);
  // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part
  __kmp_tp_capacity = __kmp_default_tp_capacity(
      __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified);

  // If the library is shut down properly, both pools must be NULL. Just in
  // case, set them to NULL -- some memory may leak, but subsequent code will
  // work even if pools are not freed.
  KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL);
  KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL);
  KMP_DEBUG_ASSERT(__kmp_team_pool == NULL);
  __kmp_thread_pool = NULL;
  __kmp_thread_pool_insert_pt = NULL;
  __kmp_team_pool = NULL;

  /* Allocate all of the variable sized records */
  /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are
   * expandable */
  /* Since allocation is cache-aligned, just add extra padding at the end */
  // One allocation holds both arrays: __kmp_threads first, then __kmp_root
  // placed immediately after it.
  size =
      (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity +
      CACHE_LINE;
  __kmp_threads = (kmp_info_t **)__kmp_allocate(size);
  __kmp_root = (kmp_root_t **)((char *)__kmp_threads +
                               sizeof(kmp_info_t *) * __kmp_threads_capacity);

  /* init thread counts */
  KMP_DEBUG_ASSERT(__kmp_all_nth ==
                   0); // Asserts fail if the library is reinitializing and
  KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination.
  __kmp_all_nth = 0;
  __kmp_nth = 0;

  /* setup the uber master thread and hierarchy */
  gtid = __kmp_register_root(TRUE);
  KA_TRACE(10, ("__kmp_do_serial_initialize  T#%d\n", gtid));
  KMP_ASSERT(KMP_UBER_GTID(gtid));
  KMP_ASSERT(KMP_INITIAL_GTID(gtid));

  KMP_MB(); /* Flush all pending memory write invalidates.  */

  __kmp_common_initialize();

#if KMP_OS_UNIX
  /* invoke the child fork handler */
  __kmp_register_atfork();
#endif

#if !KMP_DYNAMIC_LIB
  {
    /* Invoke the exit handler when the program finishes, only for static
       library. For dynamic library, we already have _fini and DllMain. */
    int rc = atexit(__kmp_internal_end_atexit);
    if (rc != 0) {
      __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc),
                  __kmp_msg_null);
    }
  }
#endif

#if KMP_HANDLE_SIGNALS
#if KMP_OS_UNIX
  /* NOTE: make sure that this is called before the user installs their own
     signal handlers so that the user handlers are called first. this way they
     can return false, not call our handler, avoid terminating the library, and
     continue execution where they left off. */
  __kmp_install_signals(FALSE);
#endif /* KMP_OS_UNIX */
#if KMP_OS_WINDOWS
  __kmp_install_signals(TRUE);
#endif /* KMP_OS_WINDOWS */
#endif

  /* we have finished the serial initialization */
  __kmp_init_counter++;

  __kmp_init_serial = TRUE;

  if (__kmp_settings) {
    __kmp_env_print();
  }

  if (__kmp_display_env || __kmp_display_env_verbose) {
    __kmp_env_print_2();
  }

#if OMPT_SUPPORT
  ompt_post_init();
#endif

  KMP_MB();

  KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n"));
}

// Public entry point for serial initialization. __kmp_init_serial is tested
// once without the lock and once more after acquiring __kmp_initz_lock, so
// __kmp_do_serial_initialize() is skipped when initialization is already done.
void __kmp_serial_initialize(void) {
  if (__kmp_init_serial) {
    return;
  }
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  __kmp_do_serial_initialize();
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Body of the middle (second-stage) initialization: initializes affinity
// (when supported), settles the default team size __kmp_dflt_team_nth and
// propagates it to already registered threads, then sets __kmp_init_middle.
// Runs serial initialization first if that has not happened yet. The callers
// visible in this file invoke it with __kmp_initz_lock held.
static void __kmp_do_middle_initialize(void) {
  int i, j;
  int prev_dflt_team_nth;

  if (!__kmp_init_serial) {
    __kmp_do_serial_initialize();
  }

  KA_TRACE(10, ("__kmp_middle_initialize: enter\n"));

  // Save the previous value for the __kmp_dflt_team_nth so that
  // we can avoid some reinitialization if it hasn't changed.
  prev_dflt_team_nth = __kmp_dflt_team_nth;

#if KMP_AFFINITY_SUPPORTED
  // __kmp_affinity_initialize() will try to set __kmp_ncores to the
  // number of cores on the machine.
  __kmp_affinity_initialize();

  // Run through the __kmp_threads array and set the affinity mask
  // for each root thread that is currently registered with the RTL.
  for (i = 0; i < __kmp_threads_capacity; i++) {
    if (TCR_PTR(__kmp_threads[i]) != NULL) {
      __kmp_affinity_set_init_mask(i, TRUE);
    }
  }
#endif /* KMP_AFFINITY_SUPPORTED */

  KMP_ASSERT(__kmp_xproc > 0);
  if (__kmp_avail_proc == 0) {
    __kmp_avail_proc = __kmp_xproc;
  }

  // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3),
  // correct them now
  j = 0;
  while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) {
    __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub =
        __kmp_avail_proc;
    j++;
  }

  if (__kmp_dflt_team_nth == 0) {
#ifdef KMP_DFLT_NTH_CORES
    // Default #threads = #cores
    __kmp_dflt_team_nth = __kmp_ncores;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_ncores (%d)\n",
                  __kmp_dflt_team_nth));
#else
    // Default #threads = #available OS procs
    __kmp_dflt_team_nth = __kmp_avail_proc;
    KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = "
                  "__kmp_avail_proc(%d)\n",
                  __kmp_dflt_team_nth));
#endif /* KMP_DFLT_NTH_CORES */
  }

  // Clamp the default team size to [KMP_MIN_NTH, __kmp_sys_max_nth].
  if (__kmp_dflt_team_nth < KMP_MIN_NTH) {
    __kmp_dflt_team_nth = KMP_MIN_NTH;
  }
  if (__kmp_dflt_team_nth > __kmp_sys_max_nth) {
    __kmp_dflt_team_nth = __kmp_sys_max_nth;
  }

  // There's no harm in continuing if the following check fails,
  // but it indicates an error in the previous logic.
  KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub);

  if (__kmp_dflt_team_nth != prev_dflt_team_nth) {
    // Run through the __kmp_threads array and set the num threads icv for each
    // root thread that is currently registered with the RTL (which has not
    // already explicitly set its nthreads-var with a call to
    // omp_set_num_threads()).
    for (i = 0; i < __kmp_threads_capacity; i++) {
      kmp_info_t *thread = __kmp_threads[i];
      if (thread == NULL)
        continue;
      if (thread->th.th_current_task->td_icvs.nproc != 0)
        continue;

      set__nproc(__kmp_threads[i], __kmp_dflt_team_nth);
    }
  }
  KA_TRACE(
      20,
      ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n",
       __kmp_dflt_team_nth));

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime to zero if necessary  now that __kmp_avail_proc is set */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth > __kmp_avail_proc) {
      __kmp_zero_bt = TRUE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  /* we have finished middle initialization */
  TCW_SYNC_4(__kmp_init_middle, TRUE);

  KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n"));
}

// Public entry point for middle initialization. Same structure as
// __kmp_serial_initialize(): __kmp_init_middle is tested before and after
// acquiring __kmp_initz_lock.
void __kmp_middle_initialize(void) {
  if (__kmp_init_middle) {
    return;
  }
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_middle) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }
  __kmp_do_middle_initialize();
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Third-stage initialization, required before the first parallel region:
// completes middle initialization if needed, saves the x87/MXCSR control
// state on x86, installs signal handlers on unix, resolves the default
// dynamic-adjustment mode, and sets __kmp_init_parallel. Takes
// __kmp_initz_lock for the duration.
void __kmp_parallel_initialize(void) {
  int gtid = __kmp_entry_gtid(); // this might be a new root

  /* synchronize parallel initialization (for sibling) */
  if (TCR_4(__kmp_init_parallel))
    return;
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_parallel)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  /* TODO reinitialization after we have already shut down */
  if (TCR_4(__kmp_global.g.g_done)) {
    KA_TRACE(
        10,
        ("__kmp_parallel_initialize: attempt to init while shutting down\n"));
    __kmp_infinite_loop();
  }

  /* jc: The lock __kmp_initz_lock is already held, so calling
     __kmp_middle_initialize (which acquires the same lock) would cause a
     deadlock. So we call __kmp_do_middle_initialize directly; it in turn runs
     __kmp_do_serial_initialize if serial initialization is still pending. */
  if (!__kmp_init_middle) {
    __kmp_do_middle_initialize();
  }
  __kmp_resume_if_hard_paused();

  /* begin initialization */
  KA_TRACE(10, ("__kmp_parallel_initialize: enter\n"));
  KMP_ASSERT(KMP_UBER_GTID(gtid));

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  // Save the FP control regs.
  // Worker threads will set theirs to these values at thread startup.
  __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word);
  __kmp_store_mxcsr(&__kmp_init_mxcsr);
  __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK;
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

#if KMP_OS_UNIX
#if KMP_HANDLE_SIGNALS
  /*  must be after __kmp_serial_initialize  */
  __kmp_install_signals(TRUE);
#endif
#endif

  __kmp_suspend_initialize();

  // Replace the placeholder dynamic mode with a concrete one; which one
  // depends on whether load balancing is compiled in.
#if defined(USE_LOAD_BALANCE)
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_load_balance;
  }
#else
  if (__kmp_global.g.g_dynamic_mode == dynamic_default) {
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
  }
#endif

  if (__kmp_version) {
    __kmp_print_version_2();
  }

  /* we have finished parallel initialization */
  TCW_SYNC_4(__kmp_init_parallel, TRUE);

  KMP_MB();
  KA_TRACE(10, ("__kmp_parallel_initialize: exit\n"));

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

// Initializes the hidden helper team/threads once. Ensures parallel
// initialization has run, then, under __kmp_initz_lock, starts the hidden
// helper threads, waits for them to finish initializing and sets
// __kmp_init_hidden_helper.
void __kmp_hidden_helper_initialize() {
  if (TCR_4(__kmp_init_hidden_helper))
    return;

  // __kmp_parallel_initialize is required before we initialize hidden helper
  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  // Double check. Note that this double check should not be placed before
  // __kmp_parallel_initialize as it will cause dead lock.
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (TCR_4(__kmp_init_hidden_helper)) {
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    return;
  }

  // Set the count of hidden helper tasks to be executed to zero
  KMP_ATOMIC_ST_REL(&__kmp_unexecuted_hidden_helper_tasks, 0);

  // Set the global variable indicating that we're initializing hidden helper
  // team/threads
  TCW_SYNC_4(__kmp_init_hidden_helper_threads, TRUE);

  // Platform independent initialization
  __kmp_do_initialize_hidden_helper_threads();

  // Wait here for the finish of initialization of hidden helper teams
  __kmp_hidden_helper_threads_initz_wait();

  // We have finished hidden helper initialization
  TCW_SYNC_4(__kmp_init_hidden_helper, TRUE);

  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}

/* ------------------------------------------------------------------------ */

// Per-thread preparation done right before a thread runs its microtask:
// resets the construct counter and the dispatch buffer indices and, when
// consistency checking is on, pushes a "parallel" record for team->t.t_ident.
// The tid argument is not referenced in this body.
void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
                                   kmp_team_t *team) {
  kmp_disp_t *dispatch;

  KMP_MB();

  /* none of the threads have encountered any constructs, yet. */
  this_thr->th.th_local.this_construct = 0;
#if KMP_CACHE_MANAGE
  KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived);
#endif /* KMP_CACHE_MANAGE */
  dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch);
  KMP_DEBUG_ASSERT(dispatch);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[
  // this_thr->th.th_info.ds.ds_tid ] );

  dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */
  dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter
  if (__kmp_env_consistency_check)
    __kmp_push_parallel(gtid, team->t.t_ident);

  KMP_MB(); /* Flush all pending memory write invalidates.  */
}

// Counterpart of __kmp_run_before_invoked_task(): pops the consistency-check
// "parallel" record (if checking is on) and finishes the implicit task.
// The tid argument is not referenced in this body.
void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr,
                                  kmp_team_t *team) {
  if (__kmp_env_consistency_check)
    __kmp_pop_parallel(gtid, team->t.t_ident);

  __kmp_finish_implicit_task(this_thr);
}

// Runs the current team's microtask (team->t.t_pkfn) on the thread identified
// by gtid, wrapped in the before/after hooks above plus the optional ITT,
// SSC, OMPT and stats instrumentation. Returns the value produced by
// __kmp_invoke_microtask().
int __kmp_invoke_task_func(int gtid) {
  int rc;
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;

  __kmp_run_before_invoked_task(gtid, tid, this_thr, team);
#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_enter(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about entering user's code
  }
#endif /* USE_ITT_BUILD */
#if INCLUDE_SSC_MARKS
  SSC_MARK_INVOKING();
#endif

#if OMPT_SUPPORT
  void *dummy;
  void **exit_frame_p;
  ompt_data_t *my_task_data;
  ompt_data_t *my_parallel_data;
  int ompt_team_size;

  // With OMPT disabled the exit-frame slot points at a local dummy so the
  // store after the microtask returns is harmless.
  if (ompt_enabled.enabled) {
    exit_frame_p = &(
        team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr);
  } else {
    exit_frame_p = &dummy;
  }

  my_task_data =
      &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data);
  my_parallel_data = &(team->t.ompt_team_info.parallel_data);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_team_size = team->t.t_nproc;
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size,
        __kmp_tid_from_gtid(gtid), ompt_task_implicit);
    OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid);
  }
#endif

#if KMP_STATS_ENABLED
  stats_state_e previous_state = KMP_GET_THREAD_STATE();
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_PUSH_PARTITIONED_TIMER(OMP_teams);
  } else {
    KMP_PUSH_PARTITIONED_TIMER(OMP_parallel);
  }
  KMP_SET_THREAD_STATE(IMPLICIT_TASK);
#endif

  rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid,
                              tid, (int)team->t.t_argc, (void **)team->t.t_argv
#if OMPT_SUPPORT
                              ,
                              exit_frame_p
#endif
                              );
#if OMPT_SUPPORT
  *exit_frame_p = NULL;
  this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team;
#endif

#if KMP_STATS_ENABLED
  if (previous_state == stats_state_e::TEAMS_REGION) {
    KMP_SET_THREAD_STATE(previous_state);
  }
  KMP_POP_PARTITIONED_TIMER();
#endif

#if USE_ITT_BUILD
  if (__itt_stack_caller_create_ptr) {
    __kmp_itt_stack_callee_leave(
        (__itt_caller)
            team->t.t_stack_id); // inform ittnotify about leaving user's code
  }
#endif /* USE_ITT_BUILD */
  __kmp_run_after_invoked_task(gtid, tid, this_thr, team);

  return rc;
}

// Executed by each master thread of a teams construct: makes the thread a new
// contention-group (CG) root, then forks and immediately joins a team that
// runs the stored teams microtask (th_teams_microtask).
void __kmp_teams_master(int gtid) {
  // This routine is called by all master threads in teams construct
  kmp_info_t *thr = __kmp_threads[gtid];
  kmp_team_t *team = thr->th.th_team;
  ident_t *loc = team->t.t_ident;
  thr->th.th_set_nproc = thr->th.th_teams_size.nth;
  KMP_DEBUG_ASSERT(thr->th.th_teams_microtask);
  KMP_DEBUG_ASSERT(thr->th.th_set_nproc);
  KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid,
                __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask));

  // This thread is a new CG root.  Set up the proper variables.
  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = thr; // Make thr the CG root
  // Init to thread limit that was stored when league masters were forked
  tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit;
  tmp->cg_nthreads = 1; // Init counter to one active thread, this one
  KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init"
                 " cg_nthreads to 1\n",
                 thr, tmp));
  // Push the new node on top of this thread's list of CG roots.
  tmp->up = thr->th.th_cg_roots;
  thr->th.th_cg_roots = tmp;

// Launch league of teams now, but not let workers execute
// (they hang on fork barrier until next parallel)
#if INCLUDE_SSC_MARKS
  SSC_MARK_FORKING();
#endif
  __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc,
                  (microtask_t)thr->th.th_teams_microtask, // "wrapped" task
                  VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL);
#if INCLUDE_SSC_MARKS
  SSC_MARK_JOINING();
#endif
  // If the team size was reduced from the limit, set it to the new size
  if (thr->th.th_team_nproc < thr->th.th_teams_size.nth)
    thr->th.th_teams_size.nth = thr->th.th_team_nproc;
  // AC: last parameter "1" eliminates join barrier which won't work because
  // worker threads are in a fork barrier waiting for more parallel regions
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  ,
                  1);
}

int __kmp_invoke_teams_master(int gtid) {
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
#if KMP_DEBUG
  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7364 (void *)__kmp_teams_master); 7365 #endif 7366 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7367 #if OMPT_SUPPORT 7368 int tid = __kmp_tid_from_gtid(gtid); 7369 ompt_data_t *task_data = 7370 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7371 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7372 if (ompt_enabled.ompt_callback_implicit_task) { 7373 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7374 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7375 ompt_task_initial); 7376 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7377 } 7378 #endif 7379 __kmp_teams_master(gtid); 7380 #if OMPT_SUPPORT 7381 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7382 #endif 7383 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7384 return 1; 7385 } 7386 7387 /* this sets the requested number of threads for the next parallel region 7388 encountered by this team. since this should be enclosed in the forkjoin 7389 critical section it should avoid race conditions with asymmetrical nested 7390 parallelism */ 7391 7392 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7393 kmp_info_t *thr = __kmp_threads[gtid]; 7394 7395 if (num_threads > 0) 7396 thr->th.th_set_nproc = num_threads; 7397 } 7398 7399 /* this sets the requested number of teams for the teams region and/or 7400 the number of threads for the next parallel region encountered */ 7401 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7402 int num_threads) { 7403 kmp_info_t *thr = __kmp_threads[gtid]; 7404 KMP_DEBUG_ASSERT(num_teams >= 0); 7405 KMP_DEBUG_ASSERT(num_threads >= 0); 7406 7407 if (num_teams == 0) 7408 num_teams = 1; // default number of teams is 1. 7409 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 
7410 if (!__kmp_reserve_warn) { 7411 __kmp_reserve_warn = 1; 7412 __kmp_msg(kmp_ms_warning, 7413 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7414 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7415 } 7416 num_teams = __kmp_teams_max_nth; 7417 } 7418 // Set number of teams (number of threads in the outer "parallel" of the 7419 // teams) 7420 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7421 7422 // Remember the number of threads for inner parallel regions 7423 if (!TCR_4(__kmp_init_middle)) 7424 __kmp_middle_initialize(); // get internal globals calculated 7425 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7426 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7427 if (num_threads == 0) { 7428 num_threads = __kmp_avail_proc / num_teams; 7429 // adjust num_threads w/o warning as it is not user setting 7430 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7431 // no thread_limit clause specified - do not change thread-limit-var ICV 7432 if (num_threads > __kmp_dflt_team_nth) { 7433 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7434 } 7435 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7436 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7437 } // prevent team size to exceed thread-limit-var 7438 if (num_teams * num_threads > __kmp_teams_max_nth) { 7439 num_threads = __kmp_teams_max_nth / num_teams; 7440 } 7441 } else { 7442 // This thread will be the master of the league masters 7443 // Store new thread limit; old limit is saved in th_cg_roots list 7444 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7445 // num_threads = min(num_threads, nthreads-var) 7446 if (num_threads > __kmp_dflt_team_nth) { 7447 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7448 } 7449 if (num_teams * num_threads > __kmp_teams_max_nth) { 7450 int new_threads = __kmp_teams_max_nth / num_teams; 7451 if (!__kmp_reserve_warn) { // user asked for too many threads 7452 __kmp_reserve_warn = 1; // 
conflicts with KMP_TEAMS_THREAD_LIMIT 7453 __kmp_msg(kmp_ms_warning, 7454 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7455 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7456 } 7457 num_threads = new_threads; 7458 } 7459 } 7460 thr->th.th_teams_size.nth = num_threads; 7461 } 7462 7463 // Set the proc_bind var to use in the following parallel region. 7464 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7465 kmp_info_t *thr = __kmp_threads[gtid]; 7466 thr->th.th_set_proc_bind = proc_bind; 7467 } 7468 7469 /* Launch the worker threads into the microtask. */ 7470 7471 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7472 kmp_info_t *this_thr = __kmp_threads[gtid]; 7473 7474 #ifdef KMP_DEBUG 7475 int f; 7476 #endif /* KMP_DEBUG */ 7477 7478 KMP_DEBUG_ASSERT(team); 7479 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7480 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7481 KMP_MB(); /* Flush all pending memory write invalidates. */ 7482 7483 team->t.t_construct = 0; /* no single directives seen yet */ 7484 team->t.t_ordered.dt.t_value = 7485 0; /* thread 0 enters the ordered section first */ 7486 7487 /* Reset the identifiers on the dispatch buffer */ 7488 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7489 if (team->t.t_max_nproc > 1) { 7490 int i; 7491 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7492 team->t.t_disp_buffer[i].buffer_index = i; 7493 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7494 } 7495 } else { 7496 team->t.t_disp_buffer[0].buffer_index = 0; 7497 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7498 } 7499 7500 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7501 KMP_ASSERT(this_thr->th.th_team == team); 7502 7503 #ifdef KMP_DEBUG 7504 for (f = 0; f < team->t.t_nproc; f++) { 7505 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7506 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7507 } 7508 #endif /* KMP_DEBUG */ 7509 7510 /* release the worker threads so they may begin working */ 7511 __kmp_fork_barrier(gtid, 0); 7512 } 7513 7514 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7515 kmp_info_t *this_thr = __kmp_threads[gtid]; 7516 7517 KMP_DEBUG_ASSERT(team); 7518 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7519 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7520 KMP_MB(); /* Flush all pending memory write invalidates. */ 7521 7522 /* Join barrier after fork */ 7523 7524 #ifdef KMP_DEBUG 7525 if (__kmp_threads[gtid] && 7526 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7527 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7528 __kmp_threads[gtid]); 7529 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7530 "team->t.t_nproc=%d\n", 7531 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7532 team->t.t_nproc); 7533 __kmp_print_structure(); 7534 } 7535 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7536 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7537 #endif /* KMP_DEBUG */ 7538 7539 __kmp_join_barrier(gtid); /* wait for everyone */ 7540 #if OMPT_SUPPORT 7541 if (ompt_enabled.enabled && 7542 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7543 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7544 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7545 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7546 #if OMPT_OPTIONAL 7547 void *codeptr = NULL; 7548 if (KMP_MASTER_TID(ds_tid) && 7549 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7550 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7551 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7552 7553 if 
(ompt_enabled.ompt_callback_sync_region_wait) { 7554 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7555 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7556 codeptr); 7557 } 7558 if (ompt_enabled.ompt_callback_sync_region) { 7559 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7560 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7561 codeptr); 7562 } 7563 #endif 7564 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7565 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7566 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7567 } 7568 } 7569 #endif 7570 7571 KMP_MB(); /* Flush all pending memory write invalidates. */ 7572 KMP_ASSERT(this_thr->th.th_team == team); 7573 } 7574 7575 /* ------------------------------------------------------------------------ */ 7576 7577 #ifdef USE_LOAD_BALANCE 7578 7579 // Return the worker threads actively spinning in the hot team, if we 7580 // are at the outermost level of parallelism. Otherwise, return 0. 7581 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7582 int i; 7583 int retval; 7584 kmp_team_t *hot_team; 7585 7586 if (root->r.r_active) { 7587 return 0; 7588 } 7589 hot_team = root->r.r_hot_team; 7590 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7591 return hot_team->t.t_nproc - 1; // Don't count master thread 7592 } 7593 7594 // Skip the master thread - it is accounted for elsewhere. 7595 retval = 0; 7596 for (i = 1; i < hot_team->t.t_nproc; i++) { 7597 if (hot_team->t.t_threads[i]->th.th_active) { 7598 retval++; 7599 } 7600 } 7601 return retval; 7602 } 7603 7604 // Perform an automatic adjustment to the number of 7605 // threads used by the next parallel region. 
7606 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7607 int retval; 7608 int pool_active; 7609 int hot_team_active; 7610 int team_curr_active; 7611 int system_active; 7612 7613 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7614 set_nproc)); 7615 KMP_DEBUG_ASSERT(root); 7616 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7617 ->th.th_current_task->td_icvs.dynamic == TRUE); 7618 KMP_DEBUG_ASSERT(set_nproc > 1); 7619 7620 if (set_nproc == 1) { 7621 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7622 return 1; 7623 } 7624 7625 // Threads that are active in the thread pool, active in the hot team for this 7626 // particular root (if we are at the outer par level), and the currently 7627 // executing thread (to become the master) are available to add to the new 7628 // team, but are currently contributing to the system load, and must be 7629 // accounted for. 7630 pool_active = __kmp_thread_pool_active_nth; 7631 hot_team_active = __kmp_active_hot_team_nproc(root); 7632 team_curr_active = pool_active + hot_team_active + 1; 7633 7634 // Check the system load. 7635 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7636 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7637 "hot team active = %d\n", 7638 system_active, pool_active, hot_team_active)); 7639 7640 if (system_active < 0) { 7641 // There was an error reading the necessary info from /proc, so use the 7642 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7643 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7644 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7645 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7646 7647 // Make this call behave like the thread limit algorithm. 7648 retval = __kmp_avail_proc - __kmp_nth + 7649 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 7650 if (retval > set_nproc) { 7651 retval = set_nproc; 7652 } 7653 if (retval < KMP_MIN_NTH) { 7654 retval = KMP_MIN_NTH; 7655 } 7656 7657 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7658 retval)); 7659 return retval; 7660 } 7661 7662 // There is a slight delay in the load balance algorithm in detecting new 7663 // running procs. The real system load at this instant should be at least as 7664 // large as the #active omp thread that are available to add to the team. 7665 if (system_active < team_curr_active) { 7666 system_active = team_curr_active; 7667 } 7668 retval = __kmp_avail_proc - system_active + team_curr_active; 7669 if (retval > set_nproc) { 7670 retval = set_nproc; 7671 } 7672 if (retval < KMP_MIN_NTH) { 7673 retval = KMP_MIN_NTH; 7674 } 7675 7676 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7677 return retval; 7678 } // __kmp_load_balance_nproc() 7679 7680 #endif /* USE_LOAD_BALANCE */ 7681 7682 /* ------------------------------------------------------------------------ */ 7683 7684 /* NOTE: this is called with the __kmp_init_lock held */ 7685 void __kmp_cleanup(void) { 7686 int f; 7687 7688 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7689 7690 if (TCR_4(__kmp_init_parallel)) { 7691 #if KMP_HANDLE_SIGNALS 7692 __kmp_remove_signals(); 7693 #endif 7694 TCW_4(__kmp_init_parallel, FALSE); 7695 } 7696 7697 if (TCR_4(__kmp_init_middle)) { 7698 #if KMP_AFFINITY_SUPPORTED 7699 __kmp_affinity_uninitialize(); 7700 #endif /* KMP_AFFINITY_SUPPORTED */ 7701 __kmp_cleanup_hierarchy(); 7702 TCW_4(__kmp_init_middle, FALSE); 7703 } 7704 7705 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7706 7707 if (__kmp_init_serial) { 7708 __kmp_runtime_destroy(); 7709 __kmp_init_serial = FALSE; 7710 } 7711 7712 __kmp_cleanup_threadprivate_caches(); 7713 7714 for (f = 0; f < __kmp_threads_capacity; f++) { 7715 if (__kmp_root[f] != NULL) { 7716 __kmp_free(__kmp_root[f]); 7717 
__kmp_root[f] = NULL; 7718 } 7719 } 7720 __kmp_free(__kmp_threads); 7721 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7722 // there is no need in freeing __kmp_root. 7723 __kmp_threads = NULL; 7724 __kmp_root = NULL; 7725 __kmp_threads_capacity = 0; 7726 7727 #if KMP_USE_DYNAMIC_LOCK 7728 __kmp_cleanup_indirect_user_locks(); 7729 #else 7730 __kmp_cleanup_user_locks(); 7731 #endif 7732 7733 #if KMP_AFFINITY_SUPPORTED 7734 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7735 __kmp_cpuinfo_file = NULL; 7736 #endif /* KMP_AFFINITY_SUPPORTED */ 7737 7738 #if KMP_USE_ADAPTIVE_LOCKS 7739 #if KMP_DEBUG_ADAPTIVE_LOCKS 7740 __kmp_print_speculative_stats(); 7741 #endif 7742 #endif 7743 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7744 __kmp_nested_nth.nth = NULL; 7745 __kmp_nested_nth.size = 0; 7746 __kmp_nested_nth.used = 0; 7747 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7748 __kmp_nested_proc_bind.bind_types = NULL; 7749 __kmp_nested_proc_bind.size = 0; 7750 __kmp_nested_proc_bind.used = 0; 7751 if (__kmp_affinity_format) { 7752 KMP_INTERNAL_FREE(__kmp_affinity_format); 7753 __kmp_affinity_format = NULL; 7754 } 7755 7756 __kmp_i18n_catclose(); 7757 7758 #if KMP_USE_HIER_SCHED 7759 __kmp_hier_scheds.deallocate(); 7760 #endif 7761 7762 #if KMP_STATS_ENABLED 7763 __kmp_stats_fini(); 7764 #endif 7765 7766 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7767 } 7768 7769 /* ------------------------------------------------------------------------ */ 7770 7771 int __kmp_ignore_mppbeg(void) { 7772 char *env; 7773 7774 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7775 if (__kmp_str_match_false(env)) 7776 return FALSE; 7777 } 7778 // By default __kmpc_begin() is no-op. 7779 return TRUE; 7780 } 7781 7782 int __kmp_ignore_mppend(void) { 7783 char *env; 7784 7785 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7786 if (__kmp_str_match_false(env)) 7787 return FALSE; 7788 } 7789 // By default __kmpc_end() is no-op. 
7790 return TRUE; 7791 } 7792 7793 void __kmp_internal_begin(void) { 7794 int gtid; 7795 kmp_root_t *root; 7796 7797 /* this is a very important step as it will register new sibling threads 7798 and assign these new uber threads a new gtid */ 7799 gtid = __kmp_entry_gtid(); 7800 root = __kmp_threads[gtid]->th.th_root; 7801 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7802 7803 if (root->r.r_begin) 7804 return; 7805 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7806 if (root->r.r_begin) { 7807 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7808 return; 7809 } 7810 7811 root->r.r_begin = TRUE; 7812 7813 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7814 } 7815 7816 /* ------------------------------------------------------------------------ */ 7817 7818 void __kmp_user_set_library(enum library_type arg) { 7819 int gtid; 7820 kmp_root_t *root; 7821 kmp_info_t *thread; 7822 7823 /* first, make sure we are initialized so we can get our gtid */ 7824 7825 gtid = __kmp_entry_gtid(); 7826 thread = __kmp_threads[gtid]; 7827 7828 root = thread->th.th_root; 7829 7830 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7831 library_serial)); 7832 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7833 thread */ 7834 KMP_WARNING(SetLibraryIncorrectCall); 7835 return; 7836 } 7837 7838 switch (arg) { 7839 case library_serial: 7840 thread->th.th_set_nproc = 0; 7841 set__nproc(thread, 1); 7842 break; 7843 case library_turnaround: 7844 thread->th.th_set_nproc = 0; 7845 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7846 : __kmp_dflt_team_nth_ub); 7847 break; 7848 case library_throughput: 7849 thread->th.th_set_nproc = 0; 7850 set__nproc(thread, __kmp_dflt_team_nth ? 
__kmp_dflt_team_nth 7851 : __kmp_dflt_team_nth_ub); 7852 break; 7853 default: 7854 KMP_FATAL(UnknownLibraryType, arg); 7855 } 7856 7857 __kmp_aux_set_library(arg); 7858 } 7859 7860 void __kmp_aux_set_stacksize(size_t arg) { 7861 if (!__kmp_init_serial) 7862 __kmp_serial_initialize(); 7863 7864 #if KMP_OS_DARWIN 7865 if (arg & (0x1000 - 1)) { 7866 arg &= ~(0x1000 - 1); 7867 if (arg + 0x1000) /* check for overflow if we round up */ 7868 arg += 0x1000; 7869 } 7870 #endif 7871 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7872 7873 /* only change the default stacksize before the first parallel region */ 7874 if (!TCR_4(__kmp_init_parallel)) { 7875 size_t value = arg; /* argument is in bytes */ 7876 7877 if (value < __kmp_sys_min_stksize) 7878 value = __kmp_sys_min_stksize; 7879 else if (value > KMP_MAX_STKSIZE) 7880 value = KMP_MAX_STKSIZE; 7881 7882 __kmp_stksize = value; 7883 7884 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7885 } 7886 7887 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7888 } 7889 7890 /* set the behaviour of the runtime library */ 7891 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7892 void __kmp_aux_set_library(enum library_type arg) { 7893 __kmp_library = arg; 7894 7895 switch (__kmp_library) { 7896 case library_serial: { 7897 KMP_INFORM(LibraryIsSerial); 7898 } break; 7899 case library_turnaround: 7900 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 7901 __kmp_use_yield = 2; // only yield when oversubscribed 7902 break; 7903 case library_throughput: 7904 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 7905 __kmp_dflt_blocktime = 200; 7906 break; 7907 default: 7908 KMP_FATAL(UnknownLibraryType, arg); 7909 } 7910 } 7911 7912 /* Getting team information common for all team API */ 7913 // Returns NULL if not in teams construct 7914 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 7915 kmp_info_t *thr = __kmp_entry_thread(); 7916 teams_serialized = 0; 7917 if (thr->th.th_teams_microtask) { 7918 kmp_team_t *team = thr->th.th_team; 7919 int tlevel = thr->th.th_teams_level; // the level of the teams construct 7920 int ii = team->t.t_level; 7921 teams_serialized = team->t.t_serialized; 7922 int level = tlevel + 1; 7923 KMP_DEBUG_ASSERT(ii >= tlevel); 7924 while (ii > level) { 7925 for (teams_serialized = team->t.t_serialized; 7926 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 7927 } 7928 if (team->t.t_serialized && (!teams_serialized)) { 7929 team = team->t.t_parent; 7930 continue; 7931 } 7932 if (ii > level) { 7933 team = team->t.t_parent; 7934 ii--; 7935 } 7936 } 7937 return team; 7938 } 7939 return NULL; 7940 } 7941 7942 int __kmp_aux_get_team_num() { 7943 int serialized; 7944 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7945 if (team) { 7946 if (serialized > 1) { 7947 return 0; // teams region is serialized ( 1 team of 1 thread ). 
7948 } else { 7949 return team->t.t_master_tid; 7950 } 7951 } 7952 return 0; 7953 } 7954 7955 int __kmp_aux_get_num_teams() { 7956 int serialized; 7957 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7958 if (team) { 7959 if (serialized > 1) { 7960 return 1; 7961 } else { 7962 return team->t.t_parent->t.t_nproc; 7963 } 7964 } 7965 return 1; 7966 } 7967 7968 /* ------------------------------------------------------------------------ */ 7969 7970 /* 7971 * Affinity Format Parser 7972 * 7973 * Field is in form of: %[[[0].]size]type 7974 * % and type are required (%% means print a literal '%') 7975 * type is either single char or long name surrounded by {}, 7976 * e.g., N or {num_threads} 7977 * 0 => leading zeros 7978 * . => right justified when size is specified 7979 * by default output is left justified 7980 * size is the *minimum* field length 7981 * All other characters are printed as is 7982 * 7983 * Available field types: 7984 * L {thread_level} - omp_get_level() 7985 * n {thread_num} - omp_get_thread_num() 7986 * h {host} - name of host machine 7987 * P {process_id} - process id (integer) 7988 * T {thread_identifier} - native thread identifier (integer) 7989 * N {num_threads} - omp_get_num_threads() 7990 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 7991 * a {thread_affinity} - comma separated list of integers or integer ranges 7992 * (values of affinity mask) 7993 * 7994 * Implementation-specific field types can be added 7995 * If a type is unknown, print "undefined" 7996 */ 7997 7998 // Structure holding the short name, long name, and corresponding data type 7999 // for snprintf. A table of these will represent the entire valid keyword 8000 // field types. 
8001 typedef struct kmp_affinity_format_field_t { 8002 char short_name; // from spec e.g., L -> thread level 8003 const char *long_name; // from spec thread_level -> thread level 8004 char field_format; // data type for snprintf (typically 'd' or 's' 8005 // for integer or string) 8006 } kmp_affinity_format_field_t; 8007 8008 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 8009 #if KMP_AFFINITY_SUPPORTED 8010 {'A', "thread_affinity", 's'}, 8011 #endif 8012 {'t', "team_num", 'd'}, 8013 {'T', "num_teams", 'd'}, 8014 {'L', "nesting_level", 'd'}, 8015 {'n', "thread_num", 'd'}, 8016 {'N', "num_threads", 'd'}, 8017 {'a', "ancestor_tnum", 'd'}, 8018 {'H', "host", 's'}, 8019 {'P', "process_id", 'd'}, 8020 {'i', "native_thread_id", 'd'}}; 8021 8022 // Return the number of characters it takes to hold field 8023 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 8024 const char **ptr, 8025 kmp_str_buf_t *field_buffer) { 8026 int rc, format_index, field_value; 8027 const char *width_left, *width_right; 8028 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 8029 static const int FORMAT_SIZE = 20; 8030 char format[FORMAT_SIZE] = {0}; 8031 char absolute_short_name = 0; 8032 8033 KMP_DEBUG_ASSERT(gtid >= 0); 8034 KMP_DEBUG_ASSERT(th); 8035 KMP_DEBUG_ASSERT(**ptr == '%'); 8036 KMP_DEBUG_ASSERT(field_buffer); 8037 8038 __kmp_str_buf_clear(field_buffer); 8039 8040 // Skip the initial % 8041 (*ptr)++; 8042 8043 // Check for %% first 8044 if (**ptr == '%') { 8045 __kmp_str_buf_cat(field_buffer, "%", 1); 8046 (*ptr)++; // skip over the second % 8047 return 1; 8048 } 8049 8050 // Parse field modifiers if they are present 8051 pad_zeros = false; 8052 if (**ptr == '0') { 8053 pad_zeros = true; 8054 (*ptr)++; // skip over 0 8055 } 8056 right_justify = false; 8057 if (**ptr == '.') { 8058 right_justify = true; 8059 (*ptr)++; // skip over . 
8060 } 8061 // Parse width of field: [width_left, width_right) 8062 width_left = width_right = NULL; 8063 if (**ptr >= '0' && **ptr <= '9') { 8064 width_left = *ptr; 8065 SKIP_DIGITS(*ptr); 8066 width_right = *ptr; 8067 } 8068 8069 // Create the format for KMP_SNPRINTF based on flags parsed above 8070 format_index = 0; 8071 format[format_index++] = '%'; 8072 if (!right_justify) 8073 format[format_index++] = '-'; 8074 if (pad_zeros) 8075 format[format_index++] = '0'; 8076 if (width_left && width_right) { 8077 int i = 0; 8078 // Only allow 8 digit number widths. 8079 // This also prevents overflowing format variable 8080 while (i < 8 && width_left < width_right) { 8081 format[format_index++] = *width_left; 8082 width_left++; 8083 i++; 8084 } 8085 } 8086 8087 // Parse a name (long or short) 8088 // Canonicalize the name into absolute_short_name 8089 found_valid_name = false; 8090 parse_long_name = (**ptr == '{'); 8091 if (parse_long_name) 8092 (*ptr)++; // skip initial left brace 8093 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 8094 sizeof(__kmp_affinity_format_table[0]); 8095 ++i) { 8096 char short_name = __kmp_affinity_format_table[i].short_name; 8097 const char *long_name = __kmp_affinity_format_table[i].long_name; 8098 char field_format = __kmp_affinity_format_table[i].field_format; 8099 if (parse_long_name) { 8100 size_t length = KMP_STRLEN(long_name); 8101 if (strncmp(*ptr, long_name, length) == 0) { 8102 found_valid_name = true; 8103 (*ptr) += length; // skip the long name 8104 } 8105 } else if (**ptr == short_name) { 8106 found_valid_name = true; 8107 (*ptr)++; // skip the short name 8108 } 8109 if (found_valid_name) { 8110 format[format_index++] = field_format; 8111 format[format_index++] = '\0'; 8112 absolute_short_name = short_name; 8113 break; 8114 } 8115 } 8116 if (parse_long_name) { 8117 if (**ptr != '}') { 8118 absolute_short_name = 0; 8119 } else { 8120 (*ptr)++; // skip over the right brace 8121 } 8122 } 8123 8124 // Attempt to fill 
the buffer with the requested 8125 // value using snprintf within __kmp_str_buf_print() 8126 switch (absolute_short_name) { 8127 case 't': 8128 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 8129 break; 8130 case 'T': 8131 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 8132 break; 8133 case 'L': 8134 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 8135 break; 8136 case 'n': 8137 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 8138 break; 8139 case 'H': { 8140 static const int BUFFER_SIZE = 256; 8141 char buf[BUFFER_SIZE]; 8142 __kmp_expand_host_name(buf, BUFFER_SIZE); 8143 rc = __kmp_str_buf_print(field_buffer, format, buf); 8144 } break; 8145 case 'P': 8146 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 8147 break; 8148 case 'i': 8149 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 8150 break; 8151 case 'N': 8152 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 8153 break; 8154 case 'a': 8155 field_value = 8156 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 8157 rc = __kmp_str_buf_print(field_buffer, format, field_value); 8158 break; 8159 #if KMP_AFFINITY_SUPPORTED 8160 case 'A': { 8161 kmp_str_buf_t buf; 8162 __kmp_str_buf_init(&buf); 8163 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 8164 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 8165 __kmp_str_buf_free(&buf); 8166 } break; 8167 #endif 8168 default: 8169 // According to spec, If an implementation does not have info for field 8170 // type, then "undefined" is printed 8171 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 8172 // Skip the field 8173 if (parse_long_name) { 8174 SKIP_TOKEN(*ptr); 8175 if (**ptr == '}') 8176 (*ptr)++; 8177 } else { 8178 (*ptr)++; 8179 } 8180 } 8181 8182 KMP_ASSERT(format_index <= FORMAT_SIZE); 8183 return rc; 8184 } 8185 8186 /* 8187 * Return number of characters needed 
to hold the affinity string 8188 * (not including null byte character) 8189 * The resultant string is printed to buffer, which the caller can then 8190 * handle afterwards 8191 */ 8192 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 8193 kmp_str_buf_t *buffer) { 8194 const char *parse_ptr; 8195 size_t retval; 8196 const kmp_info_t *th; 8197 kmp_str_buf_t field; 8198 8199 KMP_DEBUG_ASSERT(buffer); 8200 KMP_DEBUG_ASSERT(gtid >= 0); 8201 8202 __kmp_str_buf_init(&field); 8203 __kmp_str_buf_clear(buffer); 8204 8205 th = __kmp_threads[gtid]; 8206 retval = 0; 8207 8208 // If format is NULL or zero-length string, then we use 8209 // affinity-format-var ICV 8210 parse_ptr = format; 8211 if (parse_ptr == NULL || *parse_ptr == '\0') { 8212 parse_ptr = __kmp_affinity_format; 8213 } 8214 KMP_DEBUG_ASSERT(parse_ptr); 8215 8216 while (*parse_ptr != '\0') { 8217 // Parse a field 8218 if (*parse_ptr == '%') { 8219 // Put field in the buffer 8220 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8221 __kmp_str_buf_catbuf(buffer, &field); 8222 retval += rc; 8223 } else { 8224 // Put literal character in buffer 8225 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8226 retval++; 8227 parse_ptr++; 8228 } 8229 } 8230 __kmp_str_buf_free(&field); 8231 return retval; 8232 } 8233 8234 // Displays the affinity string to stdout 8235 void __kmp_aux_display_affinity(int gtid, const char *format) { 8236 kmp_str_buf_t buf; 8237 __kmp_str_buf_init(&buf); 8238 __kmp_aux_capture_affinity(gtid, format, &buf); 8239 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8240 __kmp_str_buf_free(&buf); 8241 } 8242 8243 /* ------------------------------------------------------------------------ */ 8244 8245 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8246 int blocktime = arg; /* argument is in milliseconds */ 8247 #if KMP_USE_MONITOR 8248 int bt_intervals; 8249 #endif 8250 kmp_int8 bt_set; 8251 8252 __kmp_save_internal_controls(thread); 8253 8254 /* 
Normalize and set blocktime for the teams */ 8255 if (blocktime < KMP_MIN_BLOCKTIME) 8256 blocktime = KMP_MIN_BLOCKTIME; 8257 else if (blocktime > KMP_MAX_BLOCKTIME) 8258 blocktime = KMP_MAX_BLOCKTIME; 8259 8260 set__blocktime_team(thread->th.th_team, tid, blocktime); 8261 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8262 8263 #if KMP_USE_MONITOR 8264 /* Calculate and set blocktime intervals for the teams */ 8265 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8266 8267 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8268 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8269 #endif 8270 8271 /* Set whether blocktime has been set to "TRUE" */ 8272 bt_set = TRUE; 8273 8274 set__bt_set_team(thread->th.th_team, tid, bt_set); 8275 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8276 #if KMP_USE_MONITOR 8277 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8278 "bt_intervals=%d, monitor_updates=%d\n", 8279 __kmp_gtid_from_tid(tid, thread->th.th_team), 8280 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8281 __kmp_monitor_wakeups)); 8282 #else 8283 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8284 __kmp_gtid_from_tid(tid, thread->th.th_team), 8285 thread->th.th_team->t.t_id, tid, blocktime)); 8286 #endif 8287 } 8288 8289 void __kmp_aux_set_defaults(char const *str, size_t len) { 8290 if (!__kmp_init_serial) { 8291 __kmp_serial_initialize(); 8292 } 8293 __kmp_env_initialize(str); 8294 8295 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8296 __kmp_env_print(); 8297 } 8298 } // __kmp_aux_set_defaults 8299 8300 /* ------------------------------------------------------------------------ */ 8301 /* internal fast reduction routines */ 8302 8303 PACKED_REDUCTION_METHOD_T 8304 __kmp_determine_reduction_method( 8305 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8306 void *reduce_data, void 
(*reduce_func)(void *lhs_data, void *rhs_data), 8307 kmp_critical_name *lck) { 8308 8309 // Default reduction method: critical construct ( lck != NULL, like in current 8310 // PAROPT ) 8311 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8312 // can be selected by RTL 8313 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8314 // can be selected by RTL 8315 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8316 // among generated by PAROPT. 8317 8318 PACKED_REDUCTION_METHOD_T retval; 8319 8320 int team_size; 8321 8322 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8323 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8324 8325 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8326 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8327 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8328 8329 retval = critical_reduce_block; 8330 8331 // another choice of getting a team size (with 1 dynamic deference) is slower 8332 team_size = __kmp_get_team_num_threads(global_tid); 8333 if (team_size == 1) { 8334 8335 retval = empty_reduce_block; 8336 8337 } else { 8338 8339 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8340 8341 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8342 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8343 8344 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8345 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8346 8347 int teamsize_cutoff = 4; 8348 8349 #if KMP_MIC_SUPPORTED 8350 if (__kmp_mic_type != non_mic) { 8351 teamsize_cutoff = 8; 8352 } 8353 #endif 8354 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8355 if (tree_available) { 8356 if (team_size <= teamsize_cutoff) { 8357 if (atomic_available) { 8358 retval = atomic_reduce_block; 8359 } 8360 } else { 8361 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 
8362 } 8363 } else if (atomic_available) { 8364 retval = atomic_reduce_block; 8365 } 8366 #else 8367 #error "Unknown or unsupported OS" 8368 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8369 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8370 8371 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8372 8373 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8374 8375 // basic tuning 8376 8377 if (atomic_available) { 8378 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 8379 retval = atomic_reduce_block; 8380 } 8381 } // otherwise: use critical section 8382 8383 #elif KMP_OS_DARWIN 8384 8385 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8386 if (atomic_available && (num_vars <= 3)) { 8387 retval = atomic_reduce_block; 8388 } else if (tree_available) { 8389 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8390 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8391 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8392 } 8393 } // otherwise: use critical section 8394 8395 #else 8396 #error "Unknown or unsupported OS" 8397 #endif 8398 8399 #else 8400 #error "Unknown or unsupported architecture" 8401 #endif 8402 } 8403 8404 // KMP_FORCE_REDUCTION 8405 8406 // If the team is serialized (team_size == 1), ignore the forced reduction 8407 // method and stay with the unsynchronized method (empty_reduce_block) 8408 if (__kmp_force_reduction_method != reduction_method_not_defined && 8409 team_size != 1) { 8410 8411 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8412 8413 int atomic_available, tree_available; 8414 8415 switch ((forced_retval = __kmp_force_reduction_method)) { 8416 case critical_reduce_block: 8417 KMP_ASSERT(lck); // lck should be != 0 8418 break; 8419 8420 case atomic_reduce_block: 8421 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8422 if (!atomic_available) { 8423 KMP_WARNING(RedMethodNotSupported, "atomic"); 
8424 forced_retval = critical_reduce_block; 8425 } 8426 break; 8427 8428 case tree_reduce_block: 8429 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8430 if (!tree_available) { 8431 KMP_WARNING(RedMethodNotSupported, "tree"); 8432 forced_retval = critical_reduce_block; 8433 } else { 8434 #if KMP_FAST_REDUCTION_BARRIER 8435 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8436 #endif 8437 } 8438 break; 8439 8440 default: 8441 KMP_ASSERT(0); // "unsupported method specified" 8442 } 8443 8444 retval = forced_retval; 8445 } 8446 8447 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8448 8449 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8450 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8451 8452 return (retval); 8453 } 8454 // this function is for testing set/get/determine reduce method 8455 kmp_int32 __kmp_get_reduce_method(void) { 8456 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8457 } 8458 8459 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8460 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8461 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8462 8463 // Hard pause shuts down the runtime completely. Resume happens naturally when 8464 // OpenMP is used subsequently. 8465 void __kmp_hard_pause() { 8466 __kmp_pause_status = kmp_hard_paused; 8467 __kmp_internal_end_thread(-1); 8468 } 8469 8470 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
8471 void __kmp_resume_if_soft_paused() { 8472 if (__kmp_pause_status == kmp_soft_paused) { 8473 __kmp_pause_status = kmp_not_paused; 8474 8475 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8476 kmp_info_t *thread = __kmp_threads[gtid]; 8477 if (thread) { // Wake it if sleeping 8478 kmp_flag_64<> fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, 8479 thread); 8480 if (fl.is_sleeping()) 8481 fl.resume(gtid); 8482 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8483 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8484 } else { // thread holds the lock and may sleep soon 8485 do { // until either the thread sleeps, or we can get the lock 8486 if (fl.is_sleeping()) { 8487 fl.resume(gtid); 8488 break; 8489 } else if (__kmp_try_suspend_mx(thread)) { 8490 __kmp_unlock_suspend_mx(thread); 8491 break; 8492 } 8493 } while (1); 8494 } 8495 } 8496 } 8497 } 8498 } 8499 8500 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8501 // TODO: add warning messages 8502 int __kmp_pause_resource(kmp_pause_status_t level) { 8503 if (level == kmp_not_paused) { // requesting resume 8504 if (__kmp_pause_status == kmp_not_paused) { 8505 // error message about runtime not being paused, so can't resume 8506 return 1; 8507 } else { 8508 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8509 __kmp_pause_status == kmp_hard_paused); 8510 __kmp_pause_status = kmp_not_paused; 8511 return 0; 8512 } 8513 } else if (level == kmp_soft_paused) { // requesting soft pause 8514 if (__kmp_pause_status != kmp_not_paused) { 8515 // error message about already being paused 8516 return 1; 8517 } else { 8518 __kmp_soft_pause(); 8519 return 0; 8520 } 8521 } else if (level == kmp_hard_paused) { // requesting hard pause 8522 if (__kmp_pause_status != kmp_not_paused) { 8523 // error message about already being paused 8524 return 1; 8525 } else { 8526 __kmp_hard_pause(); 8527 return 0; 8528 } 8529 } else { 8530 // error message about 
invalid level 8531 return 1; 8532 } 8533 } 8534 8535 void __kmp_omp_display_env(int verbose) { 8536 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 8537 if (__kmp_init_serial == 0) 8538 __kmp_do_serial_initialize(); 8539 __kmp_display_env_impl(!verbose, verbose); 8540 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 8541 } 8542 8543 // Globals and functions for hidden helper task 8544 kmp_info_t **__kmp_hidden_helper_threads; 8545 kmp_info_t *__kmp_hidden_helper_main_thread; 8546 kmp_int32 __kmp_hidden_helper_threads_num = 8; 8547 std::atomic<kmp_int32> __kmp_unexecuted_hidden_helper_tasks; 8548 kmp_int32 __kmp_enable_hidden_helper = TRUE; 8549 8550 namespace { 8551 std::atomic<kmp_int32> __kmp_hit_hidden_helper_threads_num; 8552 8553 void __kmp_hidden_helper_wrapper_fn(int *gtid, int *, ...) { 8554 // This is an explicit synchronization on all hidden helper threads in case 8555 // that when a regular thread pushes a hidden helper task to one hidden 8556 // helper thread, the thread has not been awaken once since they're released 8557 // by the main thread after creating the team. 
8558 KMP_ATOMIC_INC(&__kmp_hit_hidden_helper_threads_num); 8559 while (KMP_ATOMIC_LD_ACQ(&__kmp_hit_hidden_helper_threads_num) != 8560 __kmp_hidden_helper_threads_num) 8561 ; 8562 8563 // If main thread, then wait for signal 8564 if (__kmpc_master(nullptr, *gtid)) { 8565 // First, unset the initial state and release the initial thread 8566 TCW_4(__kmp_init_hidden_helper_threads, FALSE); 8567 __kmp_hidden_helper_initz_release(); 8568 __kmp_hidden_helper_main_thread_wait(); 8569 // Now wake up all worker threads 8570 for (int i = 1; i < __kmp_hit_hidden_helper_threads_num; ++i) { 8571 __kmp_hidden_helper_worker_thread_signal(); 8572 } 8573 } 8574 } 8575 } // namespace 8576 8577 void __kmp_hidden_helper_threads_initz_routine() { 8578 // Create a new root for hidden helper team/threads 8579 const int gtid = __kmp_register_root(TRUE); 8580 __kmp_hidden_helper_main_thread = __kmp_threads[gtid]; 8581 __kmp_hidden_helper_threads = &__kmp_threads[gtid]; 8582 __kmp_hidden_helper_main_thread->th.th_set_nproc = 8583 __kmp_hidden_helper_threads_num; 8584 8585 KMP_ATOMIC_ST_REL(&__kmp_hit_hidden_helper_threads_num, 0); 8586 8587 __kmpc_fork_call(nullptr, 0, __kmp_hidden_helper_wrapper_fn); 8588 8589 // Set the initialization flag to FALSE 8590 TCW_SYNC_4(__kmp_init_hidden_helper, FALSE); 8591 8592 __kmp_hidden_helper_threads_deinitz_release(); 8593 } 8594