/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_unregister_library(void); // called by __kmp_internal_end()
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid.
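   Three lookup strategies are used below, in decreasing order of preference:
   (1) __kmp_gtid in native TLS (when KMP_TDATA_GTID is defined and
       __kmp_gtid_mode >= 3),
   (2) __kmp_gtid_get_specific(), i.e. keyed TLS (__kmp_gtid_mode >= 2), and
   (3) an internal algorithm that matches the current stack address against
       the recorded stack ranges of the registered threads.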
*/ 99 int __kmp_get_global_thread_id() { 100 int i; 101 kmp_info_t **other_threads; 102 size_t stack_data; 103 char *stack_addr; 104 size_t stack_size; 105 char *stack_base; 106 107 KA_TRACE( 108 1000, 109 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 110 __kmp_nth, __kmp_all_nth)); 111 112 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 113 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 114 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 115 __kmp_init_gtid for this to work. */ 116 117 if (!TCR_4(__kmp_init_gtid)) 118 return KMP_GTID_DNE; 119 120 #ifdef KMP_TDATA_GTID 121 if (TCR_4(__kmp_gtid_mode) >= 3) { 122 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 123 return __kmp_gtid; 124 } 125 #endif 126 if (TCR_4(__kmp_gtid_mode) >= 2) { 127 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 128 return __kmp_gtid_get_specific(); 129 } 130 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 131 132 stack_addr = (char *)&stack_data; 133 other_threads = __kmp_threads; 134 135 /* ATT: The code below is a source of potential bugs due to unsynchronized 136 access to __kmp_threads array. For example: 137 1. Current thread loads other_threads[i] to thr and checks it, it is 138 non-NULL. 139 2. Current thread is suspended by OS. 140 3. Another thread unregisters and finishes (debug versions of free() 141 may fill memory with something like 0xEF). 142 4. Current thread is resumed. 143 5. Current thread reads junk from *thr. 144 TODO: Fix it. --ln */ 145 146 for (i = 0; i < __kmp_threads_capacity; i++) { 147 148 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 149 if (!thr) 150 continue; 151 152 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 153 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 154 155 /* stack grows down -- search through all of the active threads */ 156 157 if (stack_addr <= stack_base) { 158 size_t stack_diff = stack_base - stack_addr; 159 160 if (stack_diff <= stack_size) { 161 /* The only way we can be closer than the allocated */ 162 /* stack size is if we are running on this thread. */ 163 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 164 return i; 165 } 166 } 167 } 168 169 /* get specific to try and determine our gtid */ 170 KA_TRACE(1000, 171 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */

  /* if we haven't been assigned a gtid, then return the code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ?
"initial" : "actual"); 275 } 276 } 277 278 /* No point in checking ubermaster threads since they use refinement and 279 * cannot overlap */ 280 gtid = __kmp_gtid_from_thread(th); 281 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 282 KA_TRACE(10, 283 ("__kmp_check_stack_overlap: performing extensive checking\n")); 284 if (stack_beg == NULL) { 285 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 286 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 287 } 288 289 for (f = 0; f < __kmp_threads_capacity; f++) { 290 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 291 292 if (f_th && f_th != th) { 293 char *other_stack_end = 294 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 295 char *other_stack_beg = 296 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 297 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 298 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 299 300 /* Print the other stack values before the abort */ 301 if (__kmp_storage_map) 302 __kmp_print_storage_map_gtid( 303 -1, other_stack_beg, other_stack_end, 304 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 305 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 306 307 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 308 __kmp_msg_null); 309 } 310 } 311 } 312 } 313 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 314 } 315 316 /* ------------------------------------------------------------------------ */ 317 318 void __kmp_infinite_loop(void) { 319 static int done = FALSE; 320 321 while (!done) { 322 KMP_YIELD(TRUE); 323 } 324 } 325 326 #define MAX_MESSAGE 512 327 328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 329 char const *format, ...) { 330 char buffer[MAX_MESSAGE]; 331 va_list ap; 332 333 va_start(ap, format); 334 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 335 p2, (unsigned long)size, format); 336 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 337 __kmp_vprintf(kmp_err, buffer, ap); 338 #if KMP_PRINT_DATA_PLACEMENT 339 int node; 340 if (gtid >= 0) { 341 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 342 if (__kmp_storage_map_verbose) { 343 node = __kmp_get_host_node(p1); 344 if (node < 0) /* doesn't work, so don't try this next time */ 345 __kmp_storage_map_verbose = FALSE; 346 else { 347 char *last; 348 int lastNode; 349 int localProc = __kmp_get_cpu_from_gtid(gtid); 350 351 const int page_size = KMP_GET_PAGE_SIZE(); 352 353 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 354 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 355 if (localProc >= 0) 356 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 357 localProc >> 1); 358 else 359 __kmp_printf_no_lock(" GTID %d\n", gtid); 360 #if KMP_USE_PRCTL 361 /* The more elaborate format is disabled for now because of the prctl 362 * hanging bug. */ 363 do { 364 last = p1; 365 lastNode = node; 366 /* This loop collates adjacent pages with the same host node. 
*/ 367 do { 368 (char *)p1 += page_size; 369 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 370 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 371 lastNode); 372 } while (p1 <= p2); 373 #else 374 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 375 (char *)p1 + (page_size - 1), 376 __kmp_get_host_node(p1)); 377 if (p1 < p2) { 378 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 379 (char *)p2 + (page_size - 1), 380 __kmp_get_host_node(p2)); 381 } 382 #endif 383 } 384 } 385 } else 386 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 387 } 388 #endif /* KMP_PRINT_DATA_PLACEMENT */ 389 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 390 } 391 392 void __kmp_warn(char const *format, ...) { 393 char buffer[MAX_MESSAGE]; 394 va_list ap; 395 396 if (__kmp_generate_warnings == kmp_warnings_off) { 397 return; 398 } 399 400 va_start(ap, format); 401 402 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 403 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 404 __kmp_vprintf(kmp_err, buffer, ap); 405 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 406 407 va_end(ap); 408 } 409 410 void __kmp_abort_process() { 411 // Later threads may stall here, but that's ok because abort() will kill them. 412 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 413 414 if (__kmp_debug_buf) { 415 __kmp_dump_debug_buffer(); 416 } 417 418 if (KMP_OS_WINDOWS) { 419 // Let other threads know of abnormal termination and prevent deadlock 420 // if abort happened during library initialization or shutdown 421 __kmp_global.g.g_abort = SIGABRT; 422 423 /* On Windows* OS by default abort() causes pop-up error box, which stalls 424 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 425 boxes. _set_abort_behavior() works well, but this function is not 426 available in VS7 (this is not problem for DLL, but it is a problem for 427 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 428 help, at least in some versions of MS C RTL. 429 430 It seems following sequence is the only way to simulate abort() and 431 avoid pop-up error box. */ 432 raise(SIGABRT); 433 _exit(3); // Just in case, if signal ignored, exit anyway. 434 } else { 435 abort(); 436 } 437 438 __kmp_infinite_loop(); 439 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 440 441 } // __kmp_abort_process 442 443 void __kmp_abort_thread(void) { 444 // TODO: Eliminate g_abort global variable and this function. 445 // In case of abort just call abort(), it will kill all the threads. 446 __kmp_infinite_loop(); 447 } // __kmp_abort_thread 448 449 /* Print out the storage map for the major kmp_info_t thread data structures 450 that are allocated together. 
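   With the "OMP storage map: %p %p%8lu %s" format used by
   __kmp_print_storage_map_gtid() above, each entry is written via
   __kmp_vprintf to kmp_err roughly as
     OMP storage map: <begin-addr> <end-addr>  <bytes> th_0.th_info
   (addresses and sizes are build- and run-dependent; shown here only as an
   illustration).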
*/ 451 452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 453 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 454 gtid); 455 456 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 457 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 458 459 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 460 sizeof(kmp_local_t), "th_%d.th_local", gtid); 461 462 __kmp_print_storage_map_gtid( 463 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 464 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 465 466 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 467 &thr->th.th_bar[bs_plain_barrier + 1], 468 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 469 gtid); 470 471 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 472 &thr->th.th_bar[bs_forkjoin_barrier + 1], 473 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 474 gtid); 475 476 #if KMP_FAST_REDUCTION_BARRIER 477 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 478 &thr->th.th_bar[bs_reduction_barrier + 1], 479 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 480 gtid); 481 #endif // KMP_FAST_REDUCTION_BARRIER 482 } 483 484 /* Print out the storage map for the major kmp_team_t team data structures 485 that are allocated together. */ 486 487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 488 int team_id, int num_thr) { 489 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 490 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 491 header, team_id); 492 493 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 494 &team->t.t_bar[bs_last_barrier], 495 sizeof(kmp_balign_team_t) * bs_last_barrier, 496 "%s_%d.t_bar", header, team_id); 497 498 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 499 &team->t.t_bar[bs_plain_barrier + 1], 500 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 501 header, team_id); 502 503 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 504 &team->t.t_bar[bs_forkjoin_barrier + 1], 505 sizeof(kmp_balign_team_t), 506 "%s_%d.t_bar[forkjoin]", header, team_id); 507 508 #if KMP_FAST_REDUCTION_BARRIER 509 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 510 &team->t.t_bar[bs_reduction_barrier + 1], 511 sizeof(kmp_balign_team_t), 512 "%s_%d.t_bar[reduction]", header, team_id); 513 #endif // KMP_FAST_REDUCTION_BARRIER 514 515 __kmp_print_storage_map_gtid( 516 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 517 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 518 519 __kmp_print_storage_map_gtid( 520 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 521 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 522 523 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 524 &team->t.t_disp_buffer[num_disp_buff], 525 sizeof(dispatch_shared_info_t) * num_disp_buff, 526 "%s_%d.t_disp_buffer", header, team_id); 527 } 528 529 static void __kmp_init_allocator() { __kmp_init_memkind(); } 530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 531 532 /* ------------------------------------------------------------------------ */ 533 534 #if KMP_DYNAMIC_LIB 535 #if KMP_OS_WINDOWS 536 537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) { 538 // TODO: Change to __kmp_break_bootstrap_lock(). 
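  // Re-initializing the lock serves as a stand-in for "breaking" it: on
  // PROCESS_DETACH the previous owner may already have been terminated by the
  // OS without releasing it, so the lock is simply forced back into the
  // released state (see __kmp_reset_locks_on_process_detach below).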
539 __kmp_init_bootstrap_lock(lck); // make the lock released 540 } 541 542 static void __kmp_reset_locks_on_process_detach(int gtid_req) { 543 int i; 544 int thread_count; 545 546 // PROCESS_DETACH is expected to be called by a thread that executes 547 // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one 548 // calling ProcessExit or FreeLibrary). So, it might be safe to access the 549 // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some 550 // threads can be still alive here, although being about to be terminated. The 551 // threads in the array with ds_thread==0 are most suspicious. Actually, it 552 // can be not safe to access the __kmp_threads[]. 553 554 // TODO: does it make sense to check __kmp_roots[] ? 555 556 // Let's check that there are no other alive threads registered with the OMP 557 // lib. 558 while (1) { 559 thread_count = 0; 560 for (i = 0; i < __kmp_threads_capacity; ++i) { 561 if (!__kmp_threads) 562 continue; 563 kmp_info_t *th = __kmp_threads[i]; 564 if (th == NULL) 565 continue; 566 int gtid = th->th.th_info.ds.ds_gtid; 567 if (gtid == gtid_req) 568 continue; 569 if (gtid < 0) 570 continue; 571 DWORD exit_val; 572 int alive = __kmp_is_thread_alive(th, &exit_val); 573 if (alive) { 574 ++thread_count; 575 } 576 } 577 if (thread_count == 0) 578 break; // success 579 } 580 581 // Assume that I'm alone. Now it might be safe to check and reset locks. 582 // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset. 583 __kmp_reset_lock(&__kmp_forkjoin_lock); 584 #ifdef KMP_DEBUG 585 __kmp_reset_lock(&__kmp_stdio_lock); 586 #endif // KMP_DEBUG 587 } 588 589 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 590 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 591 592 switch (fdwReason) { 593 594 case DLL_PROCESS_ATTACH: 595 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 596 597 return TRUE; 598 599 case DLL_PROCESS_DETACH: 600 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 601 602 if (lpReserved != NULL) { 603 // lpReserved is used for telling the difference: 604 // lpReserved == NULL when FreeLibrary() was called, 605 // lpReserved != NULL when the process terminates. 606 // When FreeLibrary() is called, worker threads remain alive. So they will 607 // release the forkjoin lock by themselves. When the process terminates, 608 // worker threads disappear triggering the problem of unreleased forkjoin 609 // lock as described below. 610 611 // A worker thread can take the forkjoin lock. The problem comes up if 612 // that worker thread becomes dead before it releases the forkjoin lock. 613 // The forkjoin lock remains taken, while the thread executing 614 // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try 615 // to take the forkjoin lock and will always fail, so that the application 616 // will never finish [normally]. This scenario is possible if 617 // __kmpc_end() has not been executed. It looks like it's not a corner 618 // case, but common cases: 619 // - the main function was compiled by an alternative compiler; 620 // - the main function was compiled by icl but without /Qopenmp 621 // (application with plugins); 622 // - application terminates by calling C exit(), Fortran CALL EXIT() or 623 // Fortran STOP. 624 // - alive foreign thread prevented __kmpc_end from doing cleanup. 625 // 626 // This is a hack to work around the problem. 627 // TODO: !!! figure out something better. 
628 __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific()); 629 } 630 631 __kmp_internal_end_library(__kmp_gtid_get_specific()); 632 633 return TRUE; 634 635 case DLL_THREAD_ATTACH: 636 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 637 638 /* if we want to register new siblings all the time here call 639 * __kmp_get_gtid(); */ 640 return TRUE; 641 642 case DLL_THREAD_DETACH: 643 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 644 645 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 646 return TRUE; 647 } 648 649 return TRUE; 650 } 651 652 #endif /* KMP_OS_WINDOWS */ 653 #endif /* KMP_DYNAMIC_LIB */ 654 655 /* __kmp_parallel_deo -- Wait until it's our turn. */ 656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 657 int gtid = *gtid_ref; 658 #ifdef BUILD_PARALLEL_ORDERED 659 kmp_team_t *team = __kmp_team_from_gtid(gtid); 660 #endif /* BUILD_PARALLEL_ORDERED */ 661 662 if (__kmp_env_consistency_check) { 663 if (__kmp_threads[gtid]->th.th_root->r.r_active) 664 #if KMP_USE_DYNAMIC_LOCK 665 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 666 #else 667 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 668 #endif 669 } 670 #ifdef BUILD_PARALLEL_ORDERED 671 if (!team->t.t_serialized) { 672 KMP_MB(); 673 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 674 NULL); 675 KMP_MB(); 676 } 677 #endif /* BUILD_PARALLEL_ORDERED */ 678 } 679 680 /* __kmp_parallel_dxo -- Signal the next task. */ 681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 682 int gtid = *gtid_ref; 683 #ifdef BUILD_PARALLEL_ORDERED 684 int tid = __kmp_tid_from_gtid(gtid); 685 kmp_team_t *team = __kmp_team_from_gtid(gtid); 686 #endif /* BUILD_PARALLEL_ORDERED */ 687 688 if (__kmp_env_consistency_check) { 689 if (__kmp_threads[gtid]->th.th_root->r.r_active) 690 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 691 } 692 #ifdef BUILD_PARALLEL_ORDERED 693 if (!team->t.t_serialized) { 694 KMP_MB(); /* Flush all pending memory write invalidates. */ 695 696 /* use the tid of the next thread in this team */ 697 /* TODO replace with general release procedure */ 698 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 699 700 KMP_MB(); /* Flush all pending memory write invalidates. */ 701 } 702 #endif /* BUILD_PARALLEL_ORDERED */ 703 } 704 705 /* ------------------------------------------------------------------------ */ 706 /* The BARRIER for a SINGLE process section is always explicit */ 707 708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 709 int status; 710 kmp_info_t *th; 711 kmp_team_t *team; 712 713 if (!TCR_4(__kmp_init_parallel)) 714 __kmp_parallel_initialize(); 715 __kmp_resume_if_soft_paused(); 716 717 th = __kmp_threads[gtid]; 718 team = th->th.th_team; 719 status = 0; 720 721 th->th.th_ident = id_ref; 722 723 if (team->t.t_serialized) { 724 status = 1; 725 } else { 726 kmp_int32 old_this = th->th.th_local.this_construct; 727 728 ++th->th.th_local.this_construct; 729 /* try to set team count to thread count--success means thread got the 730 single block */ 731 /* TODO: Should this be acquire or release? 
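     Protocol sketch: each thread remembers the old value of its private
     this_construct counter and increments it on entry; if the team-wide
     t_construct still equals the old value, the thread whose compare-and-store
     on team->t.t_construct (below) succeeds is the one that executes the
     single block. The current implementation uses the acquire variant,
     __kmp_atomic_compare_store_acq.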
*/ 732 if (team->t.t_construct == old_this) { 733 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 734 th->th.th_local.this_construct); 735 } 736 #if USE_ITT_BUILD 737 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 738 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 739 team->t.t_active_level == 740 1) { // Only report metadata by master of active team at level 1 741 __kmp_itt_metadata_single(id_ref); 742 } 743 #endif /* USE_ITT_BUILD */ 744 } 745 746 if (__kmp_env_consistency_check) { 747 if (status && push_ws) { 748 __kmp_push_workshare(gtid, ct_psingle, id_ref); 749 } else { 750 __kmp_check_workshare(gtid, ct_psingle, id_ref); 751 } 752 } 753 #if USE_ITT_BUILD 754 if (status) { 755 __kmp_itt_single_start(gtid); 756 } 757 #endif /* USE_ITT_BUILD */ 758 return status; 759 } 760 761 void __kmp_exit_single(int gtid) { 762 #if USE_ITT_BUILD 763 __kmp_itt_single_end(gtid); 764 #endif /* USE_ITT_BUILD */ 765 if (__kmp_env_consistency_check) 766 __kmp_pop_workshare(gtid, ct_psingle, NULL); 767 } 768 769 /* determine if we can go parallel or must use a serialized parallel region and 770 * how many threads we can use 771 * set_nproc is the number of threads requested for the team 772 * returns 0 if we should serialize or only use one thread, 773 * otherwise the number of threads to use 774 * The forkjoin lock is held by the caller. */ 775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 776 int master_tid, int set_nthreads, 777 int enter_teams) { 778 int capacity; 779 int new_nthreads; 780 KMP_DEBUG_ASSERT(__kmp_init_serial); 781 KMP_DEBUG_ASSERT(root && parent_team); 782 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 783 784 // If dyn-var is set, dynamically adjust the number of desired threads, 785 // according to the method specified by dynamic_mode. 786 new_nthreads = set_nthreads; 787 if (!get__dynamic_2(parent_team, master_tid)) { 788 ; 789 } 790 #ifdef USE_LOAD_BALANCE 791 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 792 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 793 if (new_nthreads == 1) { 794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 795 "reservation to 1 thread\n", 796 master_tid)); 797 return 1; 798 } 799 if (new_nthreads < set_nthreads) { 800 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 801 "reservation to %d threads\n", 802 master_tid, new_nthreads)); 803 } 804 } 805 #endif /* USE_LOAD_BALANCE */ 806 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 807 new_nthreads = __kmp_avail_proc - __kmp_nth + 808 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 809 if (new_nthreads <= 1) { 810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 811 "reservation to 1 thread\n", 812 master_tid)); 813 return 1; 814 } 815 if (new_nthreads < set_nthreads) { 816 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 817 "reservation to %d threads\n", 818 master_tid, new_nthreads)); 819 } else { 820 new_nthreads = set_nthreads; 821 } 822 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 823 if (set_nthreads > 2) { 824 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 825 new_nthreads = (new_nthreads % set_nthreads) + 1; 826 if (new_nthreads == 1) { 827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 828 "reservation to 1 thread\n", 829 master_tid)); 830 return 1; 831 } 832 if (new_nthreads < set_nthreads) { 833 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 834 "reservation to %d threads\n", 835 master_tid, new_nthreads)); 836 } 837 } 838 } else { 839 KMP_ASSERT(0); 840 } 841 842 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 843 if (__kmp_nth + new_nthreads - 844 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 845 __kmp_max_nth) { 846 int tl_nthreads = __kmp_max_nth - __kmp_nth + 847 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 848 if (tl_nthreads <= 0) { 849 tl_nthreads = 1; 850 } 851 852 // If dyn-var is false, emit a 1-time warning. 853 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 854 __kmp_reserve_warn = 1; 855 __kmp_msg(kmp_ms_warning, 856 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 857 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 858 } 859 if (tl_nthreads == 1) { 860 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 861 "reduced reservation to 1 thread\n", 862 master_tid)); 863 return 1; 864 } 865 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 866 "reservation to %d threads\n", 867 master_tid, tl_nthreads)); 868 new_nthreads = tl_nthreads; 869 } 870 871 // Respect OMP_THREAD_LIMIT 872 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 873 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 874 if (cg_nthreads + new_nthreads - 875 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 876 max_cg_threads) { 877 int tl_nthreads = max_cg_threads - cg_nthreads + 878 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 879 if (tl_nthreads <= 0) { 880 tl_nthreads = 1; 881 } 882 883 // If dyn-var is false, emit a 1-time warning. 884 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 885 __kmp_reserve_warn = 1; 886 __kmp_msg(kmp_ms_warning, 887 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 888 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 889 } 890 if (tl_nthreads == 1) { 891 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 892 "reduced reservation to 1 thread\n", 893 master_tid)); 894 return 1; 895 } 896 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 897 "reservation to %d threads\n", 898 master_tid, tl_nthreads)); 899 new_nthreads = tl_nthreads; 900 } 901 902 // Check if the threads array is large enough, or needs expanding. 903 // See comment in __kmp_register_root() about the adjustment if 904 // __kmp_threads[0] == NULL. 905 capacity = __kmp_threads_capacity; 906 if (TCR_PTR(__kmp_threads[0]) == NULL) { 907 --capacity; 908 } 909 if (__kmp_nth + new_nthreads - 910 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc) > 911 capacity) { 912 // Expand the threads array. 913 int slotsRequired = __kmp_nth + new_nthreads - 914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 915 capacity; 916 int slotsAdded = __kmp_expand_threads(slotsRequired); 917 if (slotsAdded < slotsRequired) { 918 // The threads array was not expanded enough. 919 new_nthreads -= (slotsRequired - slotsAdded); 920 KMP_ASSERT(new_nthreads >= 1); 921 922 // If dyn-var is false, emit a 1-time warning. 923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 924 __kmp_reserve_warn = 1; 925 if (__kmp_tp_cached) { 926 __kmp_msg(kmp_ms_warning, 927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 930 } else { 931 __kmp_msg(kmp_ms_warning, 932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 934 } 935 } 936 } 937 } 938 939 #ifdef KMP_DEBUG 940 if (new_nthreads == 1) { 941 KC_TRACE(10, 942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 943 "dead roots and rechecking; requested %d threads\n", 944 __kmp_get_gtid(), set_nthreads)); 945 } else { 946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 947 " %d threads\n", 948 __kmp_get_gtid(), new_nthreads, set_nthreads)); 949 } 950 #endif // KMP_DEBUG 951 return new_nthreads; 952 } 953 954 /* Allocate threads from the thread pool and assign them to the new team. We are 955 assured that there are enough threads available, because we checked on that 956 earlier within critical section forkjoin */ 957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 958 kmp_info_t *master_th, int master_gtid) { 959 int i; 960 int use_hot_team; 961 962 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 963 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 964 KMP_MB(); 965 966 /* first, let's setup the master thread */ 967 master_th->th.th_info.ds.ds_tid = 0; 968 master_th->th.th_team = team; 969 master_th->th.th_team_nproc = team->t.t_nproc; 970 master_th->th.th_team_master = master_th; 971 master_th->th.th_team_serialized = FALSE; 972 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 973 974 /* make sure we are not the optimized hot team */ 975 #if KMP_NESTED_HOT_TEAMS 976 use_hot_team = 0; 977 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 978 if (hot_teams) { // hot teams array is not allocated if 979 // KMP_HOT_TEAMS_MAX_LEVEL=0 980 int level = team->t.t_active_level - 1; // index in array of hot teams 981 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
982 if (master_th->th.th_teams_size.nteams > 1) { 983 ++level; // level was not increased in teams construct for 984 // team_of_masters 985 } 986 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 987 master_th->th.th_teams_level == team->t.t_level) { 988 ++level; // level was not increased in teams construct for 989 // team_of_workers before the parallel 990 } // team->t.t_level will be increased inside parallel 991 } 992 if (level < __kmp_hot_teams_max_level) { 993 if (hot_teams[level].hot_team) { 994 // hot team has already been allocated for given level 995 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 996 use_hot_team = 1; // the team is ready to use 997 } else { 998 use_hot_team = 0; // AC: threads are not allocated yet 999 hot_teams[level].hot_team = team; // remember new hot team 1000 hot_teams[level].hot_team_nth = team->t.t_nproc; 1001 } 1002 } else { 1003 use_hot_team = 0; 1004 } 1005 } 1006 #else 1007 use_hot_team = team == root->r.r_hot_team; 1008 #endif 1009 if (!use_hot_team) { 1010 1011 /* install the master thread */ 1012 team->t.t_threads[0] = master_th; 1013 __kmp_initialize_info(master_th, team, 0, master_gtid); 1014 1015 /* now, install the worker threads */ 1016 for (i = 1; i < team->t.t_nproc; i++) { 1017 1018 /* fork or reallocate a new thread and install it in team */ 1019 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 1020 team->t.t_threads[i] = thr; 1021 KMP_DEBUG_ASSERT(thr); 1022 KMP_DEBUG_ASSERT(thr->th.th_team == team); 1023 /* align team and thread arrived states */ 1024 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 1025 "T#%d(%d:%d) join =%llu, plain=%llu\n", 1026 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 1027 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 1028 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 1029 team->t.t_bar[bs_plain_barrier].b_arrived)); 1030 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 1031 thr->th.th_teams_level = master_th->th.th_teams_level; 1032 thr->th.th_teams_size = master_th->th.th_teams_size; 1033 { // Initialize threads' barrier data. 1034 int b; 1035 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 1036 for (b = 0; b < bs_last_barrier; ++b) { 1037 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 1038 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 1039 #if USE_DEBUGGER 1040 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1041 #endif 1042 } 1043 } 1044 } 1045 1046 #if KMP_AFFINITY_SUPPORTED 1047 __kmp_partition_places(team); 1048 #endif 1049 } 1050 1051 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1052 for (i = 0; i < team->t.t_nproc; i++) { 1053 kmp_info_t *thr = team->t.t_threads[i]; 1054 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1055 thr->th.th_prev_level != team->t.t_level) { 1056 team->t.t_display_affinity = 1; 1057 break; 1058 } 1059 } 1060 } 1061 1062 KMP_MB(); 1063 } 1064 1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1066 // Propagate any changes to the floating point control registers out to the team 1067 // We try to avoid unnecessary writes to the relevant cache line in the team 1068 // structure, so we don't make changes unless they are needed. 
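// A minimal sketch of the check-before-write idiom assumed below (the real
// KMP_CHECK_UPDATE macro is defined in kmp.h and may differ in detail):
//
//   #define KMP_CHECK_UPDATE(a, b) if ((a) != (b)) (a) = (b)
//
// Skipping the store when the value is already correct keeps the team's
// cache line in an unmodified state, so the other threads in the team do not
// have to re-read it.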
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to
    // know whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // by the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of
   the single master thread.
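   On re-entry for a nested serialized region the existing serial team is
   reused and only its t_serialized nesting count is incremented; a fresh
   dispatch buffer is pushed for each nesting level (see below).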
*/ 1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1133 kmp_info_t *this_thr; 1134 kmp_team_t *serial_team; 1135 1136 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1137 1138 /* Skip all this code for autopar serialized loops since it results in 1139 unacceptable overhead */ 1140 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1141 return; 1142 1143 if (!TCR_4(__kmp_init_parallel)) 1144 __kmp_parallel_initialize(); 1145 __kmp_resume_if_soft_paused(); 1146 1147 this_thr = __kmp_threads[global_tid]; 1148 serial_team = this_thr->th.th_serial_team; 1149 1150 /* utilize the serialized team held by this thread */ 1151 KMP_DEBUG_ASSERT(serial_team); 1152 KMP_MB(); 1153 1154 if (__kmp_tasking_mode != tskm_immediate_exec) { 1155 KMP_DEBUG_ASSERT( 1156 this_thr->th.th_task_team == 1157 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1158 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1159 NULL); 1160 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1161 "team %p, new task_team = NULL\n", 1162 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1163 this_thr->th.th_task_team = NULL; 1164 } 1165 1166 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1167 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1168 proc_bind = proc_bind_false; 1169 } else if (proc_bind == proc_bind_default) { 1170 // No proc_bind clause was specified, so use the current value 1171 // of proc-bind-var for this parallel region. 1172 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1173 } 1174 // Reset for next parallel region 1175 this_thr->th.th_set_proc_bind = proc_bind_default; 1176 1177 #if OMPT_SUPPORT 1178 ompt_data_t ompt_parallel_data = ompt_data_none; 1179 ompt_data_t *implicit_task_data; 1180 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1181 if (ompt_enabled.enabled && 1182 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1183 1184 ompt_task_info_t *parent_task_info; 1185 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1186 1187 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1188 if (ompt_enabled.ompt_callback_parallel_begin) { 1189 int team_size = 1; 1190 1191 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1192 &(parent_task_info->task_data), &(parent_task_info->frame), 1193 &ompt_parallel_data, team_size, 1194 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1195 } 1196 } 1197 #endif // OMPT_SUPPORT 1198 1199 if (this_thr->th.th_team != serial_team) { 1200 // Nested level will be an index in the nested nthreads array 1201 int level = this_thr->th.th_team->t.t_level; 1202 1203 if (serial_team->t.t_serialized) { 1204 /* this serial team was already used 1205 TODO increase performance by making this locks more specific */ 1206 kmp_team_t *new_team; 1207 1208 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1209 1210 new_team = 1211 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1212 #if OMPT_SUPPORT 1213 ompt_parallel_data, 1214 #endif 1215 proc_bind, &this_thr->th.th_current_task->td_icvs, 1216 0 USE_NESTED_HOT_ARG(NULL)); 1217 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1218 KMP_ASSERT(new_team); 1219 1220 /* setup new serialized team and install it */ 1221 new_team->t.t_threads[0] = this_thr; 1222 new_team->t.t_parent = this_thr->th.th_team; 1223 serial_team = new_team; 1224 this_thr->th.th_serial_team = serial_team; 1225 1226 KF_TRACE( 1227 10, 1228 
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1229 global_tid, serial_team)); 1230 1231 /* TODO the above breaks the requirement that if we run out of resources, 1232 then we can still guarantee that serialized teams are ok, since we may 1233 need to allocate a new one */ 1234 } else { 1235 KF_TRACE( 1236 10, 1237 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1238 global_tid, serial_team)); 1239 } 1240 1241 /* we have to initialize this serial team */ 1242 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1243 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1244 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1245 serial_team->t.t_ident = loc; 1246 serial_team->t.t_serialized = 1; 1247 serial_team->t.t_nproc = 1; 1248 serial_team->t.t_parent = this_thr->th.th_team; 1249 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1250 this_thr->th.th_team = serial_team; 1251 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1252 1253 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1254 this_thr->th.th_current_task)); 1255 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1256 this_thr->th.th_current_task->td_flags.executing = 0; 1257 1258 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1259 1260 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1261 implicit task for each serialized task represented by 1262 team->t.t_serialized? */ 1263 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1264 &this_thr->th.th_current_task->td_parent->td_icvs); 1265 1266 // Thread value exists in the nested nthreads array for the next nested 1267 // level 1268 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1269 this_thr->th.th_current_task->td_icvs.nproc = 1270 __kmp_nested_nth.nth[level + 1]; 1271 } 1272 1273 if (__kmp_nested_proc_bind.used && 1274 (level + 1 < __kmp_nested_proc_bind.used)) { 1275 this_thr->th.th_current_task->td_icvs.proc_bind = 1276 __kmp_nested_proc_bind.bind_types[level + 1]; 1277 } 1278 1279 #if USE_DEBUGGER 1280 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1281 #endif 1282 this_thr->th.th_info.ds.ds_tid = 0; 1283 1284 /* set thread cache values */ 1285 this_thr->th.th_team_nproc = 1; 1286 this_thr->th.th_team_master = this_thr; 1287 this_thr->th.th_team_serialized = 1; 1288 1289 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1290 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1291 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1292 1293 propagateFPControl(serial_team); 1294 1295 /* check if we need to allocate dispatch buffers stack */ 1296 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1297 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1298 serial_team->t.t_dispatch->th_disp_buffer = 1299 (dispatch_private_info_t *)__kmp_allocate( 1300 sizeof(dispatch_private_info_t)); 1301 } 1302 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1303 1304 KMP_MB(); 1305 1306 } else { 1307 /* this serialized team is already being used, 1308 * that's fine, just add another nested level */ 1309 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1310 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1311 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1312 ++serial_team->t.t_serialized; 1313 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1314 1315 // Nested level will be an index in the nested nthreads array 1316 int level = this_thr->th.th_team->t.t_level; 1317 // Thread value exists in the nested nthreads array for the next nested 1318 // level 1319 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1320 this_thr->th.th_current_task->td_icvs.nproc = 1321 __kmp_nested_nth.nth[level + 1]; 1322 } 1323 serial_team->t.t_level++; 1324 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1325 "of serial team %p to %d\n", 1326 global_tid, serial_team, serial_team->t.t_level)); 1327 1328 /* allocate/push dispatch buffers stack */ 1329 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1330 { 1331 dispatch_private_info_t *disp_buffer = 1332 (dispatch_private_info_t *)__kmp_allocate( 1333 sizeof(dispatch_private_info_t)); 1334 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1335 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1336 } 1337 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1338 1339 KMP_MB(); 1340 } 1341 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1342 1343 // Perform the display affinity functionality for 1344 // serialized parallel regions 1345 if (__kmp_display_affinity) { 1346 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1347 this_thr->th.th_prev_num_threads != 1) { 1348 // NULL means use the affinity-format-var ICV 1349 __kmp_aux_display_affinity(global_tid, NULL); 1350 this_thr->th.th_prev_level = serial_team->t.t_level; 1351 this_thr->th.th_prev_num_threads = 1; 1352 } 1353 } 1354 1355 if (__kmp_env_consistency_check) 1356 __kmp_push_parallel(global_tid, NULL); 1357 #if OMPT_SUPPORT 1358 serial_team->t.ompt_team_info.master_return_address = codeptr; 1359 if (ompt_enabled.enabled && 1360 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1361 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1362 1363 ompt_lw_taskteam_t lw_taskteam; 1364 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1365 &ompt_parallel_data, codeptr); 1366 1367 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1368 // don't use lw_taskteam after linking. 
// content was swapped.

    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)->thread_num =
          __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing.
*/ 1418 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1419 /* These 2 lines below are so this does not get optimized out */ 1420 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1421 __kmp_stkpadding += (short)((kmp_int64)dummy); 1422 } 1423 1424 /* initialize if needed */ 1425 KMP_DEBUG_ASSERT( 1426 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1427 if (!TCR_4(__kmp_init_parallel)) 1428 __kmp_parallel_initialize(); 1429 __kmp_resume_if_soft_paused(); 1430 1431 /* setup current data */ 1432 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1433 // shutdown 1434 parent_team = master_th->th.th_team; 1435 master_tid = master_th->th.th_info.ds.ds_tid; 1436 master_this_cons = master_th->th.th_local.this_construct; 1437 root = master_th->th.th_root; 1438 master_active = root->r.r_active; 1439 master_set_numthreads = master_th->th.th_set_nproc; 1440 1441 #if OMPT_SUPPORT 1442 ompt_data_t ompt_parallel_data = ompt_data_none; 1443 ompt_data_t *parent_task_data; 1444 ompt_frame_t *ompt_frame; 1445 ompt_data_t *implicit_task_data; 1446 void *return_address = NULL; 1447 1448 if (ompt_enabled.enabled) { 1449 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1450 NULL, NULL); 1451 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1452 } 1453 #endif 1454 1455 // Nested level will be an index in the nested nthreads array 1456 level = parent_team->t.t_level; 1457 // used to launch non-serial teams even if nested is not allowed 1458 active_level = parent_team->t.t_active_level; 1459 // needed to check nesting inside the teams 1460 teams_level = master_th->th.th_teams_level; 1461 #if KMP_NESTED_HOT_TEAMS 1462 p_hot_teams = &master_th->th.th_hot_teams; 1463 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1464 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1465 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1466 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1467 // it is either actual or not needed (when active_level > 0) 1468 (*p_hot_teams)[0].hot_team_nth = 1; 1469 } 1470 #endif 1471 1472 #if OMPT_SUPPORT 1473 if (ompt_enabled.enabled) { 1474 if (ompt_enabled.ompt_callback_parallel_begin) { 1475 int team_size = master_set_numthreads 1476 ? master_set_numthreads 1477 : get__nproc_2(parent_team, master_tid); 1478 int flags = OMPT_INVOKER(call_context) | 1479 ((microtask == (microtask_t)__kmp_teams_master) 1480 ? ompt_parallel_league 1481 : ompt_parallel_team); 1482 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1483 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1484 return_address); 1485 } 1486 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1487 } 1488 #endif 1489 1490 master_th->th.th_ident = loc; 1491 1492 if (master_th->th.th_teams_microtask && ap && 1493 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1494 // AC: This is start of parallel that is nested inside teams construct. 1495 // The team is actual (hot), all workers are ready at the fork barrier. 1496 // No lock needed to initialize the team a bit, then free workers. 
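      // The outlined-function arguments are copied from the caller's va_list
      // into the parent team's t_argv below, so the already-forked workers
      // can read them once they are released from the fork barrier.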
1497 parent_team->t.t_ident = loc; 1498 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1499 parent_team->t.t_argc = argc; 1500 argv = (void **)parent_team->t.t_argv; 1501 for (i = argc - 1; i >= 0; --i) 1502 *argv++ = va_arg(kmp_va_deref(ap), void *); 1503 // Increment our nested depth levels, but not increase the serialization 1504 if (parent_team == master_th->th.th_serial_team) { 1505 // AC: we are in serialized parallel 1506 __kmpc_serialized_parallel(loc, gtid); 1507 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1508 1509 if (call_context == fork_context_gnu) { 1510 // AC: need to decrement t_serialized for enquiry functions to work 1511 // correctly, will restore at join time 1512 parent_team->t.t_serialized--; 1513 return TRUE; 1514 } 1515 1516 #if OMPT_SUPPORT 1517 void *dummy; 1518 void **exit_frame_p; 1519 1520 ompt_lw_taskteam_t lw_taskteam; 1521 1522 if (ompt_enabled.enabled) { 1523 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1524 &ompt_parallel_data, return_address); 1525 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1526 1527 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1528 // don't use lw_taskteam after linking. content was swaped 1529 1530 /* OMPT implicit task begin */ 1531 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1532 if (ompt_enabled.ompt_callback_implicit_task) { 1533 OMPT_CUR_TASK_INFO(master_th) 1534 ->thread_num = __kmp_tid_from_gtid(gtid); 1535 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1536 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1537 implicit_task_data, 1, 1538 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1539 } 1540 1541 /* OMPT state */ 1542 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1543 } else { 1544 exit_frame_p = &dummy; 1545 } 1546 #endif 1547 // AC: need to decrement t_serialized for enquiry functions to work 1548 // correctly, will restore at join time 1549 parent_team->t.t_serialized--; 1550 1551 { 1552 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1553 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1554 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1555 #if OMPT_SUPPORT 1556 , 1557 exit_frame_p 1558 #endif 1559 ); 1560 } 1561 1562 #if OMPT_SUPPORT 1563 if (ompt_enabled.enabled) { 1564 *exit_frame_p = NULL; 1565 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1566 if (ompt_enabled.ompt_callback_implicit_task) { 1567 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1568 ompt_scope_end, NULL, implicit_task_data, 1, 1569 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1570 } 1571 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1572 __ompt_lw_taskteam_unlink(master_th); 1573 if (ompt_enabled.ompt_callback_parallel_end) { 1574 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1575 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1576 OMPT_INVOKER(call_context) | ompt_parallel_team, 1577 return_address); 1578 } 1579 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1580 } 1581 #endif 1582 return TRUE; 1583 } 1584 1585 parent_team->t.t_pkfn = microtask; 1586 parent_team->t.t_invoke = invoker; 1587 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1588 parent_team->t.t_active_level++; 1589 parent_team->t.t_level++; 1590 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1591 1592 #if OMPT_SUPPORT 1593 if (ompt_enabled.enabled) { 1594 ompt_lw_taskteam_t lw_taskteam; 1595 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1596 
&ompt_parallel_data, return_address); 1597 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1598 } 1599 #endif 1600 1601 /* Change number of threads in the team if requested */ 1602 if (master_set_numthreads) { // The parallel has num_threads clause 1603 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1604 // AC: only can reduce number of threads dynamically, can't increase 1605 kmp_info_t **other_threads = parent_team->t.t_threads; 1606 parent_team->t.t_nproc = master_set_numthreads; 1607 for (i = 0; i < master_set_numthreads; ++i) { 1608 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1609 } 1610 // Keep extra threads hot in the team for possible next parallels 1611 } 1612 master_th->th.th_set_nproc = 0; 1613 } 1614 1615 #if USE_DEBUGGER 1616 if (__kmp_debugging) { // Let debugger override number of threads. 1617 int nth = __kmp_omp_num_threads(loc); 1618 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1619 master_set_numthreads = nth; 1620 } 1621 } 1622 #endif 1623 1624 #if USE_ITT_BUILD 1625 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1626 KMP_ITT_DEBUG) && 1627 __kmp_forkjoin_frames_mode == 3 && 1628 parent_team->t.t_active_level == 1 // only report frames at level 1 1629 && master_th->th.th_teams_size.nteams == 1) { 1630 kmp_uint64 tmp_time = __itt_get_timestamp(); 1631 master_th->th.th_frame_time = tmp_time; 1632 parent_team->t.t_region_time = tmp_time; 1633 } 1634 if (__itt_stack_caller_create_ptr) { 1635 // create new stack stitching id before entering fork barrier 1636 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1637 } 1638 #endif /* USE_ITT_BUILD */ 1639 1640 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1641 "master_th=%p, gtid=%d\n", 1642 root, parent_team, master_th, gtid)); 1643 __kmp_internal_fork(loc, gtid, parent_team); 1644 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1645 "master_th=%p, gtid=%d\n", 1646 root, parent_team, master_th, gtid)); 1647 1648 if (call_context == fork_context_gnu) 1649 return TRUE; 1650 1651 /* Invoke microtask for MASTER thread */ 1652 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1653 parent_team->t.t_id, parent_team->t.t_pkfn)); 1654 1655 if (!parent_team->t.t_invoke(gtid)) { 1656 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 1657 } 1658 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1659 parent_team->t.t_id, parent_team->t.t_pkfn)); 1660 KMP_MB(); /* Flush all pending memory write invalidates. */ 1661 1662 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1663 1664 return TRUE; 1665 } // Parallel closely nested in teams construct 1666 1667 #if KMP_DEBUG 1668 if (__kmp_tasking_mode != tskm_immediate_exec) { 1669 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1670 parent_team->t.t_task_team[master_th->th.th_task_state]); 1671 } 1672 #endif 1673 1674 if (parent_team->t.t_active_level >= 1675 master_th->th.th_current_task->td_icvs.max_active_levels) { 1676 nthreads = 1; 1677 } else { 1678 int enter_teams = ((ap == NULL && active_level == 0) || 1679 (ap && teams_level > 0 && teams_level == level)); 1680 nthreads = 1681 master_set_numthreads 1682 ? master_set_numthreads 1683 : get__nproc_2( 1684 parent_team, 1685 master_tid); // TODO: get nproc directly from current task 1686 1687 // Check if we need to take forkjoin lock? (no need for serialized 1688 // parallel out of teams construct). 
This code moved here from 1689 // __kmp_reserve_threads() to speedup nested serialized parallels. 1690 if (nthreads > 1) { 1691 if ((get__max_active_levels(master_th) == 1 && 1692 (root->r.r_in_parallel && !enter_teams)) || 1693 (__kmp_library == library_serial)) { 1694 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1695 " threads\n", 1696 gtid, nthreads)); 1697 nthreads = 1; 1698 } 1699 } 1700 if (nthreads > 1) { 1701 /* determine how many new threads we can use */ 1702 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1703 /* AC: If we execute teams from parallel region (on host), then teams 1704 should be created but each can only have 1 thread if nesting is 1705 disabled. If teams called from serial region, then teams and their 1706 threads should be created regardless of the nesting setting. */ 1707 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1708 nthreads, enter_teams); 1709 if (nthreads == 1) { 1710 // Free lock for single thread execution here; for multi-thread 1711 // execution it will be freed later after team of threads created 1712 // and initialized 1713 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1714 } 1715 } 1716 } 1717 KMP_DEBUG_ASSERT(nthreads > 0); 1718 1719 // If we temporarily changed the set number of threads then restore it now 1720 master_th->th.th_set_nproc = 0; 1721 1722 /* create a serialized parallel region? */ 1723 if (nthreads == 1) { 1724 /* josh todo: hypothetical question: what do we do for OS X*? */ 1725 #if KMP_OS_LINUX && \ 1726 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1727 void *args[argc]; 1728 #else 1729 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1730 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1731 KMP_ARCH_AARCH64) */ 1732 1733 KA_TRACE(20, 1734 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1735 1736 __kmpc_serialized_parallel(loc, gtid); 1737 1738 if (call_context == fork_context_intel) { 1739 /* TODO this sucks, use the compiler itself to pass args! :) */ 1740 master_th->th.th_serial_team->t.t_ident = loc; 1741 if (!ap) { 1742 // revert change made in __kmpc_serialized_parallel() 1743 master_th->th.th_serial_team->t.t_level--; 1744 // Get args from parent team for teams construct 1745 1746 #if OMPT_SUPPORT 1747 void *dummy; 1748 void **exit_frame_p; 1749 ompt_task_info_t *task_info; 1750 1751 ompt_lw_taskteam_t lw_taskteam; 1752 1753 if (ompt_enabled.enabled) { 1754 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1755 &ompt_parallel_data, return_address); 1756 1757 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1758 // don't use lw_taskteam after linking. 
content was swaped 1759 1760 task_info = OMPT_CUR_TASK_INFO(master_th); 1761 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1762 if (ompt_enabled.ompt_callback_implicit_task) { 1763 OMPT_CUR_TASK_INFO(master_th) 1764 ->thread_num = __kmp_tid_from_gtid(gtid); 1765 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1766 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1767 &(task_info->task_data), 1, 1768 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1769 ompt_task_implicit); 1770 } 1771 1772 /* OMPT state */ 1773 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1774 } else { 1775 exit_frame_p = &dummy; 1776 } 1777 #endif 1778 1779 { 1780 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1781 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1782 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1783 parent_team->t.t_argv 1784 #if OMPT_SUPPORT 1785 , 1786 exit_frame_p 1787 #endif 1788 ); 1789 } 1790 1791 #if OMPT_SUPPORT 1792 if (ompt_enabled.enabled) { 1793 *exit_frame_p = NULL; 1794 if (ompt_enabled.ompt_callback_implicit_task) { 1795 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1796 ompt_scope_end, NULL, &(task_info->task_data), 1, 1797 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1798 ompt_task_implicit); 1799 } 1800 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1801 __ompt_lw_taskteam_unlink(master_th); 1802 if (ompt_enabled.ompt_callback_parallel_end) { 1803 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1804 &ompt_parallel_data, parent_task_data, 1805 OMPT_INVOKER(call_context) | ompt_parallel_team, 1806 return_address); 1807 } 1808 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1809 } 1810 #endif 1811 } else if (microtask == (microtask_t)__kmp_teams_master) { 1812 KMP_DEBUG_ASSERT(master_th->th.th_team == 1813 master_th->th.th_serial_team); 1814 team = master_th->th.th_team; 1815 // team->t.t_pkfn = microtask; 1816 team->t.t_invoke = invoker; 1817 __kmp_alloc_argv_entries(argc, team, TRUE); 1818 team->t.t_argc = argc; 1819 argv = (void **)team->t.t_argv; 1820 if (ap) { 1821 for (i = argc - 1; i >= 0; --i) 1822 *argv++ = va_arg(kmp_va_deref(ap), void *); 1823 } else { 1824 for (i = 0; i < argc; ++i) 1825 // Get args from parent team for teams construct 1826 argv[i] = parent_team->t.t_argv[i]; 1827 } 1828 // AC: revert change made in __kmpc_serialized_parallel() 1829 // because initial code in teams should have level=0 1830 team->t.t_level--; 1831 // AC: call special invoker for outer "parallel" of teams construct 1832 invoker(gtid); 1833 #if OMPT_SUPPORT 1834 if (ompt_enabled.enabled) { 1835 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1836 if (ompt_enabled.ompt_callback_implicit_task) { 1837 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1838 ompt_scope_end, NULL, &(task_info->task_data), 0, 1839 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1840 } 1841 if (ompt_enabled.ompt_callback_parallel_end) { 1842 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1843 &ompt_parallel_data, parent_task_data, 1844 OMPT_INVOKER(call_context) | ompt_parallel_league, 1845 return_address); 1846 } 1847 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1848 } 1849 #endif 1850 } else { 1851 argv = args; 1852 for (i = argc - 1; i >= 0; --i) 1853 *argv++ = va_arg(kmp_va_deref(ap), void *); 1854 KMP_MB(); 1855 1856 #if OMPT_SUPPORT 1857 void *dummy; 1858 void **exit_frame_p; 1859 ompt_task_info_t *task_info; 1860 1861 ompt_lw_taskteam_t lw_taskteam; 1862 1863 if (ompt_enabled.enabled) { 1864 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1865 &ompt_parallel_data, return_address); 1866 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1867 // don't use lw_taskteam after linking. content was swaped 1868 task_info = OMPT_CUR_TASK_INFO(master_th); 1869 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1870 1871 /* OMPT implicit task begin */ 1872 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1873 if (ompt_enabled.ompt_callback_implicit_task) { 1874 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1875 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1876 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1877 ompt_task_implicit); 1878 OMPT_CUR_TASK_INFO(master_th) 1879 ->thread_num = __kmp_tid_from_gtid(gtid); 1880 } 1881 1882 /* OMPT state */ 1883 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1884 } else { 1885 exit_frame_p = &dummy; 1886 } 1887 #endif 1888 1889 { 1890 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1891 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1892 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1893 #if OMPT_SUPPORT 1894 , 1895 exit_frame_p 1896 #endif 1897 ); 1898 } 1899 1900 #if OMPT_SUPPORT 1901 if (ompt_enabled.enabled) { 1902 *exit_frame_p = NULL; 1903 if (ompt_enabled.ompt_callback_implicit_task) { 1904 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1905 ompt_scope_end, NULL, &(task_info->task_data), 1, 1906 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1907 ompt_task_implicit); 1908 } 1909 1910 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1911 __ompt_lw_taskteam_unlink(master_th); 1912 if (ompt_enabled.ompt_callback_parallel_end) { 1913 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1914 &ompt_parallel_data, parent_task_data, 1915 OMPT_INVOKER(call_context) | ompt_parallel_team, 1916 return_address); 1917 } 1918 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1919 } 1920 #endif 1921 } 1922 } else if (call_context == fork_context_gnu) { 1923 #if OMPT_SUPPORT 1924 ompt_lw_taskteam_t lwt; 1925 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1926 return_address); 1927 1928 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1929 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1930 // don't use lw_taskteam after linking. 
        // content was swapped
#endif

        // we were called from GNU native code
        KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
        return FALSE;
      } else {
        KMP_ASSERT2(call_context < fork_context_last,
                    "__kmp_fork_call: unknown fork_context parameter");
      }

      KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid));
      KMP_MB();
      return FALSE;
    } // if (nthreads == 1)

    // GEH: only modify the executing flag in the case when not serialized
    // serialized case is handled in kmpc_serialized_parallel
    KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, "
                  "curtask=%p, curtask_max_aclevel=%d\n",
                  parent_team->t.t_active_level, master_th,
                  master_th->th.th_current_task,
                  master_th->th.th_current_task->td_icvs.max_active_levels));
    // TODO: GEH - cannot do this assertion because root thread not set up as
    // executing
    // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 );
    master_th->th.th_current_task->td_flags.executing = 0;

    if (!master_th->th.th_teams_microtask || level > teams_level) {
      /* Increment our nested depth level */
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
    }

    // See if we need to make a copy of the ICVs.
    int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc;
    if ((level + 1 < __kmp_nested_nth.used) &&
        (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) {
      nthreads_icv = __kmp_nested_nth.nth[level + 1];
    } else {
      nthreads_icv = 0; // don't update
    }

    // Figure out the proc_bind_policy for the new team.
    kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind;
    kmp_proc_bind_t proc_bind_icv =
        proc_bind_default; // proc_bind_default means don't update
    if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
      proc_bind = proc_bind_false;
    } else {
      if (proc_bind == proc_bind_default) {
        // No proc_bind clause specified; use current proc-bind-var for this
        // parallel region
        proc_bind = master_th->th.th_current_task->td_icvs.proc_bind;
      }
      /* else: The proc_bind policy was specified explicitly on parallel clause.
         This overrides proc-bind-var for this parallel region, but does not
         change proc-bind-var. */
      // Figure out the value of proc-bind-var for the child threads.
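      // Illustrative assumption about how the nesting table is typically
      // filled (via OMP_PROC_BIND; nothing in the lookup below depends on it):
      // with OMP_PROC_BIND=spread,close, __kmp_nested_proc_bind.bind_types
      // holds { spread, close }, so the outermost parallel binds its threads
      // "spread" while the level + 1 lookup below hands "close" down as the
      // child threads' proc-bind-var.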
1988 if ((level + 1 < __kmp_nested_proc_bind.used) && 1989 (__kmp_nested_proc_bind.bind_types[level + 1] != 1990 master_th->th.th_current_task->td_icvs.proc_bind)) { 1991 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1992 } 1993 } 1994 1995 // Reset for next parallel region 1996 master_th->th.th_set_proc_bind = proc_bind_default; 1997 1998 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1999 kmp_internal_control_t new_icvs; 2000 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2001 new_icvs.next = NULL; 2002 if (nthreads_icv > 0) { 2003 new_icvs.nproc = nthreads_icv; 2004 } 2005 if (proc_bind_icv != proc_bind_default) { 2006 new_icvs.proc_bind = proc_bind_icv; 2007 } 2008 2009 /* allocate a new parallel team */ 2010 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2011 team = __kmp_allocate_team(root, nthreads, nthreads, 2012 #if OMPT_SUPPORT 2013 ompt_parallel_data, 2014 #endif 2015 proc_bind, &new_icvs, 2016 argc USE_NESTED_HOT_ARG(master_th)); 2017 } else { 2018 /* allocate a new parallel team */ 2019 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2020 team = __kmp_allocate_team(root, nthreads, nthreads, 2021 #if OMPT_SUPPORT 2022 ompt_parallel_data, 2023 #endif 2024 proc_bind, 2025 &master_th->th.th_current_task->td_icvs, 2026 argc USE_NESTED_HOT_ARG(master_th)); 2027 } 2028 KF_TRACE( 2029 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2030 2031 /* setup the new team */ 2032 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2033 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2034 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2035 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2036 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2037 #if OMPT_SUPPORT 2038 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2039 return_address); 2040 #endif 2041 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2042 // TODO: parent_team->t.t_level == INT_MAX ??? 2043 if (!master_th->th.th_teams_microtask || level > teams_level) { 2044 int new_level = parent_team->t.t_level + 1; 2045 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2046 new_level = parent_team->t.t_active_level + 1; 2047 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2048 } else { 2049 // AC: Do not increase parallel level at start of the teams construct 2050 int new_level = parent_team->t.t_level; 2051 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2052 new_level = parent_team->t.t_active_level; 2053 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2054 } 2055 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2056 // set master's schedule as new run-time schedule 2057 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2058 2059 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2060 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2061 2062 // Update the floating point rounding in the team if required. 2063 propagateFPControl(team); 2064 2065 if (__kmp_tasking_mode != tskm_immediate_exec) { 2066 // Set master's task team to team's task team. Unless this is hot team, it 2067 // should be NULL. 
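    // The master's current th_task_state is checkpointed on a per-thread byte
    // stack (th_task_state_memo_stack) before switching to the new team and is
    // popped again in __kmp_join_call.  A minimal sketch of the push path used
    // below, simplified from the real code (which copies element by element
    // and zero-fills the tail of the grown stack):
    //
    //   if (top >= size) {                      // stack full: double it
    //     kmp_uint8 *bigger = (kmp_uint8 *)__kmp_allocate(2 * size);
    //     KMP_MEMCPY(bigger, stack, size);
    //     __kmp_free(stack);
    //     stack = bigger;
    //     size *= 2;
    //   }
    //   stack[top++] = master_th->th.th_task_state; // push current state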
2068 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2069 parent_team->t.t_task_team[master_th->th.th_task_state]); 2070 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2071 "%p, new task_team %p / team %p\n", 2072 __kmp_gtid_from_thread(master_th), 2073 master_th->th.th_task_team, parent_team, 2074 team->t.t_task_team[master_th->th.th_task_state], team)); 2075 2076 if (active_level || master_th->th.th_task_team) { 2077 // Take a memo of master's task_state 2078 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2079 if (master_th->th.th_task_state_top >= 2080 master_th->th.th_task_state_stack_sz) { // increase size 2081 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2082 kmp_uint8 *old_stack, *new_stack; 2083 kmp_uint32 i; 2084 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2085 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2086 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2087 } 2088 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2089 ++i) { // zero-init rest of stack 2090 new_stack[i] = 0; 2091 } 2092 old_stack = master_th->th.th_task_state_memo_stack; 2093 master_th->th.th_task_state_memo_stack = new_stack; 2094 master_th->th.th_task_state_stack_sz = new_size; 2095 __kmp_free(old_stack); 2096 } 2097 // Store master's task_state on stack 2098 master_th->th 2099 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2100 master_th->th.th_task_state; 2101 master_th->th.th_task_state_top++; 2102 #if KMP_NESTED_HOT_TEAMS 2103 if (master_th->th.th_hot_teams && 2104 active_level < __kmp_hot_teams_max_level && 2105 team == master_th->th.th_hot_teams[active_level].hot_team) { 2106 // Restore master's nested state if nested hot team 2107 master_th->th.th_task_state = 2108 master_th->th 2109 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2110 } else { 2111 #endif 2112 master_th->th.th_task_state = 0; 2113 #if KMP_NESTED_HOT_TEAMS 2114 } 2115 #endif 2116 } 2117 #if !KMP_NESTED_HOT_TEAMS 2118 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2119 (team == root->r.r_hot_team)); 2120 #endif 2121 } 2122 2123 KA_TRACE( 2124 20, 2125 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2126 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2127 team->t.t_nproc)); 2128 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2129 (team->t.t_master_tid == 0 && 2130 (team->t.t_parent == root->r.r_root_team || 2131 team->t.t_parent->t.t_serialized))); 2132 KMP_MB(); 2133 2134 /* now, setup the arguments */ 2135 argv = (void **)team->t.t_argv; 2136 if (ap) { 2137 for (i = argc - 1; i >= 0; --i) { 2138 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2139 KMP_CHECK_UPDATE(*argv, new_argv); 2140 argv++; 2141 } 2142 } else { 2143 for (i = 0; i < argc; ++i) { 2144 // Get args from parent team for teams construct 2145 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2146 } 2147 } 2148 2149 /* now actually fork the threads */ 2150 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2151 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2152 root->r.r_active = TRUE; 2153 2154 __kmp_fork_team_threads(root, team, master_th, gtid); 2155 __kmp_setup_icv_copy(team, nthreads, 2156 &master_th->th.th_current_task->td_icvs, loc); 2157 2158 #if OMPT_SUPPORT 2159 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2160 #endif 2161 2162 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2163 2164 #if USE_ITT_BUILD 2165 if 
(team->t.t_active_level == 1 // only report frames at level 1 2166 && !master_th->th.th_teams_microtask) { // not in teams construct 2167 #if USE_ITT_NOTIFY 2168 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2169 (__kmp_forkjoin_frames_mode == 3 || 2170 __kmp_forkjoin_frames_mode == 1)) { 2171 kmp_uint64 tmp_time = 0; 2172 if (__itt_get_timestamp_ptr) 2173 tmp_time = __itt_get_timestamp(); 2174 // Internal fork - report frame begin 2175 master_th->th.th_frame_time = tmp_time; 2176 if (__kmp_forkjoin_frames_mode == 3) 2177 team->t.t_region_time = tmp_time; 2178 } else 2179 // only one notification scheme (either "submit" or "forking/joined", not both) 2180 #endif /* USE_ITT_NOTIFY */ 2181 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2182 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2183 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2184 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2185 } 2186 } 2187 #endif /* USE_ITT_BUILD */ 2188 2189 /* now go on and do the work */ 2190 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2191 KMP_MB(); 2192 KF_TRACE(10, 2193 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2194 root, team, master_th, gtid)); 2195 2196 #if USE_ITT_BUILD 2197 if (__itt_stack_caller_create_ptr) { 2198 team->t.t_stack_id = 2199 __kmp_itt_stack_caller_create(); // create new stack stitching id 2200 // before entering fork barrier 2201 } 2202 #endif /* USE_ITT_BUILD */ 2203 2204 // AC: skip __kmp_internal_fork at teams construct, let only master 2205 // threads execute 2206 if (ap) { 2207 __kmp_internal_fork(loc, gtid, team); 2208 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2209 "master_th=%p, gtid=%d\n", 2210 root, team, master_th, gtid)); 2211 } 2212 2213 if (call_context == fork_context_gnu) { 2214 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2215 return TRUE; 2216 } 2217 2218 /* Invoke microtask for MASTER thread */ 2219 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2220 team->t.t_id, team->t.t_pkfn)); 2221 } // END of timer KMP_fork_call block 2222 2223 #if KMP_STATS_ENABLED 2224 // If beginning a teams construct, then change thread state 2225 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2226 if (!ap) { 2227 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2228 } 2229 #endif 2230 2231 if (!team->t.t_invoke(gtid)) { 2232 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2233 } 2234 2235 #if KMP_STATS_ENABLED 2236 // If was beginning of a teams construct, then reset thread state 2237 if (!ap) { 2238 KMP_SET_THREAD_STATE(previous_state); 2239 } 2240 #endif 2241 2242 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2243 team->t.t_id, team->t.t_pkfn)); 2244 KMP_MB(); /* Flush all pending memory write invalidates. */ 2245 2246 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2247 2248 #if OMPT_SUPPORT 2249 if (ompt_enabled.enabled) { 2250 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2251 } 2252 #endif 2253 2254 return TRUE; 2255 } 2256 2257 #if OMPT_SUPPORT 2258 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2259 kmp_team_t *team) { 2260 // restore state outside the region 2261 thread->th.ompt_thread_info.state = 2262 ((team->t.t_serialized) ? 
ompt_state_work_serial 2263 : ompt_state_work_parallel); 2264 } 2265 2266 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2267 kmp_team_t *team, ompt_data_t *parallel_data, 2268 int flags, void *codeptr) { 2269 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2270 if (ompt_enabled.ompt_callback_parallel_end) { 2271 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2272 parallel_data, &(task_info->task_data), flags, codeptr); 2273 } 2274 2275 task_info->frame.enter_frame = ompt_data_none; 2276 __kmp_join_restore_state(thread, team); 2277 } 2278 #endif 2279 2280 void __kmp_join_call(ident_t *loc, int gtid 2281 #if OMPT_SUPPORT 2282 , 2283 enum fork_context_e fork_context 2284 #endif 2285 , 2286 int exit_teams) { 2287 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2288 kmp_team_t *team; 2289 kmp_team_t *parent_team; 2290 kmp_info_t *master_th; 2291 kmp_root_t *root; 2292 int master_active; 2293 2294 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2295 2296 /* setup current data */ 2297 master_th = __kmp_threads[gtid]; 2298 root = master_th->th.th_root; 2299 team = master_th->th.th_team; 2300 parent_team = team->t.t_parent; 2301 2302 master_th->th.th_ident = loc; 2303 2304 #if OMPT_SUPPORT 2305 void *team_microtask = (void *)team->t.t_pkfn; 2306 // For GOMP interface with serialized parallel, need the 2307 // __kmpc_end_serialized_parallel to call hooks for OMPT end-implicit-task 2308 // and end-parallel events. 2309 if (ompt_enabled.enabled && 2310 !(team->t.t_serialized && fork_context == fork_context_gnu)) { 2311 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2312 } 2313 #endif 2314 2315 #if KMP_DEBUG 2316 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2317 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2318 "th_task_team = %p\n", 2319 __kmp_gtid_from_thread(master_th), team, 2320 team->t.t_task_team[master_th->th.th_task_state], 2321 master_th->th.th_task_team)); 2322 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2323 team->t.t_task_team[master_th->th.th_task_state]); 2324 } 2325 #endif 2326 2327 if (team->t.t_serialized) { 2328 if (master_th->th.th_teams_microtask) { 2329 // We are in teams construct 2330 int level = team->t.t_level; 2331 int tlevel = master_th->th.th_teams_level; 2332 if (level == tlevel) { 2333 // AC: we haven't incremented it earlier at start of teams construct, 2334 // so do it here - at the end of teams construct 2335 team->t.t_level++; 2336 } else if (level == tlevel + 1) { 2337 // AC: we are exiting parallel inside teams, need to increment 2338 // serialization in order to restore it in the next call to 2339 // __kmpc_end_serialized_parallel 2340 team->t.t_serialized++; 2341 } 2342 } 2343 __kmpc_end_serialized_parallel(loc, gtid); 2344 2345 #if OMPT_SUPPORT 2346 if (ompt_enabled.enabled) { 2347 __kmp_join_restore_state(master_th, parent_team); 2348 } 2349 #endif 2350 2351 return; 2352 } 2353 2354 master_active = team->t.t_master_active; 2355 2356 if (!exit_teams) { 2357 // AC: No barrier for internal teams at exit from teams construct. 2358 // But there is barrier for external team (league). 
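    // __kmp_internal_join below is where the master waits at the join barrier
    // for the workers of `team`; an ordinary `#pragma omp parallel` region
    // ends through this path.  The exit_teams branch skips the barrier since,
    // as noted above, internal teams are not joined at the end of a teams
    // construct.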
2359 __kmp_internal_join(loc, gtid, team); 2360 } else { 2361 master_th->th.th_task_state = 2362 0; // AC: no tasking in teams (out of any parallel) 2363 } 2364 2365 KMP_MB(); 2366 2367 #if OMPT_SUPPORT 2368 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2369 void *codeptr = team->t.ompt_team_info.master_return_address; 2370 #endif 2371 2372 #if USE_ITT_BUILD 2373 if (__itt_stack_caller_create_ptr) { 2374 // destroy the stack stitching id after join barrier 2375 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2376 } 2377 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2378 if (team->t.t_active_level == 1 && 2379 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2380 master_th->th.th_teams_size.nteams == 1)) { 2381 master_th->th.th_ident = loc; 2382 // only one notification scheme (either "submit" or "forking/joined", not 2383 // both) 2384 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2385 __kmp_forkjoin_frames_mode == 3) 2386 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2387 master_th->th.th_frame_time, 0, loc, 2388 master_th->th.th_team_nproc, 1); 2389 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2390 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2391 __kmp_itt_region_joined(gtid); 2392 } // active_level == 1 2393 #endif /* USE_ITT_BUILD */ 2394 2395 if (master_th->th.th_teams_microtask && !exit_teams && 2396 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2397 team->t.t_level == master_th->th.th_teams_level + 1) { 2398 // AC: We need to leave the team structure intact at the end of parallel 2399 // inside the teams construct, so that at the next parallel same (hot) team 2400 // works, only adjust nesting levels 2401 #if OMPT_SUPPORT 2402 ompt_data_t ompt_parallel_data = ompt_data_none; 2403 if (ompt_enabled.enabled) { 2404 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2405 if (ompt_enabled.ompt_callback_implicit_task) { 2406 int ompt_team_size = team->t.t_nproc; 2407 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2408 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2409 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2410 } 2411 task_info->frame.exit_frame = ompt_data_none; 2412 task_info->task_data = ompt_data_none; 2413 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2414 __ompt_lw_taskteam_unlink(master_th); 2415 } 2416 #endif 2417 /* Decrement our nested depth level */ 2418 team->t.t_level--; 2419 team->t.t_active_level--; 2420 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2421 2422 // Restore number of threads in the team if needed. This code relies on 2423 // the proper adjustment of th_teams_size.nth after the fork in 2424 // __kmp_teams_master on each teams master in the case that 2425 // __kmp_reserve_threads reduced it. 2426 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2427 int old_num = master_th->th.th_team_nproc; 2428 int new_num = master_th->th.th_teams_size.nth; 2429 kmp_info_t **other_threads = team->t.t_threads; 2430 team->t.t_nproc = new_num; 2431 for (int i = 0; i < old_num; ++i) { 2432 other_threads[i]->th.th_team_nproc = new_num; 2433 } 2434 // Adjust states of non-used threads of the team 2435 for (int i = old_num; i < new_num; ++i) { 2436 // Re-initialize thread's barrier data. 
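        // These threads sat out the just-finished reduced-size parallel, so
        // their barrier counters are stale; copying the team's b_arrived
        // values below resynchronizes them for the next fork/join cycle.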
2437 KMP_DEBUG_ASSERT(other_threads[i]); 2438 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2439 for (int b = 0; b < bs_last_barrier; ++b) { 2440 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2441 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2442 #if USE_DEBUGGER 2443 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2444 #endif 2445 } 2446 if (__kmp_tasking_mode != tskm_immediate_exec) { 2447 // Synchronize thread's task state 2448 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2449 } 2450 } 2451 } 2452 2453 #if OMPT_SUPPORT 2454 if (ompt_enabled.enabled) { 2455 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2456 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2457 } 2458 #endif 2459 2460 return; 2461 } 2462 2463 /* do cleanup and restore the parent team */ 2464 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2465 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2466 2467 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2468 2469 /* jc: The following lock has instructions with REL and ACQ semantics, 2470 separating the parallel user code called in this parallel region 2471 from the serial user code called after this function returns. */ 2472 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2473 2474 if (!master_th->th.th_teams_microtask || 2475 team->t.t_level > master_th->th.th_teams_level) { 2476 /* Decrement our nested depth level */ 2477 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2478 } 2479 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2480 2481 #if OMPT_SUPPORT 2482 if (ompt_enabled.enabled) { 2483 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2484 if (ompt_enabled.ompt_callback_implicit_task) { 2485 int flags = (team_microtask == (void *)__kmp_teams_master) 2486 ? ompt_task_initial 2487 : ompt_task_implicit; 2488 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2489 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2490 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2491 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2492 } 2493 task_info->frame.exit_frame = ompt_data_none; 2494 task_info->task_data = ompt_data_none; 2495 } 2496 #endif 2497 2498 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2499 master_th, team)); 2500 __kmp_pop_current_task_from_thread(master_th); 2501 2502 #if KMP_AFFINITY_SUPPORTED 2503 // Restore master thread's partition. 2504 master_th->th.th_first_place = team->t.t_first_place; 2505 master_th->th.th_last_place = team->t.t_last_place; 2506 #endif // KMP_AFFINITY_SUPPORTED 2507 master_th->th.th_def_allocator = team->t.t_def_allocator; 2508 2509 updateHWFPControl(team); 2510 2511 if (root->r.r_active != master_active) 2512 root->r.r_active = master_active; 2513 2514 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2515 master_th)); // this will free worker threads 2516 2517 /* this race was fun to find. make sure the following is in the critical 2518 region otherwise assertions may fail occasionally since the old team may be 2519 reallocated and the hierarchy appears inconsistent. it is actually safe to 2520 run and won't cause any bugs, but will cause those assertion failures. 
it's 2521 only one deref&assign so might as well put this in the critical region */ 2522 master_th->th.th_team = parent_team; 2523 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2524 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2525 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2526 2527 /* restore serialized team, if need be */ 2528 if (parent_team->t.t_serialized && 2529 parent_team != master_th->th.th_serial_team && 2530 parent_team != root->r.r_root_team) { 2531 __kmp_free_team(root, 2532 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2533 master_th->th.th_serial_team = parent_team; 2534 } 2535 2536 if (__kmp_tasking_mode != tskm_immediate_exec) { 2537 if (master_th->th.th_task_state_top > 2538 0) { // Restore task state from memo stack 2539 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2540 // Remember master's state if we re-use this nested hot team 2541 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2542 master_th->th.th_task_state; 2543 --master_th->th.th_task_state_top; // pop 2544 // Now restore state at this level 2545 master_th->th.th_task_state = 2546 master_th->th 2547 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2548 } 2549 // Copy the task team from the parent team to the master thread 2550 master_th->th.th_task_team = 2551 parent_team->t.t_task_team[master_th->th.th_task_state]; 2552 KA_TRACE(20, 2553 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2554 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2555 parent_team)); 2556 } 2557 2558 // TODO: GEH - cannot do this assertion because root thread not set up as 2559 // executing 2560 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2561 master_th->th.th_current_task->td_flags.executing = 1; 2562 2563 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2564 2565 #if OMPT_SUPPORT 2566 int flags = 2567 OMPT_INVOKER(fork_context) | 2568 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2569 : ompt_parallel_team); 2570 if (ompt_enabled.enabled) { 2571 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2572 codeptr); 2573 } 2574 #endif 2575 2576 KMP_MB(); 2577 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2578 } 2579 2580 /* Check whether we should push an internal control record onto the 2581 serial team stack. If so, do it. 
*/ 2582 void __kmp_save_internal_controls(kmp_info_t *thread) { 2583 2584 if (thread->th.th_team != thread->th.th_serial_team) { 2585 return; 2586 } 2587 if (thread->th.th_team->t.t_serialized > 1) { 2588 int push = 0; 2589 2590 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2591 push = 1; 2592 } else { 2593 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2594 thread->th.th_team->t.t_serialized) { 2595 push = 1; 2596 } 2597 } 2598 if (push) { /* push a record on the serial team's stack */ 2599 kmp_internal_control_t *control = 2600 (kmp_internal_control_t *)__kmp_allocate( 2601 sizeof(kmp_internal_control_t)); 2602 2603 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2604 2605 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2606 2607 control->next = thread->th.th_team->t.t_control_stack_top; 2608 thread->th.th_team->t.t_control_stack_top = control; 2609 } 2610 } 2611 } 2612 2613 /* Changes set_nproc */ 2614 void __kmp_set_num_threads(int new_nth, int gtid) { 2615 kmp_info_t *thread; 2616 kmp_root_t *root; 2617 2618 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2619 KMP_DEBUG_ASSERT(__kmp_init_serial); 2620 2621 if (new_nth < 1) 2622 new_nth = 1; 2623 else if (new_nth > __kmp_max_nth) 2624 new_nth = __kmp_max_nth; 2625 2626 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2627 thread = __kmp_threads[gtid]; 2628 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2629 return; // nothing to do 2630 2631 __kmp_save_internal_controls(thread); 2632 2633 set__nproc(thread, new_nth); 2634 2635 // If this omp_set_num_threads() call will cause the hot team size to be 2636 // reduced (in the absence of a num_threads clause), then reduce it now, 2637 // rather than waiting for the next parallel region. 2638 root = thread->th.th_root; 2639 if (__kmp_init_parallel && (!root->r.r_active) && 2640 (root->r.r_hot_team->t.t_nproc > new_nth) 2641 #if KMP_NESTED_HOT_TEAMS 2642 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2643 #endif 2644 ) { 2645 kmp_team_t *hot_team = root->r.r_hot_team; 2646 int f; 2647 2648 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2649 2650 // Release the extra threads we don't need any more. 2651 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2652 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2653 if (__kmp_tasking_mode != tskm_immediate_exec) { 2654 // When decreasing team size, threads no longer in the team should unref 2655 // task team. 2656 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2657 } 2658 __kmp_free_thread(hot_team->t.t_threads[f]); 2659 hot_team->t.t_threads[f] = NULL; 2660 } 2661 hot_team->t.t_nproc = new_nth; 2662 #if KMP_NESTED_HOT_TEAMS 2663 if (thread->th.th_hot_teams) { 2664 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2665 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2666 } 2667 #endif 2668 2669 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2670 2671 // Update the t_nproc field in the threads that are still active. 
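    // (Each remaining thread caches the team size in th_team_nproc; the loop
    // below refreshes those cached copies.)  A user-level illustration of when
    // this shrink path runs, with arbitrary thread counts:
    //
    //   omp_set_num_threads(8);
    //   #pragma omp parallel      // hot team now holds 8 threads
    //   { /* work */ }
    //   omp_set_num_threads(2);   // extra hot-team threads are released here,
    //                             // rather than at the next parallel region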
2672 for (f = 0; f < new_nth; f++) { 2673 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2674 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2675 } 2676 // Special flag in case omp_set_num_threads() call 2677 hot_team->t.t_size_changed = -1; 2678 } 2679 } 2680 2681 /* Changes max_active_levels */ 2682 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2683 kmp_info_t *thread; 2684 2685 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2686 "%d = (%d)\n", 2687 gtid, max_active_levels)); 2688 KMP_DEBUG_ASSERT(__kmp_init_serial); 2689 2690 // validate max_active_levels 2691 if (max_active_levels < 0) { 2692 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2693 // We ignore this call if the user has specified a negative value. 2694 // The current setting won't be changed. The last valid setting will be 2695 // used. A warning will be issued (if warnings are allowed as controlled by 2696 // the KMP_WARNINGS env var). 2697 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2698 "max_active_levels for thread %d = (%d)\n", 2699 gtid, max_active_levels)); 2700 return; 2701 } 2702 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2703 // it's OK, the max_active_levels is within the valid range: [ 0; 2704 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2705 // We allow a zero value. (implementation defined behavior) 2706 } else { 2707 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2708 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2709 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2710 // Current upper limit is MAX_INT. (implementation defined behavior) 2711 // If the input exceeds the upper limit, we correct the input to be the 2712 // upper limit. (implementation defined behavior) 2713 // Actually, the flow should never get here until we use MAX_INT limit. 2714 } 2715 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2716 "max_active_levels for thread %d = (%d)\n", 2717 gtid, max_active_levels)); 2718 2719 thread = __kmp_threads[gtid]; 2720 2721 __kmp_save_internal_controls(thread); 2722 2723 set__max_active_levels(thread, max_active_levels); 2724 } 2725 2726 /* Gets max_active_levels */ 2727 int __kmp_get_max_active_levels(int gtid) { 2728 kmp_info_t *thread; 2729 2730 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2731 KMP_DEBUG_ASSERT(__kmp_init_serial); 2732 2733 thread = __kmp_threads[gtid]; 2734 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2735 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2736 "curtask_maxaclevel=%d\n", 2737 gtid, thread->th.th_current_task, 2738 thread->th.th_current_task->td_icvs.max_active_levels)); 2739 return thread->th.th_current_task->td_icvs.max_active_levels; 2740 } 2741 2742 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2743 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2744 2745 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2746 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2747 kmp_info_t *thread; 2748 kmp_sched_t orig_kind; 2749 // kmp_team_t *team; 2750 2751 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2752 gtid, (int)kind, chunk)); 2753 KMP_DEBUG_ASSERT(__kmp_init_serial); 2754 2755 // Check if the kind parameter is valid, correct if needed. 
2756 // Valid parameters should fit in one of two intervals - standard or extended: 2757 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2758 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2759 orig_kind = kind; 2760 kind = __kmp_sched_without_mods(kind); 2761 2762 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2763 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2764 // TODO: Hint needs attention in case we change the default schedule. 2765 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2766 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2767 __kmp_msg_null); 2768 kind = kmp_sched_default; 2769 chunk = 0; // ignore chunk value in case of bad kind 2770 } 2771 2772 thread = __kmp_threads[gtid]; 2773 2774 __kmp_save_internal_controls(thread); 2775 2776 if (kind < kmp_sched_upper_std) { 2777 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2778 // differ static chunked vs. unchunked: chunk should be invalid to 2779 // indicate unchunked schedule (which is the default) 2780 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2781 } else { 2782 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2783 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2784 } 2785 } else { 2786 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2787 // kmp_sched_lower - 2 ]; 2788 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2789 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2790 kmp_sched_lower - 2]; 2791 } 2792 __kmp_sched_apply_mods_intkind( 2793 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2794 if (kind == kmp_sched_auto || chunk < 1) { 2795 // ignore parameter chunk for schedule auto 2796 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2797 } else { 2798 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2799 } 2800 } 2801 2802 /* Gets def_sched_var ICV values */ 2803 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2804 kmp_info_t *thread; 2805 enum sched_type th_type; 2806 2807 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2808 KMP_DEBUG_ASSERT(__kmp_init_serial); 2809 2810 thread = __kmp_threads[gtid]; 2811 2812 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2813 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2814 case kmp_sch_static: 2815 case kmp_sch_static_greedy: 2816 case kmp_sch_static_balanced: 2817 *kind = kmp_sched_static; 2818 __kmp_sched_apply_mods_stdkind(kind, th_type); 2819 *chunk = 0; // chunk was not set, try to show this fact via zero value 2820 return; 2821 case kmp_sch_static_chunked: 2822 *kind = kmp_sched_static; 2823 break; 2824 case kmp_sch_dynamic_chunked: 2825 *kind = kmp_sched_dynamic; 2826 break; 2827 case kmp_sch_guided_chunked: 2828 case kmp_sch_guided_iterative_chunked: 2829 case kmp_sch_guided_analytical_chunked: 2830 *kind = kmp_sched_guided; 2831 break; 2832 case kmp_sch_auto: 2833 *kind = kmp_sched_auto; 2834 break; 2835 case kmp_sch_trapezoidal: 2836 *kind = kmp_sched_trapezoidal; 2837 break; 2838 #if KMP_STATIC_STEAL_ENABLED 2839 case kmp_sch_static_steal: 2840 *kind = kmp_sched_static_steal; 2841 break; 2842 #endif 2843 default: 2844 KMP_FATAL(UnknownSchedulingType, th_type); 2845 } 2846 2847 __kmp_sched_apply_mods_stdkind(kind, th_type); 2848 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2849 } 2850 2851 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2852 2853 int ii, dd; 2854 kmp_team_t *team; 2855 
kmp_info_t *thr; 2856 2857 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2858 KMP_DEBUG_ASSERT(__kmp_init_serial); 2859 2860 // validate level 2861 if (level == 0) 2862 return 0; 2863 if (level < 0) 2864 return -1; 2865 thr = __kmp_threads[gtid]; 2866 team = thr->th.th_team; 2867 ii = team->t.t_level; 2868 if (level > ii) 2869 return -1; 2870 2871 if (thr->th.th_teams_microtask) { 2872 // AC: we are in teams region where multiple nested teams have same level 2873 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2874 if (level <= 2875 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2876 KMP_DEBUG_ASSERT(ii >= tlevel); 2877 // AC: As we need to pass by the teams league, we need to artificially 2878 // increase ii 2879 if (ii == tlevel) { 2880 ii += 2; // three teams have same level 2881 } else { 2882 ii++; // two teams have same level 2883 } 2884 } 2885 } 2886 2887 if (ii == level) 2888 return __kmp_tid_from_gtid(gtid); 2889 2890 dd = team->t.t_serialized; 2891 level++; 2892 while (ii > level) { 2893 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2894 } 2895 if ((team->t.t_serialized) && (!dd)) { 2896 team = team->t.t_parent; 2897 continue; 2898 } 2899 if (ii > level) { 2900 team = team->t.t_parent; 2901 dd = team->t.t_serialized; 2902 ii--; 2903 } 2904 } 2905 2906 return (dd > 1) ? (0) : (team->t.t_master_tid); 2907 } 2908 2909 int __kmp_get_team_size(int gtid, int level) { 2910 2911 int ii, dd; 2912 kmp_team_t *team; 2913 kmp_info_t *thr; 2914 2915 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2916 KMP_DEBUG_ASSERT(__kmp_init_serial); 2917 2918 // validate level 2919 if (level == 0) 2920 return 1; 2921 if (level < 0) 2922 return -1; 2923 thr = __kmp_threads[gtid]; 2924 team = thr->th.th_team; 2925 ii = team->t.t_level; 2926 if (level > ii) 2927 return -1; 2928 2929 if (thr->th.th_teams_microtask) { 2930 // AC: we are in teams region where multiple nested teams have same level 2931 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2932 if (level <= 2933 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2934 KMP_DEBUG_ASSERT(ii >= tlevel); 2935 // AC: As we need to pass by the teams league, we need to artificially 2936 // increase ii 2937 if (ii == tlevel) { 2938 ii += 2; // three teams have same level 2939 } else { 2940 ii++; // two teams have same level 2941 } 2942 } 2943 } 2944 2945 while (ii > level) { 2946 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2947 } 2948 if (team->t.t_serialized && (!dd)) { 2949 team = team->t.t_parent; 2950 continue; 2951 } 2952 if (ii > level) { 2953 team = team->t.t_parent; 2954 ii--; 2955 } 2956 } 2957 2958 return team->t.t_nproc; 2959 } 2960 2961 kmp_r_sched_t __kmp_get_schedule_global() { 2962 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2963 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2964 // independently. So one can get the updated schedule here. 2965 2966 kmp_r_sched_t r_sched; 2967 2968 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2969 // __kmp_guided. 
__kmp_sched should keep original value, so that user can set 2970 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2971 // different roots (even in OMP 2.5) 2972 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2973 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2974 if (s == kmp_sch_static) { 2975 // replace STATIC with more detailed schedule (balanced or greedy) 2976 r_sched.r_sched_type = __kmp_static; 2977 } else if (s == kmp_sch_guided_chunked) { 2978 // replace GUIDED with more detailed schedule (iterative or analytical) 2979 r_sched.r_sched_type = __kmp_guided; 2980 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2981 r_sched.r_sched_type = __kmp_sched; 2982 } 2983 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2984 2985 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2986 // __kmp_chunk may be wrong here (if it was not ever set) 2987 r_sched.chunk = KMP_DEFAULT_CHUNK; 2988 } else { 2989 r_sched.chunk = __kmp_chunk; 2990 } 2991 2992 return r_sched; 2993 } 2994 2995 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2996 at least argc number of *t_argv entries for the requested team. */ 2997 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2998 2999 KMP_DEBUG_ASSERT(team); 3000 if (!realloc || argc > team->t.t_max_argc) { 3001 3002 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3003 "current entries=%d\n", 3004 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3005 /* if previously allocated heap space for args, free them */ 3006 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3007 __kmp_free((void *)team->t.t_argv); 3008 3009 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3010 /* use unused space in the cache line for arguments */ 3011 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3012 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3013 "argv entries\n", 3014 team->t.t_id, team->t.t_max_argc)); 3015 team->t.t_argv = &team->t.t_inline_argv[0]; 3016 if (__kmp_storage_map) { 3017 __kmp_print_storage_map_gtid( 3018 -1, &team->t.t_inline_argv[0], 3019 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3020 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3021 team->t.t_id); 3022 } 3023 } else { 3024 /* allocate space for arguments in the heap */ 3025 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3026 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3027 : 2 * argc; 3028 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3029 "argv entries\n", 3030 team->t.t_id, team->t.t_max_argc)); 3031 team->t.t_argv = 3032 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3033 if (__kmp_storage_map) { 3034 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3035 &team->t.t_argv[team->t.t_max_argc], 3036 sizeof(void *) * team->t.t_max_argc, 3037 "team_%d.t_argv", team->t.t_id); 3038 } 3039 } 3040 } 3041 } 3042 3043 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3044 int i; 3045 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3046 team->t.t_threads = 3047 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3048 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3049 sizeof(dispatch_shared_info_t) * num_disp_buff); 3050 team->t.t_dispatch = 3051 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3052 team->t.t_implicit_task_taskdata = 3053 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3054 team->t.t_max_nproc = max_nth; 3055 3056 /* setup dispatch buffers */ 3057 for (i = 0; i < num_disp_buff; ++i) { 3058 team->t.t_disp_buffer[i].buffer_index = i; 3059 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3060 } 3061 } 3062 3063 static void __kmp_free_team_arrays(kmp_team_t *team) { 3064 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3065 int i; 3066 for (i = 0; i < team->t.t_max_nproc; ++i) { 3067 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3068 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3069 team->t.t_dispatch[i].th_disp_buffer = NULL; 3070 } 3071 } 3072 #if KMP_USE_HIER_SCHED 3073 __kmp_dispatch_free_hierarchies(team); 3074 #endif 3075 __kmp_free(team->t.t_threads); 3076 __kmp_free(team->t.t_disp_buffer); 3077 __kmp_free(team->t.t_dispatch); 3078 __kmp_free(team->t.t_implicit_task_taskdata); 3079 team->t.t_threads = NULL; 3080 team->t.t_disp_buffer = NULL; 3081 team->t.t_dispatch = NULL; 3082 team->t.t_implicit_task_taskdata = 0; 3083 } 3084 3085 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3086 kmp_info_t **oldThreads = team->t.t_threads; 3087 3088 __kmp_free(team->t.t_disp_buffer); 3089 __kmp_free(team->t.t_dispatch); 3090 __kmp_free(team->t.t_implicit_task_taskdata); 3091 __kmp_allocate_team_arrays(team, max_nth); 3092 3093 KMP_MEMCPY(team->t.t_threads, oldThreads, 3094 team->t.t_nproc * sizeof(kmp_info_t *)); 3095 3096 __kmp_free(oldThreads); 3097 } 3098 3099 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3100 3101 kmp_r_sched_t r_sched = 3102 __kmp_get_schedule_global(); // get current state of scheduling globals 3103 3104 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3105 3106 kmp_internal_control_t g_icvs = { 3107 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3108 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3109 // adjustment of threads (per thread) 3110 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3111 // whether blocktime is explicitly set 3112 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3113 #if KMP_USE_MONITOR 3114 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3115 // intervals 3116 #endif 3117 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3118 // next parallel region (per thread) 3119 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3120 __kmp_cg_max_nth, // int thread_limit; 3121 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3122 // for max_active_levels 3123 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3124 // {sched,chunk} pair 3125 __kmp_nested_proc_bind.bind_types[0], 3126 __kmp_default_device, 3127 NULL // struct kmp_internal_control *next; 3128 }; 3129 3130 return g_icvs; 3131 } 3132 3133 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3134 3135 kmp_internal_control_t gx_icvs; 3136 gx_icvs.serial_nesting_level = 3137 0; // probably =team->t.t_serial 
like in save_inter_controls 3138 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3139 gx_icvs.next = NULL; 3140 3141 return gx_icvs; 3142 } 3143 3144 static void __kmp_initialize_root(kmp_root_t *root) { 3145 int f; 3146 kmp_team_t *root_team; 3147 kmp_team_t *hot_team; 3148 int hot_team_max_nth; 3149 kmp_r_sched_t r_sched = 3150 __kmp_get_schedule_global(); // get current state of scheduling globals 3151 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3152 KMP_DEBUG_ASSERT(root); 3153 KMP_ASSERT(!root->r.r_begin); 3154 3155 /* setup the root state structure */ 3156 __kmp_init_lock(&root->r.r_begin_lock); 3157 root->r.r_begin = FALSE; 3158 root->r.r_active = FALSE; 3159 root->r.r_in_parallel = 0; 3160 root->r.r_blocktime = __kmp_dflt_blocktime; 3161 3162 /* setup the root team for this task */ 3163 /* allocate the root team structure */ 3164 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3165 3166 root_team = 3167 __kmp_allocate_team(root, 3168 1, // new_nproc 3169 1, // max_nproc 3170 #if OMPT_SUPPORT 3171 ompt_data_none, // root parallel id 3172 #endif 3173 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3174 0 // argc 3175 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3176 ); 3177 #if USE_DEBUGGER 3178 // Non-NULL value should be assigned to make the debugger display the root 3179 // team. 3180 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3181 #endif 3182 3183 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3184 3185 root->r.r_root_team = root_team; 3186 root_team->t.t_control_stack_top = NULL; 3187 3188 /* initialize root team */ 3189 root_team->t.t_threads[0] = NULL; 3190 root_team->t.t_nproc = 1; 3191 root_team->t.t_serialized = 1; 3192 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3193 root_team->t.t_sched.sched = r_sched.sched; 3194 KA_TRACE( 3195 20, 3196 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3197 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3198 3199 /* setup the hot team for this task */ 3200 /* allocate the hot team structure */ 3201 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3202 3203 hot_team = 3204 __kmp_allocate_team(root, 3205 1, // new_nproc 3206 __kmp_dflt_team_nth_ub * 2, // max_nproc 3207 #if OMPT_SUPPORT 3208 ompt_data_none, // root parallel id 3209 #endif 3210 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3211 0 // argc 3212 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3213 ); 3214 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3215 3216 root->r.r_hot_team = hot_team; 3217 root_team->t.t_control_stack_top = NULL; 3218 3219 /* first-time initialization */ 3220 hot_team->t.t_parent = root_team; 3221 3222 /* initialize hot team */ 3223 hot_team_max_nth = hot_team->t.t_max_nproc; 3224 for (f = 0; f < hot_team_max_nth; ++f) { 3225 hot_team->t.t_threads[f] = NULL; 3226 } 3227 hot_team->t.t_nproc = 1; 3228 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3229 hot_team->t.t_sched.sched = r_sched.sched; 3230 hot_team->t.t_size_changed = 0; 3231 } 3232 3233 #ifdef KMP_DEBUG 3234 3235 typedef struct kmp_team_list_item { 3236 kmp_team_p const *entry; 3237 struct kmp_team_list_item *next; 3238 } kmp_team_list_item_t; 3239 typedef kmp_team_list_item_t *kmp_team_list_t; 3240 3241 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3242 kmp_team_list_t list, // List of teams. 
3243 kmp_team_p const *team // Team to add. 3244 ) { 3245 3246 // List must terminate with item where both entry and next are NULL. 3247 // Team is added to the list only once. 3248 // List is sorted in ascending order by team id. 3249 // Team id is *not* a key. 3250 3251 kmp_team_list_t l; 3252 3253 KMP_DEBUG_ASSERT(list != NULL); 3254 if (team == NULL) { 3255 return; 3256 } 3257 3258 __kmp_print_structure_team_accum(list, team->t.t_parent); 3259 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3260 3261 // Search list for the team. 3262 l = list; 3263 while (l->next != NULL && l->entry != team) { 3264 l = l->next; 3265 } 3266 if (l->next != NULL) { 3267 return; // Team has been added before, exit. 3268 } 3269 3270 // Team is not found. Search list again for insertion point. 3271 l = list; 3272 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3273 l = l->next; 3274 } 3275 3276 // Insert team. 3277 { 3278 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3279 sizeof(kmp_team_list_item_t)); 3280 *item = *l; 3281 l->entry = team; 3282 l->next = item; 3283 } 3284 } 3285 3286 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3287 3288 ) { 3289 __kmp_printf("%s", title); 3290 if (team != NULL) { 3291 __kmp_printf("%2x %p\n", team->t.t_id, team); 3292 } else { 3293 __kmp_printf(" - (nil)\n"); 3294 } 3295 } 3296 3297 static void __kmp_print_structure_thread(char const *title, 3298 kmp_info_p const *thread) { 3299 __kmp_printf("%s", title); 3300 if (thread != NULL) { 3301 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3302 } else { 3303 __kmp_printf(" - (nil)\n"); 3304 } 3305 } 3306 3307 void __kmp_print_structure(void) { 3308 3309 kmp_team_list_t list; 3310 3311 // Initialize list of teams. 3312 list = 3313 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3314 list->entry = NULL; 3315 list->next = NULL; 3316 3317 __kmp_printf("\n------------------------------\nGlobal Thread " 3318 "Table\n------------------------------\n"); 3319 { 3320 int gtid; 3321 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3322 __kmp_printf("%2d", gtid); 3323 if (__kmp_threads != NULL) { 3324 __kmp_printf(" %p", __kmp_threads[gtid]); 3325 } 3326 if (__kmp_root != NULL) { 3327 __kmp_printf(" %p", __kmp_root[gtid]); 3328 } 3329 __kmp_printf("\n"); 3330 } 3331 } 3332 3333 // Print out __kmp_threads array. 
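  // (Descriptive note: for every live gtid the dump below prints the thread's
  // root, team, serial team, master, pool link and proc-bind settings, and
  // feeds both th_team and th_serial_team into
  // __kmp_print_structure_team_accum so they show up later in the Teams
  // section.)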
3334 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3335 "----------\n"); 3336 if (__kmp_threads != NULL) { 3337 int gtid; 3338 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3339 kmp_info_t const *thread = __kmp_threads[gtid]; 3340 if (thread != NULL) { 3341 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3342 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3343 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3344 __kmp_print_structure_team(" Serial Team: ", 3345 thread->th.th_serial_team); 3346 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3347 __kmp_print_structure_thread(" Master: ", 3348 thread->th.th_team_master); 3349 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3350 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3351 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3352 __kmp_print_structure_thread(" Next in pool: ", 3353 thread->th.th_next_pool); 3354 __kmp_printf("\n"); 3355 __kmp_print_structure_team_accum(list, thread->th.th_team); 3356 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3357 } 3358 } 3359 } else { 3360 __kmp_printf("Threads array is not allocated.\n"); 3361 } 3362 3363 // Print out __kmp_root array. 3364 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3365 "--------\n"); 3366 if (__kmp_root != NULL) { 3367 int gtid; 3368 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3369 kmp_root_t const *root = __kmp_root[gtid]; 3370 if (root != NULL) { 3371 __kmp_printf("GTID %2d %p:\n", gtid, root); 3372 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3373 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3374 __kmp_print_structure_thread(" Uber Thread: ", 3375 root->r.r_uber_thread); 3376 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3377 __kmp_printf(" In Parallel: %2d\n", 3378 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3379 __kmp_printf("\n"); 3380 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3381 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3382 } 3383 } 3384 } else { 3385 __kmp_printf("Ubers array is not allocated.\n"); 3386 } 3387 3388 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3389 "--------\n"); 3390 while (list->next != NULL) { 3391 kmp_team_p const *team = list->entry; 3392 int i; 3393 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3394 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3395 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3396 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3397 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3398 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3399 for (i = 0; i < team->t.t_nproc; ++i) { 3400 __kmp_printf(" Thread %2d: ", i); 3401 __kmp_print_structure_thread("", team->t.t_threads[i]); 3402 } 3403 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3404 __kmp_printf("\n"); 3405 list = list->next; 3406 } 3407 3408 // Print out __kmp_thread_pool and __kmp_team_pool. 3409 __kmp_printf("\n------------------------------\nPools\n----------------------" 3410 "--------\n"); 3411 __kmp_print_structure_thread("Thread pool: ", 3412 CCAST(kmp_info_t *, __kmp_thread_pool)); 3413 __kmp_print_structure_team("Team pool: ", 3414 CCAST(kmp_team_t *, __kmp_team_pool)); 3415 __kmp_printf("\n"); 3416 3417 // Free team list. 
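  // (Every list item, including the NULL-terminated sentinel allocated at the
  // top of __kmp_print_structure and the entries added by
  // __kmp_print_structure_team_accum, came from KMP_INTERNAL_MALLOC, so the
  // loop below releases each one with KMP_INTERNAL_FREE.)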
3418 while (list != NULL) { 3419 kmp_team_list_item_t *item = list; 3420 list = list->next; 3421 KMP_INTERNAL_FREE(item); 3422 } 3423 } 3424 3425 #endif 3426 3427 //--------------------------------------------------------------------------- 3428 // Stuff for per-thread fast random number generator 3429 // Table of primes 3430 static const unsigned __kmp_primes[] = { 3431 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3432 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3433 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3434 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3435 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3436 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3437 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3438 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3439 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3440 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3441 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3442 3443 //--------------------------------------------------------------------------- 3444 // __kmp_get_random: Get a random number using a linear congruential method. 3445 unsigned short __kmp_get_random(kmp_info_t *thread) { 3446 unsigned x = thread->th.th_x; 3447 unsigned short r = x >> 16; 3448 3449 thread->th.th_x = x * thread->th.th_a + 1; 3450 3451 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3452 thread->th.th_info.ds.ds_tid, r)); 3453 3454 return r; 3455 } 3456 //-------------------------------------------------------- 3457 // __kmp_init_random: Initialize a random number generator 3458 void __kmp_init_random(kmp_info_t *thread) { 3459 unsigned seed = thread->th.th_info.ds.ds_tid; 3460 3461 thread->th.th_a = 3462 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3463 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3464 KA_TRACE(30, 3465 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3466 } 3467 3468 #if KMP_OS_WINDOWS 3469 /* reclaim array entries for root threads that are already dead, returns number 3470 * reclaimed */ 3471 static int __kmp_reclaim_dead_roots(void) { 3472 int i, r = 0; 3473 3474 for (i = 0; i < __kmp_threads_capacity; ++i) { 3475 if (KMP_UBER_GTID(i) && 3476 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3477 !__kmp_root[i] 3478 ->r.r_active) { // AC: reclaim only roots died in non-active state 3479 r += __kmp_unregister_root_other_thread(i); 3480 } 3481 } 3482 return r; 3483 } 3484 #endif 3485 3486 /* This function attempts to create free entries in __kmp_threads and 3487 __kmp_root, and returns the number of free entries generated. 3488 3489 For Windows* OS static library, the first mechanism used is to reclaim array 3490 entries for root threads that are already dead. 3491 3492 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3493 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3494 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3495 threadprivate cache array has been created. Synchronization with 3496 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
3497 3498 After any dead root reclamation, if the clipping value allows array expansion 3499 to result in the generation of a total of nNeed free slots, the function does 3500 that expansion. If not, nothing is done beyond the possible initial root 3501 thread reclamation. 3502 3503 If any argument is negative, the behavior is undefined. */ 3504 static int __kmp_expand_threads(int nNeed) { 3505 int added = 0; 3506 int minimumRequiredCapacity; 3507 int newCapacity; 3508 kmp_info_t **newThreads; 3509 kmp_root_t **newRoot; 3510 3511 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3512 // resizing __kmp_threads does not need additional protection if foreign 3513 // threads are present 3514 3515 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3516 /* only for Windows static library */ 3517 /* reclaim array entries for root threads that are already dead */ 3518 added = __kmp_reclaim_dead_roots(); 3519 3520 if (nNeed) { 3521 nNeed -= added; 3522 if (nNeed < 0) 3523 nNeed = 0; 3524 } 3525 #endif 3526 if (nNeed <= 0) 3527 return added; 3528 3529 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3530 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3531 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3532 // > __kmp_max_nth in one of two ways: 3533 // 3534 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3535 // may not be reused by another thread, so we may need to increase 3536 // __kmp_threads_capacity to __kmp_max_nth + 1. 3537 // 3538 // 2) New foreign root(s) are encountered. We always register new foreign 3539 // roots. This may cause a smaller # of threads to be allocated at 3540 // subsequent parallel regions, but the worker threads hang around (and 3541 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3542 // 3543 // Anyway, that is the reason for moving the check to see if 3544 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3545 // instead of having it performed here. -BB 3546 3547 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3548 3549 /* compute expansion headroom to check if we can expand */ 3550 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3551 /* possible expansion too small -- give up */ 3552 return added; 3553 } 3554 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3555 3556 newCapacity = __kmp_threads_capacity; 3557 do { 3558 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3559 : __kmp_sys_max_nth; 3560 } while (newCapacity < minimumRequiredCapacity); 3561 newThreads = (kmp_info_t **)__kmp_allocate( 3562 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3563 newRoot = 3564 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3565 KMP_MEMCPY(newThreads, __kmp_threads, 3566 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3567 KMP_MEMCPY(newRoot, __kmp_root, 3568 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3569 3570 kmp_info_t **temp_threads = __kmp_threads; 3571 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3572 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3573 __kmp_free(temp_threads); 3574 added += newCapacity - __kmp_threads_capacity; 3575 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3576 3577 if (newCapacity > __kmp_tp_capacity) { 3578 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3579 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3580 __kmp_threadprivate_resize_cache(newCapacity); 3581 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3582 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3583 } 3584 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3585 } 3586 3587 return added; 3588 } 3589 3590 /* Register the current thread as a root thread and obtain our gtid. We must 3591 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3592 thread that calls from __kmp_do_serial_initialize() */ 3593 int __kmp_register_root(int initial_thread) { 3594 kmp_info_t *root_thread; 3595 kmp_root_t *root; 3596 int gtid; 3597 int capacity; 3598 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3599 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3600 KMP_MB(); 3601 3602 /* 2007-03-02: 3603 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3604 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3605 work as expected -- it may return false (that means there is at least one 3606 empty slot in __kmp_threads array), but it is possible the only free slot 3607 is #0, which is reserved for initial thread and so cannot be used for this 3608 one. Following code workarounds this bug. 3609 3610 However, right solution seems to be not reserving slot #0 for initial 3611 thread because: 3612 (1) there is no magic in slot #0, 3613 (2) we cannot detect initial thread reliably (the first thread which does 3614 serial initialization may be not a real initial thread). 3615 */ 3616 capacity = __kmp_threads_capacity; 3617 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3618 --capacity; 3619 } 3620 3621 /* see if there are too many threads */ 3622 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3623 if (__kmp_tp_cached) { 3624 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3625 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3626 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3627 } else { 3628 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3629 __kmp_msg_null); 3630 } 3631 } 3632 3633 /* find an available thread slot */ 3634 /* Don't reassign the zero slot since we need that to only be used by initial 3635 thread */ 3636 for (gtid = (initial_thread ? 
0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; 3637 gtid++) 3638 ; 3639 KA_TRACE(1, 3640 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3641 KMP_ASSERT(gtid < __kmp_threads_capacity); 3642 3643 /* update global accounting */ 3644 __kmp_all_nth++; 3645 TCW_4(__kmp_nth, __kmp_nth + 1); 3646 3647 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3648 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3649 if (__kmp_adjust_gtid_mode) { 3650 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3651 if (TCR_4(__kmp_gtid_mode) != 2) { 3652 TCW_4(__kmp_gtid_mode, 2); 3653 } 3654 } else { 3655 if (TCR_4(__kmp_gtid_mode) != 1) { 3656 TCW_4(__kmp_gtid_mode, 1); 3657 } 3658 } 3659 } 3660 3661 #ifdef KMP_ADJUST_BLOCKTIME 3662 /* Adjust blocktime to zero if necessary */ 3663 /* Middle initialization might not have occurred yet */ 3664 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3665 if (__kmp_nth > __kmp_avail_proc) { 3666 __kmp_zero_bt = TRUE; 3667 } 3668 } 3669 #endif /* KMP_ADJUST_BLOCKTIME */ 3670 3671 /* setup this new hierarchy */ 3672 if (!(root = __kmp_root[gtid])) { 3673 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3674 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3675 } 3676 3677 #if KMP_STATS_ENABLED 3678 // Initialize stats as soon as possible (right after gtid assignment). 3679 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3680 __kmp_stats_thread_ptr->startLife(); 3681 KMP_SET_THREAD_STATE(SERIAL_REGION); 3682 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3683 #endif 3684 __kmp_initialize_root(root); 3685 3686 /* setup new root thread structure */ 3687 if (root->r.r_uber_thread) { 3688 root_thread = root->r.r_uber_thread; 3689 } else { 3690 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3691 if (__kmp_storage_map) { 3692 __kmp_print_thread_storage_map(root_thread, gtid); 3693 } 3694 root_thread->th.th_info.ds.ds_gtid = gtid; 3695 #if OMPT_SUPPORT 3696 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3697 #endif 3698 root_thread->th.th_root = root; 3699 if (__kmp_env_consistency_check) { 3700 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3701 } 3702 #if USE_FAST_MEMORY 3703 __kmp_initialize_fast_memory(root_thread); 3704 #endif /* USE_FAST_MEMORY */ 3705 3706 #if KMP_USE_BGET 3707 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3708 __kmp_initialize_bget(root_thread); 3709 #endif 3710 __kmp_init_random(root_thread); // Initialize random number generator 3711 } 3712 3713 /* setup the serial team held in reserve by the root thread */ 3714 if (!root_thread->th.th_serial_team) { 3715 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3716 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3717 root_thread->th.th_serial_team = __kmp_allocate_team( 3718 root, 1, 1, 3719 #if OMPT_SUPPORT 3720 ompt_data_none, // root parallel id 3721 #endif 3722 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3723 } 3724 KMP_ASSERT(root_thread->th.th_serial_team); 3725 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3726 root_thread->th.th_serial_team)); 3727 3728 /* drop root_thread into place */ 3729 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3730 3731 root->r.r_root_team->t.t_threads[0] = root_thread; 3732 root->r.r_hot_team->t.t_threads[0] = root_thread; 3733 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3734 // AC: the team created in reserve, not for execution (it is unused for now). 
  root_thread->th.th_serial_team->t.t_serialized = 0;
  root->r.r_uber_thread = root_thread;

  /* initialize the thread, get it ready to go */
  __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid);
  TCW_4(__kmp_init_gtid, TRUE);

  /* prepare the master thread for get_gtid() */
  __kmp_gtid_set_specific(gtid);

#if USE_ITT_BUILD
  __kmp_itt_thread_name(gtid);
#endif /* USE_ITT_BUILD */

#ifdef KMP_TDATA_GTID
  __kmp_gtid = gtid;
#endif
  __kmp_create_worker(gtid, root_thread, __kmp_stksize);
  KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid);

  KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, "
                "plain=%u\n",
                gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team),
                root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE,
                KMP_INIT_BARRIER_STATE));
  { // Initialize barrier data.
    int b;
    for (b = 0; b < bs_last_barrier; ++b) {
      root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE;
#if USE_DEBUGGER
      root_thread->th.th_bar[b].bb.b_worker_arrived = 0;
#endif
    }
  }
  KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived ==
                   KMP_INIT_BARRIER_STATE);

#if KMP_AFFINITY_SUPPORTED
  root_thread->th.th_current_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_new_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_first_place = KMP_PLACE_UNDEFINED;
  root_thread->th.th_last_place = KMP_PLACE_UNDEFINED;
  if (TCR_4(__kmp_init_middle)) {
    __kmp_affinity_set_init_mask(gtid, TRUE);
  }
#endif /* KMP_AFFINITY_SUPPORTED */
  root_thread->th.th_def_allocator = __kmp_def_allocator;
  root_thread->th.th_prev_level = 0;
  root_thread->th.th_prev_num_threads = 1;

  kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t));
  tmp->cg_root = root_thread;
  tmp->cg_thread_limit = __kmp_cg_max_nth;
  tmp->cg_nthreads = 1;
  KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with"
                 " cg_nthreads init to 1\n",
                 root_thread, tmp));
  tmp->up = NULL;
  root_thread->th.th_cg_roots = tmp;

  __kmp_root_counter++;

#if OMPT_SUPPORT
  if (!initial_thread && ompt_enabled.enabled) {

    kmp_info_t *root_thread = ompt_get_thread();

    ompt_set_thread_state(root_thread, ompt_state_overhead);

    if (ompt_enabled.ompt_callback_thread_begin) {
      ompt_callbacks.ompt_callback(ompt_callback_thread_begin)(
          ompt_thread_initial, __ompt_get_thread_data_internal());
    }
    ompt_data_t *task_data;
    ompt_data_t *parallel_data;
    __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                  NULL);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial);
    }

    ompt_set_thread_state(root_thread, ompt_state_work_serial);
  }
#endif

  KMP_MB();
  __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);

  return gtid;
}

#if KMP_NESTED_HOT_TEAMS
static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level,
                                const int max_level) {
  int i, n, nth;
  kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams;
  if (!hot_teams || !hot_teams[level].hot_team) {
    return 0;
  }
  KMP_DEBUG_ASSERT(level < max_level);
  kmp_team_t *team =
      hot_teams[level].hot_team;
  nth = hot_teams[level].hot_team_nth;
  n = nth - 1; // master is not freed
  if (level < max_level - 1) {
    for (i = 0; i < nth; ++i) {
      kmp_info_t *th = team->t.t_threads[i];
      n += __kmp_free_hot_teams(root, th, level + 1, max_level);
      if (i > 0 && th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
  __kmp_free_team(root, team, NULL);
  return n;
}
#endif

// Resets a root thread and clears its root and hot teams.
// Returns the number of __kmp_threads entries directly and indirectly freed.
static int __kmp_reset_root(int gtid, kmp_root_t *root) {
  kmp_team_t *root_team = root->r.r_root_team;
  kmp_team_t *hot_team = root->r.r_hot_team;
  int n = hot_team->t.t_nproc;
  int i;

  KMP_DEBUG_ASSERT(!root->r.r_active);

  root->r.r_root_team = NULL;
  root->r.r_hot_team = NULL;
  // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team
  // before call to __kmp_free_team().
  __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL));
#if KMP_NESTED_HOT_TEAMS
  if (__kmp_hot_teams_max_level >
      0) { // need to free nested hot teams and their threads if any
    for (i = 0; i < hot_team->t.t_nproc; ++i) {
      kmp_info_t *th = hot_team->t.t_threads[i];
      if (__kmp_hot_teams_max_level > 1) {
        n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level);
      }
      if (th->th.th_hot_teams) {
        __kmp_free(th->th.th_hot_teams);
        th->th.th_hot_teams = NULL;
      }
    }
  }
#endif
  __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL));

  // Before we can reap the thread, we need to make certain that all other
  // threads in the teams that had this root as ancestor have stopped trying to
  // steal tasks.
  if (__kmp_tasking_mode != tskm_immediate_exec) {
    __kmp_wait_to_unref_task_teams();
  }

#if KMP_OS_WINDOWS
  /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */
  KA_TRACE(
      10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC
           "\n",
           (LPVOID) & (root->r.r_uber_thread->th),
           root->r.r_uber_thread->th.th_info.ds.ds_thread));
  __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread);
#endif /* KMP_OS_WINDOWS */

#if OMPT_SUPPORT
  ompt_data_t *task_data;
  ompt_data_t *parallel_data;
  __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data,
                                NULL);
  if (ompt_enabled.ompt_callback_implicit_task) {
    ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
        ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial);
  }
  if (ompt_enabled.ompt_callback_thread_end) {
    ompt_callbacks.ompt_callback(ompt_callback_thread_end)(
        &(root->r.r_uber_thread->th.ompt_thread_info.thread_data));
  }
#endif

  TCW_4(__kmp_nth,
        __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
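  /* Illustrative sketch (descriptive comment, not executable runtime code):
     th_cg_roots acts as a reference-counted contention-group descriptor.
     The post-decrement below returns the old count, so the pattern is

         int old = node->cg_nthreads--;  // old == count before the decrement
         if (old == 1)                   // this root was the last member
           __kmp_free(node);             // drop the kmp_cg_root_t

     i.e. the node allocated in __kmp_register_root is released only when the
     uber thread is the final thread charged against its contention group. */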
3918 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3919 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3920 " to %d\n", 3921 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3922 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3923 if (i == 1) { 3924 // need to free contention group structure 3925 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3926 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3927 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3928 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3929 root->r.r_uber_thread->th.th_cg_roots = NULL; 3930 } 3931 __kmp_reap_thread(root->r.r_uber_thread, 1); 3932 3933 // We canot put root thread to __kmp_thread_pool, so we have to reap it 3934 // instead of freeing. 3935 root->r.r_uber_thread = NULL; 3936 /* mark root as no longer in use */ 3937 root->r.r_begin = FALSE; 3938 3939 return n; 3940 } 3941 3942 void __kmp_unregister_root_current_thread(int gtid) { 3943 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3944 /* this lock should be ok, since unregister_root_current_thread is never 3945 called during an abort, only during a normal close. furthermore, if you 3946 have the forkjoin lock, you should never try to get the initz lock */ 3947 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3948 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3949 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3950 "exiting T#%d\n", 3951 gtid)); 3952 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3953 return; 3954 } 3955 kmp_root_t *root = __kmp_root[gtid]; 3956 3957 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3958 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3959 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3960 KMP_ASSERT(root->r.r_active == FALSE); 3961 3962 KMP_MB(); 3963 3964 kmp_info_t *thread = __kmp_threads[gtid]; 3965 kmp_team_t *team = thread->th.th_team; 3966 kmp_task_team_t *task_team = thread->th.th_task_team; 3967 3968 // we need to wait for the proxy tasks before finishing the thread 3969 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3970 #if OMPT_SUPPORT 3971 // the runtime is shutting down so we won't report any events 3972 thread->th.ompt_thread_info.state = ompt_state_undefined; 3973 #endif 3974 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3975 } 3976 3977 __kmp_reset_root(gtid, root); 3978 3979 /* free up this thread slot */ 3980 __kmp_gtid_set_specific(KMP_GTID_DNE); 3981 #ifdef KMP_TDATA_GTID 3982 __kmp_gtid = KMP_GTID_DNE; 3983 #endif 3984 3985 KMP_MB(); 3986 KC_TRACE(10, 3987 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3988 3989 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3990 } 3991 3992 #if KMP_OS_WINDOWS 3993 /* __kmp_forkjoin_lock must be already held 3994 Unregisters a root thread that is not the current thread. Returns the number 3995 of __kmp_threads entries freed as a result. 
*/ 3996 static int __kmp_unregister_root_other_thread(int gtid) { 3997 kmp_root_t *root = __kmp_root[gtid]; 3998 int r; 3999 4000 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4001 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4002 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4003 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4004 KMP_ASSERT(root->r.r_active == FALSE); 4005 4006 r = __kmp_reset_root(gtid, root); 4007 KC_TRACE(10, 4008 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4009 return r; 4010 } 4011 #endif 4012 4013 #if KMP_DEBUG 4014 void __kmp_task_info() { 4015 4016 kmp_int32 gtid = __kmp_entry_gtid(); 4017 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4018 kmp_info_t *this_thr = __kmp_threads[gtid]; 4019 kmp_team_t *steam = this_thr->th.th_serial_team; 4020 kmp_team_t *team = this_thr->th.th_team; 4021 4022 __kmp_printf( 4023 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4024 "ptask=%p\n", 4025 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4026 team->t.t_implicit_task_taskdata[tid].td_parent); 4027 } 4028 #endif // KMP_DEBUG 4029 4030 /* TODO optimize with one big memclr, take out what isn't needed, split 4031 responsibility to workers as much as possible, and delay initialization of 4032 features as much as possible */ 4033 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4034 int tid, int gtid) { 4035 /* this_thr->th.th_info.ds.ds_gtid is setup in 4036 kmp_allocate_thread/create_worker. 4037 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4038 kmp_info_t *master = team->t.t_threads[0]; 4039 KMP_DEBUG_ASSERT(this_thr != NULL); 4040 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4041 KMP_DEBUG_ASSERT(team); 4042 KMP_DEBUG_ASSERT(team->t.t_threads); 4043 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4044 KMP_DEBUG_ASSERT(master); 4045 KMP_DEBUG_ASSERT(master->th.th_root); 4046 4047 KMP_MB(); 4048 4049 TCW_SYNC_PTR(this_thr->th.th_team, team); 4050 4051 this_thr->th.th_info.ds.ds_tid = tid; 4052 this_thr->th.th_set_nproc = 0; 4053 if (__kmp_tasking_mode != tskm_immediate_exec) 4054 // When tasking is possible, threads are not safe to reap until they are 4055 // done tasking; this will be set when tasking code is exited in wait 4056 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4057 else // no tasking --> always safe to reap 4058 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4059 this_thr->th.th_set_proc_bind = proc_bind_default; 4060 #if KMP_AFFINITY_SUPPORTED 4061 this_thr->th.th_new_place = this_thr->th.th_current_place; 4062 #endif 4063 this_thr->th.th_root = master->th.th_root; 4064 4065 /* setup the thread's cache of the team structure */ 4066 this_thr->th.th_team_nproc = team->t.t_nproc; 4067 this_thr->th.th_team_master = master; 4068 this_thr->th.th_team_serialized = team->t.t_serialized; 4069 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4070 4071 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4072 4073 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4074 tid, gtid, this_thr, this_thr->th.th_current_task)); 4075 4076 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4077 team, tid, TRUE); 4078 4079 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4080 tid, gtid, this_thr, this_thr->th.th_current_task)); 4081 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4082 // __kmp_initialize_team()? 
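  /* Note on the dispatch wiring that follows: th_dispatch is only an alias
     into the team-owned array (team->t.t_dispatch[tid]); the private buffers
     are allocated lazily in the "Initialize dynamic dispatch" block below,
     sized roughly as (sketch)

         n   = (team->t.t_max_nproc == 1) ? 1 : __kmp_dispatch_num_buffers;
         len = sizeof(dispatch_private_info_t) * n;

     so a serialized team keeps a single buffer while a real team gets the
     full ring of __kmp_dispatch_num_buffers entries. */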
4083 4084 /* TODO no worksharing in speculative threads */ 4085 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4086 4087 this_thr->th.th_local.this_construct = 0; 4088 4089 if (!this_thr->th.th_pri_common) { 4090 this_thr->th.th_pri_common = 4091 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4092 if (__kmp_storage_map) { 4093 __kmp_print_storage_map_gtid( 4094 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4095 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4096 } 4097 this_thr->th.th_pri_head = NULL; 4098 } 4099 4100 if (this_thr != master && // Master's CG root is initialized elsewhere 4101 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4102 // Make new thread's CG root same as master's 4103 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4104 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4105 if (tmp) { 4106 // worker changes CG, need to check if old CG should be freed 4107 int i = tmp->cg_nthreads--; 4108 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4109 " on node %p of thread %p to %d\n", 4110 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4111 if (i == 1) { 4112 __kmp_free(tmp); // last thread left CG --> free it 4113 } 4114 } 4115 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4116 // Increment new thread's CG root's counter to add the new thread 4117 this_thr->th.th_cg_roots->cg_nthreads++; 4118 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4119 " node %p of thread %p to %d\n", 4120 this_thr, this_thr->th.th_cg_roots, 4121 this_thr->th.th_cg_roots->cg_root, 4122 this_thr->th.th_cg_roots->cg_nthreads)); 4123 this_thr->th.th_current_task->td_icvs.thread_limit = 4124 this_thr->th.th_cg_roots->cg_thread_limit; 4125 } 4126 4127 /* Initialize dynamic dispatch */ 4128 { 4129 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4130 // Use team max_nproc since this will never change for the team. 4131 size_t disp_size = 4132 sizeof(dispatch_private_info_t) * 4133 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4134 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4135 team->t.t_max_nproc)); 4136 KMP_ASSERT(dispatch); 4137 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4138 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4139 4140 dispatch->th_disp_index = 0; 4141 dispatch->th_doacross_buf_idx = 0; 4142 if (!dispatch->th_disp_buffer) { 4143 dispatch->th_disp_buffer = 4144 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4145 4146 if (__kmp_storage_map) { 4147 __kmp_print_storage_map_gtid( 4148 gtid, &dispatch->th_disp_buffer[0], 4149 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4150 ? 
1 4151 : __kmp_dispatch_num_buffers], 4152 disp_size, "th_%d.th_dispatch.th_disp_buffer " 4153 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4154 gtid, team->t.t_id, gtid); 4155 } 4156 } else { 4157 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4158 } 4159 4160 dispatch->th_dispatch_pr_current = 0; 4161 dispatch->th_dispatch_sh_current = 0; 4162 4163 dispatch->th_deo_fcn = 0; /* ORDERED */ 4164 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4165 } 4166 4167 this_thr->th.th_next_pool = NULL; 4168 4169 if (!this_thr->th.th_task_state_memo_stack) { 4170 size_t i; 4171 this_thr->th.th_task_state_memo_stack = 4172 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4173 this_thr->th.th_task_state_top = 0; 4174 this_thr->th.th_task_state_stack_sz = 4; 4175 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4176 ++i) // zero init the stack 4177 this_thr->th.th_task_state_memo_stack[i] = 0; 4178 } 4179 4180 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4181 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4182 4183 KMP_MB(); 4184 } 4185 4186 /* allocate a new thread for the requesting team. this is only called from 4187 within a forkjoin critical section. we will first try to get an available 4188 thread from the thread pool. if none is available, we will fork a new one 4189 assuming we are able to create a new one. this should be assured, as the 4190 caller should check on this first. */ 4191 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4192 int new_tid) { 4193 kmp_team_t *serial_team; 4194 kmp_info_t *new_thr; 4195 int new_gtid; 4196 4197 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4198 KMP_DEBUG_ASSERT(root && team); 4199 #if !KMP_NESTED_HOT_TEAMS 4200 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4201 #endif 4202 KMP_MB(); 4203 4204 /* first, try to get one from the thread pool */ 4205 if (__kmp_thread_pool) { 4206 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4207 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4208 if (new_thr == __kmp_thread_pool_insert_pt) { 4209 __kmp_thread_pool_insert_pt = NULL; 4210 } 4211 TCW_4(new_thr->th.th_in_pool, FALSE); 4212 __kmp_suspend_initialize_thread(new_thr); 4213 __kmp_lock_suspend_mx(new_thr); 4214 if (new_thr->th.th_active_in_pool == TRUE) { 4215 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4216 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4217 new_thr->th.th_active_in_pool = FALSE; 4218 } 4219 __kmp_unlock_suspend_mx(new_thr); 4220 4221 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4222 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4223 KMP_ASSERT(!new_thr->th.th_team); 4224 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4225 4226 /* setup the thread structure */ 4227 __kmp_initialize_info(new_thr, team, new_tid, 4228 new_thr->th.th_info.ds.ds_gtid); 4229 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4230 4231 TCW_4(__kmp_nth, __kmp_nth + 1); 4232 4233 new_thr->th.th_task_state = 0; 4234 new_thr->th.th_task_state_top = 0; 4235 new_thr->th.th_task_state_stack_sz = 4; 4236 4237 #ifdef KMP_ADJUST_BLOCKTIME 4238 /* Adjust blocktime back to zero if necessary */ 4239 /* Middle initialization might not have occurred yet */ 4240 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4241 if (__kmp_nth > __kmp_avail_proc) { 4242 __kmp_zero_bt = TRUE; 4243 } 4244 } 4245 #endif /* KMP_ADJUST_BLOCKTIME */ 4246 4247 #if KMP_DEBUG 4248 // If thread entered pool via __kmp_free_thread, wait_flag should != 4249 // KMP_BARRIER_PARENT_FLAG. 
4250 int b; 4251 kmp_balign_t *balign = new_thr->th.th_bar; 4252 for (b = 0; b < bs_last_barrier; ++b) 4253 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4254 #endif 4255 4256 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4257 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4258 4259 KMP_MB(); 4260 return new_thr; 4261 } 4262 4263 /* no, well fork a new one */ 4264 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4265 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4266 4267 #if KMP_USE_MONITOR 4268 // If this is the first worker thread the RTL is creating, then also 4269 // launch the monitor thread. We try to do this as early as possible. 4270 if (!TCR_4(__kmp_init_monitor)) { 4271 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4272 if (!TCR_4(__kmp_init_monitor)) { 4273 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4274 TCW_4(__kmp_init_monitor, 1); 4275 __kmp_create_monitor(&__kmp_monitor); 4276 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4277 #if KMP_OS_WINDOWS 4278 // AC: wait until monitor has started. This is a fix for CQ232808. 4279 // The reason is that if the library is loaded/unloaded in a loop with 4280 // small (parallel) work in between, then there is high probability that 4281 // monitor thread started after the library shutdown. At shutdown it is 4282 // too late to cope with the problem, because when the master is in 4283 // DllMain (process detach) the monitor has no chances to start (it is 4284 // blocked), and master has no means to inform the monitor that the 4285 // library has gone, because all the memory which the monitor can access 4286 // is going to be released/reset. 4287 while (TCR_4(__kmp_init_monitor) < 2) { 4288 KMP_YIELD(TRUE); 4289 } 4290 KF_TRACE(10, ("after monitor thread has started\n")); 4291 #endif 4292 } 4293 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4294 } 4295 #endif 4296 4297 KMP_MB(); 4298 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) { 4299 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4300 } 4301 4302 /* allocate space for it. 
*/ 4303 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4304 4305 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4306 4307 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4308 // suppress race conditions detection on synchronization flags in debug mode 4309 // this helps to analyze library internals eliminating false positives 4310 __itt_suppress_mark_range( 4311 __itt_suppress_range, __itt_suppress_threading_errors, 4312 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4313 __itt_suppress_mark_range( 4314 __itt_suppress_range, __itt_suppress_threading_errors, 4315 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4316 #if KMP_OS_WINDOWS 4317 __itt_suppress_mark_range( 4318 __itt_suppress_range, __itt_suppress_threading_errors, 4319 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4320 #else 4321 __itt_suppress_mark_range(__itt_suppress_range, 4322 __itt_suppress_threading_errors, 4323 &new_thr->th.th_suspend_init_count, 4324 sizeof(new_thr->th.th_suspend_init_count)); 4325 #endif 4326 // TODO: check if we need to also suppress b_arrived flags 4327 __itt_suppress_mark_range(__itt_suppress_range, 4328 __itt_suppress_threading_errors, 4329 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4330 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4331 __itt_suppress_mark_range(__itt_suppress_range, 4332 __itt_suppress_threading_errors, 4333 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4334 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4335 __itt_suppress_mark_range(__itt_suppress_range, 4336 __itt_suppress_threading_errors, 4337 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4338 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4339 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4340 if (__kmp_storage_map) { 4341 __kmp_print_thread_storage_map(new_thr, new_gtid); 4342 } 4343 4344 // add the reserve serialized team, initialized from the team's master thread 4345 { 4346 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4347 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4348 new_thr->th.th_serial_team = serial_team = 4349 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4350 #if OMPT_SUPPORT 4351 ompt_data_none, // root parallel id 4352 #endif 4353 proc_bind_default, &r_icvs, 4354 0 USE_NESTED_HOT_ARG(NULL)); 4355 } 4356 KMP_ASSERT(serial_team); 4357 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4358 // execution (it is unused for now). 
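  /* Descriptive note: like the root path in __kmp_register_root, every worker
     gets a 1-thread serial team here, seeded from the parent team's ICVs via
     __kmp_get_x_global_icvs() above and kept in reserve (t_serialized was
     just reset to 0) rather than used for execution right away. */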
4359 serial_team->t.t_threads[0] = new_thr; 4360 KF_TRACE(10, 4361 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4362 new_thr)); 4363 4364 /* setup the thread structures */ 4365 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4366 4367 #if USE_FAST_MEMORY 4368 __kmp_initialize_fast_memory(new_thr); 4369 #endif /* USE_FAST_MEMORY */ 4370 4371 #if KMP_USE_BGET 4372 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4373 __kmp_initialize_bget(new_thr); 4374 #endif 4375 4376 __kmp_init_random(new_thr); // Initialize random number generator 4377 4378 /* Initialize these only once when thread is grabbed for a team allocation */ 4379 KA_TRACE(20, 4380 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4381 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4382 4383 int b; 4384 kmp_balign_t *balign = new_thr->th.th_bar; 4385 for (b = 0; b < bs_last_barrier; ++b) { 4386 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4387 balign[b].bb.team = NULL; 4388 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4389 balign[b].bb.use_oncore_barrier = 0; 4390 } 4391 4392 new_thr->th.th_spin_here = FALSE; 4393 new_thr->th.th_next_waiting = 0; 4394 #if KMP_OS_UNIX 4395 new_thr->th.th_blocking = false; 4396 #endif 4397 4398 #if KMP_AFFINITY_SUPPORTED 4399 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4400 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4401 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4402 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4403 #endif 4404 new_thr->th.th_def_allocator = __kmp_def_allocator; 4405 new_thr->th.th_prev_level = 0; 4406 new_thr->th.th_prev_num_threads = 1; 4407 4408 TCW_4(new_thr->th.th_in_pool, FALSE); 4409 new_thr->th.th_active_in_pool = FALSE; 4410 TCW_4(new_thr->th.th_active, TRUE); 4411 4412 /* adjust the global counters */ 4413 __kmp_all_nth++; 4414 __kmp_nth++; 4415 4416 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4417 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4418 if (__kmp_adjust_gtid_mode) { 4419 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4420 if (TCR_4(__kmp_gtid_mode) != 2) { 4421 TCW_4(__kmp_gtid_mode, 2); 4422 } 4423 } else { 4424 if (TCR_4(__kmp_gtid_mode) != 1) { 4425 TCW_4(__kmp_gtid_mode, 1); 4426 } 4427 } 4428 } 4429 4430 #ifdef KMP_ADJUST_BLOCKTIME 4431 /* Adjust blocktime back to zero if necessary */ 4432 /* Middle initialization might not have occurred yet */ 4433 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4434 if (__kmp_nth > __kmp_avail_proc) { 4435 __kmp_zero_bt = TRUE; 4436 } 4437 } 4438 #endif /* KMP_ADJUST_BLOCKTIME */ 4439 4440 /* actually fork it and create the new worker thread */ 4441 KF_TRACE( 4442 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4443 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4444 KF_TRACE(10, 4445 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4446 4447 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4448 new_gtid)); 4449 KMP_MB(); 4450 return new_thr; 4451 } 4452 4453 /* Reinitialize team for reuse. 4454 The hot team code calls this case at every fork barrier, so EPCC barrier 4455 test are extremely sensitive to changes in it, esp. writes to the team 4456 struct, which cause a cache invalidation in all threads. 4457 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ 4458 static void __kmp_reinitialize_team(kmp_team_t *team, 4459 kmp_internal_control_t *new_icvs, 4460 ident_t *loc) { 4461 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4462 team->t.t_threads[0], team)); 4463 KMP_DEBUG_ASSERT(team && new_icvs); 4464 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4465 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4466 4467 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4468 // Copy ICVs to the master thread's implicit taskdata 4469 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4470 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4471 4472 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4473 team->t.t_threads[0], team)); 4474 } 4475 4476 /* Initialize the team data structure. 4477 This assumes the t_threads and t_max_nproc are already set. 4478 Also, we don't touch the arguments */ 4479 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4480 kmp_internal_control_t *new_icvs, 4481 ident_t *loc) { 4482 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4483 4484 /* verify */ 4485 KMP_DEBUG_ASSERT(team); 4486 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4487 KMP_DEBUG_ASSERT(team->t.t_threads); 4488 KMP_MB(); 4489 4490 team->t.t_master_tid = 0; /* not needed */ 4491 /* team->t.t_master_bar; not needed */ 4492 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4493 team->t.t_nproc = new_nproc; 4494 4495 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4496 team->t.t_next_pool = NULL; 4497 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4498 * up hot team */ 4499 4500 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4501 team->t.t_invoke = NULL; /* not needed */ 4502 4503 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4504 team->t.t_sched.sched = new_icvs->sched.sched; 4505 4506 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4507 team->t.t_fp_control_saved = FALSE; /* not needed */ 4508 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4509 team->t.t_mxcsr = 0; /* not needed */ 4510 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4511 4512 team->t.t_construct = 0; 4513 4514 team->t.t_ordered.dt.t_value = 0; 4515 team->t.t_master_active = FALSE; 4516 4517 #ifdef KMP_DEBUG 4518 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4519 #endif 4520 #if KMP_OS_WINDOWS 4521 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4522 #endif 4523 4524 team->t.t_control_stack_top = NULL; 4525 4526 __kmp_reinitialize_team(team, new_icvs, loc); 4527 4528 KMP_MB(); 4529 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4530 } 4531 4532 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4533 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4534 static void 4535 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4536 if (KMP_AFFINITY_CAPABLE()) { 4537 int status; 4538 if (old_mask != NULL) { 4539 status = __kmp_get_system_affinity(old_mask, TRUE); 4540 int error = errno; 4541 if (status != 0) { 4542 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4543 __kmp_msg_null); 4544 } 4545 } 4546 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4547 } 4548 } 4549 #endif 4550 4551 #if KMP_AFFINITY_SUPPORTED 4552 4553 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4554 // It calculates the worker + master thread's partition based upon the parent 4555 // thread's partition, and binds each worker to a thread in their partition. 4556 // The master thread's partition should already include its current binding. 4557 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4558 // Copy the master thread's place partition to the team struct 4559 kmp_info_t *master_th = team->t.t_threads[0]; 4560 KMP_DEBUG_ASSERT(master_th != NULL); 4561 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4562 int first_place = master_th->th.th_first_place; 4563 int last_place = master_th->th.th_last_place; 4564 int masters_place = master_th->th.th_current_place; 4565 team->t.t_first_place = first_place; 4566 team->t.t_last_place = last_place; 4567 4568 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4569 "bound to place %d partition = [%d,%d]\n", 4570 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4571 team->t.t_id, masters_place, first_place, last_place)); 4572 4573 switch (proc_bind) { 4574 4575 case proc_bind_default: 4576 // serial teams might have the proc_bind policy set to proc_bind_default. It 4577 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4578 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4579 break; 4580 4581 case proc_bind_master: { 4582 int f; 4583 int n_th = team->t.t_nproc; 4584 for (f = 1; f < n_th; f++) { 4585 kmp_info_t *th = team->t.t_threads[f]; 4586 KMP_DEBUG_ASSERT(th != NULL); 4587 th->th.th_first_place = first_place; 4588 th->th.th_last_place = last_place; 4589 th->th.th_new_place = masters_place; 4590 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4591 team->t.t_display_affinity != 1) { 4592 team->t.t_display_affinity = 1; 4593 } 4594 4595 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4596 "partition = [%d,%d]\n", 4597 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4598 f, masters_place, first_place, last_place)); 4599 } 4600 } break; 4601 4602 case proc_bind_close: { 4603 int f; 4604 int n_th = team->t.t_nproc; 4605 int n_places; 4606 if (first_place <= last_place) { 4607 n_places = last_place - first_place + 1; 4608 } else { 4609 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4610 } 4611 if (n_th <= n_places) { 4612 int place = masters_place; 4613 for (f = 1; f < n_th; f++) { 4614 kmp_info_t *th = team->t.t_threads[f]; 4615 KMP_DEBUG_ASSERT(th != NULL); 4616 4617 if (place == last_place) { 4618 place = first_place; 4619 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4620 place = 0; 4621 } else { 4622 place++; 4623 } 4624 th->th.th_first_place = first_place; 4625 th->th.th_last_place = last_place; 4626 th->th.th_new_place = place; 4627 if (__kmp_display_affinity && place != th->th.th_current_place && 4628 team->t.t_display_affinity != 1) { 4629 team->t.t_display_affinity = 1; 4630 } 4631 4632 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4633 "partition = [%d,%d]\n", 4634 __kmp_gtid_from_thread(team->t.t_threads[f]), 4635 team->t.t_id, f, place, first_place, last_place)); 4636 } 4637 } else { 4638 int S, rem, gap, s_count; 4639 S = n_th / n_places; 4640 s_count = 0; 4641 rem = n_th - (S * n_places); 4642 gap = rem > 0 ? 
n_places / rem : n_places; 4643 int place = masters_place; 4644 int gap_ct = gap; 4645 for (f = 0; f < n_th; f++) { 4646 kmp_info_t *th = team->t.t_threads[f]; 4647 KMP_DEBUG_ASSERT(th != NULL); 4648 4649 th->th.th_first_place = first_place; 4650 th->th.th_last_place = last_place; 4651 th->th.th_new_place = place; 4652 if (__kmp_display_affinity && place != th->th.th_current_place && 4653 team->t.t_display_affinity != 1) { 4654 team->t.t_display_affinity = 1; 4655 } 4656 s_count++; 4657 4658 if ((s_count == S) && rem && (gap_ct == gap)) { 4659 // do nothing, add an extra thread to place on next iteration 4660 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4661 // we added an extra thread to this place; move to next place 4662 if (place == last_place) { 4663 place = first_place; 4664 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4665 place = 0; 4666 } else { 4667 place++; 4668 } 4669 s_count = 0; 4670 gap_ct = 1; 4671 rem--; 4672 } else if (s_count == S) { // place full; don't add extra 4673 if (place == last_place) { 4674 place = first_place; 4675 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4676 place = 0; 4677 } else { 4678 place++; 4679 } 4680 gap_ct++; 4681 s_count = 0; 4682 } 4683 4684 KA_TRACE(100, 4685 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4686 "partition = [%d,%d]\n", 4687 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4688 th->th.th_new_place, first_place, last_place)); 4689 } 4690 KMP_DEBUG_ASSERT(place == masters_place); 4691 } 4692 } break; 4693 4694 case proc_bind_spread: { 4695 int f; 4696 int n_th = team->t.t_nproc; 4697 int n_places; 4698 int thidx; 4699 if (first_place <= last_place) { 4700 n_places = last_place - first_place + 1; 4701 } else { 4702 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4703 } 4704 if (n_th <= n_places) { 4705 int place = -1; 4706 4707 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4708 int S = n_places / n_th; 4709 int s_count, rem, gap, gap_ct; 4710 4711 place = masters_place; 4712 rem = n_places - n_th * S; 4713 gap = rem ? 
n_th / rem : 1; 4714 gap_ct = gap; 4715 thidx = n_th; 4716 if (update_master_only == 1) 4717 thidx = 1; 4718 for (f = 0; f < thidx; f++) { 4719 kmp_info_t *th = team->t.t_threads[f]; 4720 KMP_DEBUG_ASSERT(th != NULL); 4721 4722 th->th.th_first_place = place; 4723 th->th.th_new_place = place; 4724 if (__kmp_display_affinity && place != th->th.th_current_place && 4725 team->t.t_display_affinity != 1) { 4726 team->t.t_display_affinity = 1; 4727 } 4728 s_count = 1; 4729 while (s_count < S) { 4730 if (place == last_place) { 4731 place = first_place; 4732 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4733 place = 0; 4734 } else { 4735 place++; 4736 } 4737 s_count++; 4738 } 4739 if (rem && (gap_ct == gap)) { 4740 if (place == last_place) { 4741 place = first_place; 4742 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4743 place = 0; 4744 } else { 4745 place++; 4746 } 4747 rem--; 4748 gap_ct = 0; 4749 } 4750 th->th.th_last_place = place; 4751 gap_ct++; 4752 4753 if (place == last_place) { 4754 place = first_place; 4755 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4756 place = 0; 4757 } else { 4758 place++; 4759 } 4760 4761 KA_TRACE(100, 4762 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4763 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4764 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4765 f, th->th.th_new_place, th->th.th_first_place, 4766 th->th.th_last_place, __kmp_affinity_num_masks)); 4767 } 4768 } else { 4769 /* Having uniform space of available computation places I can create 4770 T partitions of round(P/T) size and put threads into the first 4771 place of each partition. */ 4772 double current = static_cast<double>(masters_place); 4773 double spacing = 4774 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4775 int first, last; 4776 kmp_info_t *th; 4777 4778 thidx = n_th + 1; 4779 if (update_master_only == 1) 4780 thidx = 1; 4781 for (f = 0; f < thidx; f++) { 4782 first = static_cast<int>(current); 4783 last = static_cast<int>(current + spacing) - 1; 4784 KMP_DEBUG_ASSERT(last >= first); 4785 if (first >= n_places) { 4786 if (masters_place) { 4787 first -= n_places; 4788 last -= n_places; 4789 if (first == (masters_place + 1)) { 4790 KMP_DEBUG_ASSERT(f == n_th); 4791 first--; 4792 } 4793 if (last == masters_place) { 4794 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4795 last--; 4796 } 4797 } else { 4798 KMP_DEBUG_ASSERT(f == n_th); 4799 first = 0; 4800 last = 0; 4801 } 4802 } 4803 if (last >= n_places) { 4804 last = (n_places - 1); 4805 } 4806 place = first; 4807 current += spacing; 4808 if (f < n_th) { 4809 KMP_DEBUG_ASSERT(0 <= first); 4810 KMP_DEBUG_ASSERT(n_places > first); 4811 KMP_DEBUG_ASSERT(0 <= last); 4812 KMP_DEBUG_ASSERT(n_places > last); 4813 KMP_DEBUG_ASSERT(last_place >= first_place); 4814 th = team->t.t_threads[f]; 4815 KMP_DEBUG_ASSERT(th); 4816 th->th.th_first_place = first; 4817 th->th.th_new_place = place; 4818 th->th.th_last_place = last; 4819 if (__kmp_display_affinity && place != th->th.th_current_place && 4820 team->t.t_display_affinity != 1) { 4821 team->t.t_display_affinity = 1; 4822 } 4823 KA_TRACE(100, 4824 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4825 "partition = [%d,%d], spacing = %.4f\n", 4826 __kmp_gtid_from_thread(team->t.t_threads[f]), 4827 team->t.t_id, f, th->th.th_new_place, 4828 th->th.th_first_place, th->th.th_last_place, spacing)); 4829 } 4830 } 4831 } 4832 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4833 } else { 4834 int S, rem, gap, 
s_count; 4835 S = n_th / n_places; 4836 s_count = 0; 4837 rem = n_th - (S * n_places); 4838 gap = rem > 0 ? n_places / rem : n_places; 4839 int place = masters_place; 4840 int gap_ct = gap; 4841 thidx = n_th; 4842 if (update_master_only == 1) 4843 thidx = 1; 4844 for (f = 0; f < thidx; f++) { 4845 kmp_info_t *th = team->t.t_threads[f]; 4846 KMP_DEBUG_ASSERT(th != NULL); 4847 4848 th->th.th_first_place = place; 4849 th->th.th_last_place = place; 4850 th->th.th_new_place = place; 4851 if (__kmp_display_affinity && place != th->th.th_current_place && 4852 team->t.t_display_affinity != 1) { 4853 team->t.t_display_affinity = 1; 4854 } 4855 s_count++; 4856 4857 if ((s_count == S) && rem && (gap_ct == gap)) { 4858 // do nothing, add an extra thread to place on next iteration 4859 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4860 // we added an extra thread to this place; move on to next place 4861 if (place == last_place) { 4862 place = first_place; 4863 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4864 place = 0; 4865 } else { 4866 place++; 4867 } 4868 s_count = 0; 4869 gap_ct = 1; 4870 rem--; 4871 } else if (s_count == S) { // place is full; don't add extra thread 4872 if (place == last_place) { 4873 place = first_place; 4874 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4875 place = 0; 4876 } else { 4877 place++; 4878 } 4879 gap_ct++; 4880 s_count = 0; 4881 } 4882 4883 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4884 "partition = [%d,%d]\n", 4885 __kmp_gtid_from_thread(team->t.t_threads[f]), 4886 team->t.t_id, f, th->th.th_new_place, 4887 th->th.th_first_place, th->th.th_last_place)); 4888 } 4889 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4890 } 4891 } break; 4892 4893 default: 4894 break; 4895 } 4896 4897 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4898 } 4899 4900 #endif // KMP_AFFINITY_SUPPORTED 4901 4902 /* allocate a new team data structure to use. take one off of the free pool if 4903 available */ 4904 kmp_team_t * 4905 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4906 #if OMPT_SUPPORT 4907 ompt_data_t ompt_parallel_data, 4908 #endif 4909 kmp_proc_bind_t new_proc_bind, 4910 kmp_internal_control_t *new_icvs, 4911 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4912 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4913 int f; 4914 kmp_team_t *team; 4915 int use_hot_team = !root->r.r_active; 4916 int level = 0; 4917 4918 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4919 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4920 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4921 KMP_MB(); 4922 4923 #if KMP_NESTED_HOT_TEAMS 4924 kmp_hot_team_ptr_t *hot_teams; 4925 if (master) { 4926 team = master->th.th_team; 4927 level = team->t.t_active_level; 4928 if (master->th.th_teams_microtask) { // in teams construct? 
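// Illustrative note (added for clarity; the example program is hypothetical,
// not from this source): the level bump below separates the two forks that a
// teams construct produces. For a user program such as
//
//   #pragma omp teams num_teams(2)   // outer fork: one master thread per team
//   {
//     #pragma omp parallel           // inner fork: the workers of one team
//     { /* work */ }
//   }
//
// the outer fork of the league does not consume a nested hot-team level,
// while the inner fork (t_pkfn == __kmp_teams_master) or a parallel region
// nested inside the teams region does, and only when nteams > 1.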
4929 if (master->th.th_teams_size.nteams > 1 && 4930 ( // #teams > 1 4931 team->t.t_pkfn == 4932 (microtask_t)__kmp_teams_master || // inner fork of the teams 4933 master->th.th_teams_level < 4934 team->t.t_level)) { // or nested parallel inside the teams 4935 ++level; // not increment if #teams==1, or for outer fork of the teams; 4936 // increment otherwise 4937 } 4938 } 4939 hot_teams = master->th.th_hot_teams; 4940 if (level < __kmp_hot_teams_max_level && hot_teams && 4941 hot_teams[level].hot_team) { 4942 // hot team has already been allocated for given level 4943 use_hot_team = 1; 4944 } else { 4945 use_hot_team = 0; 4946 } 4947 } else { 4948 // check we won't access uninitialized hot_teams, just in case 4949 KMP_DEBUG_ASSERT(new_nproc == 1); 4950 } 4951 #endif 4952 // Optimization to use a "hot" team 4953 if (use_hot_team && new_nproc > 1) { 4954 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4955 #if KMP_NESTED_HOT_TEAMS 4956 team = hot_teams[level].hot_team; 4957 #else 4958 team = root->r.r_hot_team; 4959 #endif 4960 #if KMP_DEBUG 4961 if (__kmp_tasking_mode != tskm_immediate_exec) { 4962 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4963 "task_team[1] = %p before reinit\n", 4964 team->t.t_task_team[0], team->t.t_task_team[1])); 4965 } 4966 #endif 4967 4968 // Has the number of threads changed? 4969 /* Let's assume the most common case is that the number of threads is 4970 unchanged, and put that case first. */ 4971 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4972 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4973 // This case can mean that omp_set_num_threads() was called and the hot 4974 // team size was already reduced, so we check the special flag 4975 if (team->t.t_size_changed == -1) { 4976 team->t.t_size_changed = 1; 4977 } else { 4978 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4979 } 4980 4981 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4982 kmp_r_sched_t new_sched = new_icvs->sched; 4983 // set master's schedule as new run-time schedule 4984 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 4985 4986 __kmp_reinitialize_team(team, new_icvs, 4987 root->r.r_uber_thread->th.th_ident); 4988 4989 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 4990 team->t.t_threads[0], team)); 4991 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4992 4993 #if KMP_AFFINITY_SUPPORTED 4994 if ((team->t.t_size_changed == 0) && 4995 (team->t.t_proc_bind == new_proc_bind)) { 4996 if (new_proc_bind == proc_bind_spread) { 4997 __kmp_partition_places( 4998 team, 1); // add flag to update only master for spread 4999 } 5000 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5001 "proc_bind = %d, partition = [%d,%d]\n", 5002 team->t.t_id, new_proc_bind, team->t.t_first_place, 5003 team->t.t_last_place)); 5004 } else { 5005 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5006 __kmp_partition_places(team); 5007 } 5008 #else 5009 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5010 #endif /* KMP_AFFINITY_SUPPORTED */ 5011 } else if (team->t.t_nproc > new_nproc) { 5012 KA_TRACE(20, 5013 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5014 new_nproc)); 5015 5016 team->t.t_size_changed = 1; 5017 #if KMP_NESTED_HOT_TEAMS 5018 if (__kmp_hot_teams_mode == 0) { 5019 // AC: saved number of threads should correspond to team's value in this 5020 // mode, can be bigger in mode 1, when hot team has threads in reserve 5021 
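// Illustrative note (inferred from the surrounding logic, added for clarity):
// in mode 0 the surplus threads are actually freed below when the hot team
// shrinks, so hot_team_nth tracks t_nproc; in mode 1 they remain attached to
// the team in reserve, e.g. (hypothetical user sequence):
//
//   omp_set_num_threads(8); /* parallel */  // hot team grows to 8 threads
//   omp_set_num_threads(4); /* parallel */  // mode 1: 4 run, 4 held in reserve
//   omp_set_num_threads(8); /* parallel */  // mode 1: reserve reused, no alloc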
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5022 hot_teams[level].hot_team_nth = new_nproc; 5023 #endif // KMP_NESTED_HOT_TEAMS 5024 /* release the extra threads we don't need any more */ 5025 for (f = new_nproc; f < team->t.t_nproc; f++) { 5026 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5027 if (__kmp_tasking_mode != tskm_immediate_exec) { 5028 // When decreasing team size, threads no longer in the team should 5029 // unref task team. 5030 team->t.t_threads[f]->th.th_task_team = NULL; 5031 } 5032 __kmp_free_thread(team->t.t_threads[f]); 5033 team->t.t_threads[f] = NULL; 5034 } 5035 #if KMP_NESTED_HOT_TEAMS 5036 } // (__kmp_hot_teams_mode == 0) 5037 else { 5038 // When keeping extra threads in team, switch threads to wait on own 5039 // b_go flag 5040 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5041 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5042 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5043 for (int b = 0; b < bs_last_barrier; ++b) { 5044 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5045 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5046 } 5047 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5048 } 5049 } 5050 } 5051 #endif // KMP_NESTED_HOT_TEAMS 5052 team->t.t_nproc = new_nproc; 5053 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5054 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5055 __kmp_reinitialize_team(team, new_icvs, 5056 root->r.r_uber_thread->th.th_ident); 5057 5058 // Update remaining threads 5059 for (f = 0; f < new_nproc; ++f) { 5060 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5061 } 5062 5063 // restore the current task state of the master thread: should be the 5064 // implicit task 5065 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5066 team->t.t_threads[0], team)); 5067 5068 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5069 5070 #ifdef KMP_DEBUG 5071 for (f = 0; f < team->t.t_nproc; f++) { 5072 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5073 team->t.t_threads[f]->th.th_team_nproc == 5074 team->t.t_nproc); 5075 } 5076 #endif 5077 5078 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5079 #if KMP_AFFINITY_SUPPORTED 5080 __kmp_partition_places(team); 5081 #endif 5082 } else { // team->t.t_nproc < new_nproc 5083 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5084 kmp_affin_mask_t *old_mask; 5085 if (KMP_AFFINITY_CAPABLE()) { 5086 KMP_CPU_ALLOC(old_mask); 5087 } 5088 #endif 5089 5090 KA_TRACE(20, 5091 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5092 new_nproc)); 5093 5094 team->t.t_size_changed = 1; 5095 5096 #if KMP_NESTED_HOT_TEAMS 5097 int avail_threads = hot_teams[level].hot_team_nth; 5098 if (new_nproc < avail_threads) 5099 avail_threads = new_nproc; 5100 kmp_info_t **other_threads = team->t.t_threads; 5101 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5102 // Adjust barrier data of reserved threads (if any) of the team 5103 // Other data will be set in __kmp_initialize_info() below. 
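// Note (added for clarity, an interpretation of the code below): copying
// b_arrived fast-forwards each reserved thread's barrier state to the team's
// current barrier epoch; the reserved thread slept through the barriers the
// smaller team completed, so without this it would appear to be arriving at a
// long-finished barrier generation when it rejoins.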
5104 int b; 5105 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5106 for (b = 0; b < bs_last_barrier; ++b) { 5107 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5108 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5109 #if USE_DEBUGGER 5110 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5111 #endif 5112 } 5113 } 5114 if (hot_teams[level].hot_team_nth >= new_nproc) { 5115 // we have all needed threads in reserve, no need to allocate any 5116 // this only possible in mode 1, cannot have reserved threads in mode 0 5117 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5118 team->t.t_nproc = new_nproc; // just get reserved threads involved 5119 } else { 5120 // we may have some threads in reserve, but not enough 5121 team->t.t_nproc = 5122 hot_teams[level] 5123 .hot_team_nth; // get reserved threads involved if any 5124 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5125 #endif // KMP_NESTED_HOT_TEAMS 5126 if (team->t.t_max_nproc < new_nproc) { 5127 /* reallocate larger arrays */ 5128 __kmp_reallocate_team_arrays(team, new_nproc); 5129 __kmp_reinitialize_team(team, new_icvs, NULL); 5130 } 5131 5132 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5133 /* Temporarily set full mask for master thread before creation of 5134 workers. The reason is that workers inherit the affinity from master, 5135 so if a lot of workers are created on the single core quickly, they 5136 don't get a chance to set their own affinity for a long time. */ 5137 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5138 #endif 5139 5140 /* allocate new threads for the hot team */ 5141 for (f = team->t.t_nproc; f < new_nproc; f++) { 5142 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5143 KMP_DEBUG_ASSERT(new_worker); 5144 team->t.t_threads[f] = new_worker; 5145 5146 KA_TRACE(20, 5147 ("__kmp_allocate_team: team %d init T#%d arrived: " 5148 "join=%llu, plain=%llu\n", 5149 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5150 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5151 team->t.t_bar[bs_plain_barrier].b_arrived)); 5152 5153 { // Initialize barrier data for new threads. 5154 int b; 5155 kmp_balign_t *balign = new_worker->th.th_bar; 5156 for (b = 0; b < bs_last_barrier; ++b) { 5157 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5158 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5159 KMP_BARRIER_PARENT_FLAG); 5160 #if USE_DEBUGGER 5161 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5162 #endif 5163 } 5164 } 5165 } 5166 5167 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5168 if (KMP_AFFINITY_CAPABLE()) { 5169 /* Restore initial master thread's affinity mask */ 5170 __kmp_set_system_affinity(old_mask, TRUE); 5171 KMP_CPU_FREE(old_mask); 5172 } 5173 #endif 5174 #if KMP_NESTED_HOT_TEAMS 5175 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5176 #endif // KMP_NESTED_HOT_TEAMS 5177 /* make sure everyone is syncronized */ 5178 int old_nproc = team->t.t_nproc; // save old value and use to update only 5179 // new threads below 5180 __kmp_initialize_team(team, new_nproc, new_icvs, 5181 root->r.r_uber_thread->th.th_ident); 5182 5183 /* reinitialize the threads */ 5184 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5185 for (f = 0; f < team->t.t_nproc; ++f) 5186 __kmp_initialize_info(team->t.t_threads[f], team, f, 5187 __kmp_gtid_from_tid(f, team)); 5188 5189 if (level) { // set th_task_state for new threads in nested hot team 5190 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5191 // only need to set the th_task_state for the new threads. th_task_state 5192 // for master thread will not be accurate until after this in 5193 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5194 // correct value. 5195 for (f = old_nproc; f < team->t.t_nproc; ++f) 5196 team->t.t_threads[f]->th.th_task_state = 5197 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5198 } else { // set th_task_state for new threads in non-nested hot team 5199 int old_state = 5200 team->t.t_threads[0]->th.th_task_state; // copy master's state 5201 for (f = old_nproc; f < team->t.t_nproc; ++f) 5202 team->t.t_threads[f]->th.th_task_state = old_state; 5203 } 5204 5205 #ifdef KMP_DEBUG 5206 for (f = 0; f < team->t.t_nproc; ++f) { 5207 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5208 team->t.t_threads[f]->th.th_team_nproc == 5209 team->t.t_nproc); 5210 } 5211 #endif 5212 5213 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5214 #if KMP_AFFINITY_SUPPORTED 5215 __kmp_partition_places(team); 5216 #endif 5217 } // Check changes in number of threads 5218 5219 kmp_info_t *master = team->t.t_threads[0]; 5220 if (master->th.th_teams_microtask) { 5221 for (f = 1; f < new_nproc; ++f) { 5222 // propagate teams construct specific info to workers 5223 kmp_info_t *thr = team->t.t_threads[f]; 5224 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5225 thr->th.th_teams_level = master->th.th_teams_level; 5226 thr->th.th_teams_size = master->th.th_teams_size; 5227 } 5228 } 5229 #if KMP_NESTED_HOT_TEAMS 5230 if (level) { 5231 // Sync barrier state for nested hot teams, not needed for outermost hot 5232 // team. 5233 for (f = 1; f < new_nproc; ++f) { 5234 kmp_info_t *thr = team->t.t_threads[f]; 5235 int b; 5236 kmp_balign_t *balign = thr->th.th_bar; 5237 for (b = 0; b < bs_last_barrier; ++b) { 5238 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5239 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5240 #if USE_DEBUGGER 5241 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5242 #endif 5243 } 5244 } 5245 } 5246 #endif // KMP_NESTED_HOT_TEAMS 5247 5248 /* reallocate space for arguments if necessary */ 5249 __kmp_alloc_argv_entries(argc, team, TRUE); 5250 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5251 // The hot team re-uses the previous task team, 5252 // if untouched during the previous release->gather phase. 
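// For illustration only (hypothetical user code, not part of the runtime):
// the hot-team path above is what makes a sequence of identical parallel
// regions cheap. Assuming the thread count does not change between regions:
//
//   #include <omp.h>
//   int main() {
//     for (int i = 0; i < 100; ++i) {
//       #pragma omp parallel   // after the first iteration this reuses the
//       {                      // hot team: t_nproc == new_nproc, so only the
//         /* work */           // ICVs and run-time schedule are refreshed and
//       }                      // no threads are created or freed
//     }
//   }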
5253 5254 KF_TRACE(10, (" hot_team = %p\n", team)); 5255 5256 #if KMP_DEBUG 5257 if (__kmp_tasking_mode != tskm_immediate_exec) { 5258 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5259 "task_team[1] = %p after reinit\n", 5260 team->t.t_task_team[0], team->t.t_task_team[1])); 5261 } 5262 #endif 5263 5264 #if OMPT_SUPPORT 5265 __ompt_team_assign_id(team, ompt_parallel_data); 5266 #endif 5267 5268 KMP_MB(); 5269 5270 return team; 5271 } 5272 5273 /* next, let's try to take one from the team pool */ 5274 KMP_MB(); 5275 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5276 /* TODO: consider resizing undersized teams instead of reaping them, now 5277 that we have a resizing mechanism */ 5278 if (team->t.t_max_nproc >= max_nproc) { 5279 /* take this team from the team pool */ 5280 __kmp_team_pool = team->t.t_next_pool; 5281 5282 /* setup the team for fresh use */ 5283 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5284 5285 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5286 "task_team[1] %p to NULL\n", 5287 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5288 team->t.t_task_team[0] = NULL; 5289 team->t.t_task_team[1] = NULL; 5290 5291 /* reallocate space for arguments if necessary */ 5292 __kmp_alloc_argv_entries(argc, team, TRUE); 5293 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5294 5295 KA_TRACE( 5296 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5297 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5298 { // Initialize barrier data. 5299 int b; 5300 for (b = 0; b < bs_last_barrier; ++b) { 5301 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5302 #if USE_DEBUGGER 5303 team->t.t_bar[b].b_master_arrived = 0; 5304 team->t.t_bar[b].b_team_arrived = 0; 5305 #endif 5306 } 5307 } 5308 5309 team->t.t_proc_bind = new_proc_bind; 5310 5311 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5312 team->t.t_id)); 5313 5314 #if OMPT_SUPPORT 5315 __ompt_team_assign_id(team, ompt_parallel_data); 5316 #endif 5317 5318 KMP_MB(); 5319 5320 return team; 5321 } 5322 5323 /* reap team if it is too small, then loop back and check the next one */ 5324 // not sure if this is wise, but, will be redone during the hot-teams 5325 // rewrite. 5326 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5327 team = __kmp_reap_team(team); 5328 __kmp_team_pool = team; 5329 } 5330 5331 /* nothing available in the pool, no matter, make a new team! 
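   (Summary added for clarity, paraphrasing the logic above: the hot team is
   reused when the root has no active region -- or a nested hot team exists
   for this level -- and only needs resizing; failing that, the team pool is
   scanned for the first team whose t_max_nproc can hold max_nproc, reaping
   undersized teams along the way; only when both fail do we fall through to
   the fresh allocation below.)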
*/ 5332 KMP_MB(); 5333 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5334 5335 /* and set it up */ 5336 team->t.t_max_nproc = max_nproc; 5337 /* NOTE well, for some reason allocating one big buffer and dividing it up 5338 seems to really hurt performance a lot on the P4, so, let's not use this */ 5339 __kmp_allocate_team_arrays(team, max_nproc); 5340 5341 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5342 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5343 5344 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5345 "%p to NULL\n", 5346 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5347 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5348 // memory, no need to duplicate 5349 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5350 // memory, no need to duplicate 5351 5352 if (__kmp_storage_map) { 5353 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5354 } 5355 5356 /* allocate space for arguments */ 5357 __kmp_alloc_argv_entries(argc, team, FALSE); 5358 team->t.t_argc = argc; 5359 5360 KA_TRACE(20, 5361 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5362 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5363 { // Initialize barrier data. 5364 int b; 5365 for (b = 0; b < bs_last_barrier; ++b) { 5366 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5367 #if USE_DEBUGGER 5368 team->t.t_bar[b].b_master_arrived = 0; 5369 team->t.t_bar[b].b_team_arrived = 0; 5370 #endif 5371 } 5372 } 5373 5374 team->t.t_proc_bind = new_proc_bind; 5375 5376 #if OMPT_SUPPORT 5377 __ompt_team_assign_id(team, ompt_parallel_data); 5378 team->t.ompt_serialized_team_info = NULL; 5379 #endif 5380 5381 KMP_MB(); 5382 5383 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5384 team->t.t_id)); 5385 5386 return team; 5387 } 5388 5389 /* TODO implement hot-teams at all levels */ 5390 /* TODO implement lazy thread release on demand (disband request) */ 5391 5392 /* free the team. return it to the team pool. release all the threads 5393 * associated with it */ 5394 void __kmp_free_team(kmp_root_t *root, 5395 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5396 int f; 5397 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5398 team->t.t_id)); 5399 5400 /* verify state */ 5401 KMP_DEBUG_ASSERT(root); 5402 KMP_DEBUG_ASSERT(team); 5403 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5404 KMP_DEBUG_ASSERT(team->t.t_threads); 5405 5406 int use_hot_team = team == root->r.r_hot_team; 5407 #if KMP_NESTED_HOT_TEAMS 5408 int level; 5409 kmp_hot_team_ptr_t *hot_teams; 5410 if (master) { 5411 level = team->t.t_active_level - 1; 5412 if (master->th.th_teams_microtask) { // in teams construct? 
5413 if (master->th.th_teams_size.nteams > 1) { 5414 ++level; // level was not increased in teams construct for 5415 // team_of_masters 5416 } 5417 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5418 master->th.th_teams_level == team->t.t_level) { 5419 ++level; // level was not increased in teams construct for 5420 // team_of_workers before the parallel 5421 } // team->t.t_level will be increased inside parallel 5422 } 5423 hot_teams = master->th.th_hot_teams; 5424 if (level < __kmp_hot_teams_max_level) { 5425 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5426 use_hot_team = 1; 5427 } 5428 } 5429 #endif // KMP_NESTED_HOT_TEAMS 5430 5431 /* team is done working */ 5432 TCW_SYNC_PTR(team->t.t_pkfn, 5433 NULL); // Important for Debugging Support Library. 5434 #if KMP_OS_WINDOWS 5435 team->t.t_copyin_counter = 0; // init counter for possible reuse 5436 #endif 5437 // Do not reset pointer to parent team to NULL for hot teams. 5438 5439 /* if we are non-hot team, release our threads */ 5440 if (!use_hot_team) { 5441 if (__kmp_tasking_mode != tskm_immediate_exec) { 5442 // Wait for threads to reach reapable state 5443 for (f = 1; f < team->t.t_nproc; ++f) { 5444 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5445 kmp_info_t *th = team->t.t_threads[f]; 5446 volatile kmp_uint32 *state = &th->th.th_reap_state; 5447 while (*state != KMP_SAFE_TO_REAP) { 5448 #if KMP_OS_WINDOWS 5449 // On Windows a thread can be killed at any time, check this 5450 DWORD ecode; 5451 if (!__kmp_is_thread_alive(th, &ecode)) { 5452 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5453 break; 5454 } 5455 #endif 5456 // first check if thread is sleeping 5457 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5458 if (fl.is_sleeping()) 5459 fl.resume(__kmp_gtid_from_thread(th)); 5460 KMP_CPU_PAUSE(); 5461 } 5462 } 5463 5464 // Delete task teams 5465 int tt_idx; 5466 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5467 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5468 if (task_team != NULL) { 5469 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5470 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5471 team->t.t_threads[f]->th.th_task_team = NULL; 5472 } 5473 KA_TRACE( 5474 20, 5475 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5476 __kmp_get_gtid(), task_team, team->t.t_id)); 5477 #if KMP_NESTED_HOT_TEAMS 5478 __kmp_free_task_team(master, task_team); 5479 #endif 5480 team->t.t_task_team[tt_idx] = NULL; 5481 } 5482 } 5483 } 5484 5485 // Reset pointer to parent team only for non-hot teams. 
5486 team->t.t_parent = NULL; 5487 team->t.t_level = 0; 5488 team->t.t_active_level = 0; 5489 5490 /* free the worker threads */ 5491 for (f = 1; f < team->t.t_nproc; ++f) { 5492 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5493 __kmp_free_thread(team->t.t_threads[f]); 5494 team->t.t_threads[f] = NULL; 5495 } 5496 5497 /* put the team back in the team pool */ 5498 /* TODO limit size of team pool, call reap_team if pool too large */ 5499 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5500 __kmp_team_pool = (volatile kmp_team_t *)team; 5501 } else { // Check if team was created for the masters in a teams construct 5502 // See if first worker is a CG root 5503 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5504 team->t.t_threads[1]->th.th_cg_roots); 5505 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5506 // Clean up the CG root nodes on workers so that this team can be re-used 5507 for (f = 1; f < team->t.t_nproc; ++f) { 5508 kmp_info_t *thr = team->t.t_threads[f]; 5509 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5510 thr->th.th_cg_roots->cg_root == thr); 5511 // Pop current CG root off list 5512 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5513 thr->th.th_cg_roots = tmp->up; 5514 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5515 " up to node %p. cg_nthreads was %d\n", 5516 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5517 int i = tmp->cg_nthreads--; 5518 if (i == 1) { 5519 __kmp_free(tmp); // free CG if we are the last thread in it 5520 } 5521 // Restore current task's thread_limit from CG root 5522 if (thr->th.th_cg_roots) 5523 thr->th.th_current_task->td_icvs.thread_limit = 5524 thr->th.th_cg_roots->cg_thread_limit; 5525 } 5526 } 5527 } 5528 5529 KMP_MB(); 5530 } 5531 5532 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5533 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5534 kmp_team_t *next_pool = team->t.t_next_pool; 5535 5536 KMP_DEBUG_ASSERT(team); 5537 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5538 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5539 KMP_DEBUG_ASSERT(team->t.t_threads); 5540 KMP_DEBUG_ASSERT(team->t.t_argv); 5541 5542 /* TODO clean the threads that are a part of this? */ 5543 5544 /* free stuff */ 5545 __kmp_free_team_arrays(team); 5546 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5547 __kmp_free((void *)team->t.t_argv); 5548 __kmp_free(team); 5549 5550 KMP_MB(); 5551 return next_pool; 5552 } 5553 5554 // Free the thread. Don't reap it, just place it on the pool of available 5555 // threads. 5556 // 5557 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5558 // binding for the affinity mechanism to be useful. 5559 // 5560 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5561 // However, we want to avoid a potential performance problem by always 5562 // scanning through the list to find the correct point at which to insert 5563 // the thread (potential N**2 behavior). To do this we keep track of the 5564 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5565 // With single-level parallelism, threads will always be added to the tail 5566 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5567 // parallelism, all bets are off and we may need to scan through the entire 5568 // free list. 5569 // 5570 // This change also has a potentially large performance benefit, for some 5571 // applications. 
Previously, as threads were freed from the hot team, they 5572 // would be placed back on the free list in inverse order. If the hot team 5573 // grew back to its original size, then the freed thread would be placed 5574 // back on the hot team in reverse order. This could cause bad cache 5575 // locality problems on programs where the size of the hot team regularly 5576 // grew and shrank. 5577 // 5578 // Now, for single-level parallelism, the OMP tid is always == gtid. 5579 void __kmp_free_thread(kmp_info_t *this_th) { 5580 int gtid; 5581 kmp_info_t **scan; 5582 5583 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5584 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5585 5586 KMP_DEBUG_ASSERT(this_th); 5587 5588 // When moving a thread to the pool, switch it to wait on its own b_go flag 5589 // and on an uninitialized (NULL) team. 5590 int b; 5591 kmp_balign_t *balign = this_th->th.th_bar; 5592 for (b = 0; b < bs_last_barrier; ++b) { 5593 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5594 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5595 balign[b].bb.team = NULL; 5596 balign[b].bb.leaf_kids = 0; 5597 } 5598 this_th->th.th_task_state = 0; 5599 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5600 5601 /* put thread back on the free pool */ 5602 TCW_PTR(this_th->th.th_team, NULL); 5603 TCW_PTR(this_th->th.th_root, NULL); 5604 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5605 5606 while (this_th->th.th_cg_roots) { 5607 this_th->th.th_cg_roots->cg_nthreads--; 5608 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5609 " %p of thread %p to %d\n", 5610 this_th, this_th->th.th_cg_roots, 5611 this_th->th.th_cg_roots->cg_root, 5612 this_th->th.th_cg_roots->cg_nthreads)); 5613 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5614 if (tmp->cg_root == this_th) { // Thread is a cg_root 5615 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5616 KA_TRACE( 5617 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5618 this_th->th.th_cg_roots = tmp->up; 5619 __kmp_free(tmp); 5620 } else { // Worker thread 5621 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5622 __kmp_free(tmp); 5623 } 5624 this_th->th.th_cg_roots = NULL; 5625 break; 5626 } 5627 } 5628 5629 /* If the implicit task assigned to this thread can be used by other threads 5630 * -> multiple threads can share the data and try to free the task at 5631 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5632 * with higher probability when the hot team is disabled, but can occur even 5633 * when the hot team is enabled */ 5634 __kmp_free_implicit_task(this_th); 5635 this_th->th.th_current_task = NULL; 5636 5637 // If the __kmp_thread_pool_insert_pt is already past the new insert 5638 // point, then we need to re-scan the entire list. 5639 gtid = this_th->th.th_info.ds.ds_gtid; 5640 if (__kmp_thread_pool_insert_pt != NULL) { 5641 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5642 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5643 __kmp_thread_pool_insert_pt = NULL; 5644 } 5645 } 5646 5647 // Scan down the list to find the place to insert the thread. 5648 // scan is the address of a link in the list, possibly the address of 5649 // __kmp_thread_pool itself. 5650 // 5651 // In the absence of nested parallelism, the for loop will have 0 iterations.
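// Illustrative sketch (hypothetical, simplified types; `node`, `pool`,
// `insert_pt` and `pool_insert` are stand-ins for kmp_info_t,
// __kmp_thread_pool and __kmp_thread_pool_insert_pt) of the insertion pattern
// implemented below -- a singly linked list kept sorted by gtid with a cached
// insertion hint so the common single-level case stays O(1):
//
//   struct node { int gtid; node *next; };
//   static node *pool = nullptr;       // head of the sorted free list
//   static node *insert_pt = nullptr;  // last insertion point (the hint)
//
//   void pool_insert(node *n) {
//     if (insert_pt && insert_pt->gtid > n->gtid)
//       insert_pt = nullptr;           // hint is past us; rescan from the head
//     node **scan = insert_pt ? &insert_pt->next : &pool;
//     while (*scan && (*scan)->gtid < n->gtid)
//       scan = &(*scan)->next;         // 0 iterations for single-level usage
//     n->next = *scan;
//     insert_pt = *scan = n;           // link in and remember the spot
//   }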
5652 if (__kmp_thread_pool_insert_pt != NULL) { 5653 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5654 } else { 5655 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5656 } 5657 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5658 scan = &((*scan)->th.th_next_pool)) 5659 ; 5660 5661 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5662 // to its address. 5663 TCW_PTR(this_th->th.th_next_pool, *scan); 5664 __kmp_thread_pool_insert_pt = *scan = this_th; 5665 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5666 (this_th->th.th_info.ds.ds_gtid < 5667 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5668 TCW_4(this_th->th.th_in_pool, TRUE); 5669 __kmp_suspend_initialize_thread(this_th); 5670 __kmp_lock_suspend_mx(this_th); 5671 if (this_th->th.th_active == TRUE) { 5672 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5673 this_th->th.th_active_in_pool = TRUE; 5674 } 5675 #if KMP_DEBUG 5676 else { 5677 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5678 } 5679 #endif 5680 __kmp_unlock_suspend_mx(this_th); 5681 5682 TCW_4(__kmp_nth, __kmp_nth - 1); 5683 5684 #ifdef KMP_ADJUST_BLOCKTIME 5685 /* Adjust blocktime back to user setting or default if necessary */ 5686 /* Middle initialization might never have occurred */ 5687 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5688 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5689 if (__kmp_nth <= __kmp_avail_proc) { 5690 __kmp_zero_bt = FALSE; 5691 } 5692 } 5693 #endif /* KMP_ADJUST_BLOCKTIME */ 5694 5695 KMP_MB(); 5696 } 5697 5698 /* ------------------------------------------------------------------------ */ 5699 5700 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5701 int gtid = this_thr->th.th_info.ds.ds_gtid; 5702 /* void *stack_data;*/ 5703 kmp_team_t **volatile pteam; 5704 5705 KMP_MB(); 5706 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5707 5708 if (__kmp_env_consistency_check) { 5709 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 5710 } 5711 5712 #if OMPT_SUPPORT 5713 ompt_data_t *thread_data; 5714 if (ompt_enabled.enabled) { 5715 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5716 *thread_data = ompt_data_none; 5717 5718 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5719 this_thr->th.ompt_thread_info.wait_id = 0; 5720 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5721 this_thr->th.ompt_thread_info.parallel_flags = 0; 5722 if (ompt_enabled.ompt_callback_thread_begin) { 5723 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5724 ompt_thread_worker, thread_data); 5725 } 5726 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5727 } 5728 #endif 5729 5730 /* This is the place where threads wait for work */ 5731 while (!TCR_4(__kmp_global.g.g_done)) { 5732 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5733 KMP_MB(); 5734 5735 /* wait for work to do */ 5736 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5737 5738 /* No tid yet since not part of a team */ 5739 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5740 5741 #if OMPT_SUPPORT 5742 if (ompt_enabled.enabled) { 5743 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5744 } 5745 #endif 5746 5747 pteam = &this_thr->th.th_team; 5748 5749 /* have we been allocated? 
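   That is: the fork barrier released us and th_team now points at a team
   whose microtask (t_pkfn) was filled in by the master. In rough sketch form
   (illustrative pseudocode only, added for clarity), a worker's lifetime is:

       while (!g_done) {
         fork_barrier();          // sleep here until handed work
         if (team && team->pkfn) {
           team->invoke(gtid);    // run the outlined parallel-region body
           join_barrier();        // rendezvous with the master
         }
       }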
*/ 5750 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5751 /* we were just woken up, so run our new task */ 5752 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5753 int rc; 5754 KA_TRACE(20, 5755 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5756 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5757 (*pteam)->t.t_pkfn)); 5758 5759 updateHWFPControl(*pteam); 5760 5761 #if OMPT_SUPPORT 5762 if (ompt_enabled.enabled) { 5763 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5764 } 5765 #endif 5766 5767 rc = (*pteam)->t.t_invoke(gtid); 5768 KMP_ASSERT(rc); 5769 5770 KMP_MB(); 5771 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5772 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5773 (*pteam)->t.t_pkfn)); 5774 } 5775 #if OMPT_SUPPORT 5776 if (ompt_enabled.enabled) { 5777 /* no frame set while outside task */ 5778 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5779 5780 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5781 } 5782 #endif 5783 /* join barrier after parallel region */ 5784 __kmp_join_barrier(gtid); 5785 } 5786 } 5787 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5788 5789 #if OMPT_SUPPORT 5790 if (ompt_enabled.ompt_callback_thread_end) { 5791 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5792 } 5793 #endif 5794 5795 this_thr->th.th_task_team = NULL; 5796 /* run the destructors for the threadprivate data for this thread */ 5797 __kmp_common_destroy_gtid(gtid); 5798 5799 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5800 KMP_MB(); 5801 return this_thr; 5802 } 5803 5804 /* ------------------------------------------------------------------------ */ 5805 5806 void __kmp_internal_end_dest(void *specific_gtid) { 5807 #if KMP_COMPILER_ICC 5808 #pragma warning(push) 5809 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose 5810 // significant bits 5811 #endif 5812 // Make sure no significant bits are lost 5813 int gtid = (kmp_intptr_t)specific_gtid - 1; 5814 #if KMP_COMPILER_ICC 5815 #pragma warning(pop) 5816 #endif 5817 5818 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5819 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5820 * this is because 0 is reserved for the nothing-stored case */ 5821 5822 /* josh: One reason for setting the gtid specific data even when it is being 5823 destroyed by pthread is to allow gtid lookup through thread specific data 5824 (__kmp_gtid_get_specific). Some of the code, especially stat code, 5825 that gets executed in the call to __kmp_internal_end_thread, actually 5826 gets the gtid through the thread specific data. Setting it here seems 5827 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread 5828 to run smoothly. 
5829 todo: get rid of this after we remove the dependence on 5830 __kmp_gtid_get_specific */ 5831 if (gtid >= 0 && KMP_UBER_GTID(gtid)) 5832 __kmp_gtid_set_specific(gtid); 5833 #ifdef KMP_TDATA_GTID 5834 __kmp_gtid = gtid; 5835 #endif 5836 __kmp_internal_end_thread(gtid); 5837 } 5838 5839 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5840 5841 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5842 __kmp_internal_end_atexit(); 5843 } 5844 5845 #endif 5846 5847 /* [Windows] josh: when the atexit handler is called, there may still be more 5848 than one thread alive */ 5849 void __kmp_internal_end_atexit(void) { 5850 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5851 /* [Windows] 5852 josh: ideally, we want to completely shutdown the library in this atexit 5853 handler, but stat code that depends on thread specific data for gtid fails 5854 because that data becomes unavailable at some point during the shutdown, so 5855 we call __kmp_internal_end_thread instead. We should eventually remove the 5856 dependency on __kmp_get_specific_gtid in the stat code and use 5857 __kmp_internal_end_library to cleanly shutdown the library. 5858 5859 // TODO: Can some of this comment about GVS be removed? 5860 I suspect that the offending stat code is executed when the calling thread 5861 tries to clean up a dead root thread's data structures, resulting in GVS 5862 code trying to close the GVS structures for that thread, but since the stat 5863 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5864 the calling thread is cleaning up itself instead of another thread, it get 5865 confused. This happens because allowing a thread to unregister and cleanup 5866 another thread is a recent modification for addressing an issue. 5867 Based on the current design (20050722), a thread may end up 5868 trying to unregister another thread only if thread death does not trigger 5869 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5870 thread specific data destructor function to detect thread death. For 5871 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5872 is nothing. Thus, the workaround is applicable only for Windows static 5873 stat library. */ 5874 __kmp_internal_end_library(-1); 5875 #if KMP_OS_WINDOWS 5876 __kmp_close_console(); 5877 #endif 5878 } 5879 5880 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5881 // It is assumed __kmp_forkjoin_lock is acquired. 5882 5883 int gtid; 5884 5885 KMP_DEBUG_ASSERT(thread != NULL); 5886 5887 gtid = thread->th.th_info.ds.ds_gtid; 5888 5889 if (!is_root) { 5890 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5891 /* Assume the threads are at the fork barrier here */ 5892 KA_TRACE( 5893 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5894 gtid)); 5895 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5896 * (GEH) */ 5897 ANNOTATE_HAPPENS_BEFORE(thread); 5898 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); 5899 __kmp_release_64(&flag); 5900 } 5901 5902 // Terminate OS thread. 5903 __kmp_reap_worker(thread); 5904 5905 // The thread was killed asynchronously. If it was actively 5906 // spinning in the thread pool, decrement the global count. 
5907 // 5908 // There is a small timing hole here - if the worker thread was just waking 5909 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5910 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5911 // the global counter might not get updated. 5912 // 5913 // Currently, this can only happen as the library is unloaded, 5914 // so there are no harmful side effects. 5915 if (thread->th.th_active_in_pool) { 5916 thread->th.th_active_in_pool = FALSE; 5917 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5918 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5919 } 5920 } 5921 5922 __kmp_free_implicit_task(thread); 5923 5924 // Free the fast memory for tasking 5925 #if USE_FAST_MEMORY 5926 __kmp_free_fast_memory(thread); 5927 #endif /* USE_FAST_MEMORY */ 5928 5929 __kmp_suspend_uninitialize_thread(thread); 5930 5931 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5932 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5933 5934 --__kmp_all_nth; 5935 // __kmp_nth was decremented when thread is added to the pool. 5936 5937 #ifdef KMP_ADJUST_BLOCKTIME 5938 /* Adjust blocktime back to user setting or default if necessary */ 5939 /* Middle initialization might never have occurred */ 5940 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5941 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5942 if (__kmp_nth <= __kmp_avail_proc) { 5943 __kmp_zero_bt = FALSE; 5944 } 5945 } 5946 #endif /* KMP_ADJUST_BLOCKTIME */ 5947 5948 /* free the memory being used */ 5949 if (__kmp_env_consistency_check) { 5950 if (thread->th.th_cons) { 5951 __kmp_free_cons_stack(thread->th.th_cons); 5952 thread->th.th_cons = NULL; 5953 } 5954 } 5955 5956 if (thread->th.th_pri_common != NULL) { 5957 __kmp_free(thread->th.th_pri_common); 5958 thread->th.th_pri_common = NULL; 5959 } 5960 5961 if (thread->th.th_task_state_memo_stack != NULL) { 5962 __kmp_free(thread->th.th_task_state_memo_stack); 5963 thread->th.th_task_state_memo_stack = NULL; 5964 } 5965 5966 #if KMP_USE_BGET 5967 if (thread->th.th_local.bget_data != NULL) { 5968 __kmp_finalize_bget(thread); 5969 } 5970 #endif 5971 5972 #if KMP_AFFINITY_SUPPORTED 5973 if (thread->th.th_affin_mask != NULL) { 5974 KMP_CPU_FREE(thread->th.th_affin_mask); 5975 thread->th.th_affin_mask = NULL; 5976 } 5977 #endif /* KMP_AFFINITY_SUPPORTED */ 5978 5979 #if KMP_USE_HIER_SCHED 5980 if (thread->th.th_hier_bar_data != NULL) { 5981 __kmp_free(thread->th.th_hier_bar_data); 5982 thread->th.th_hier_bar_data = NULL; 5983 } 5984 #endif 5985 5986 __kmp_reap_team(thread->th.th_serial_team); 5987 thread->th.th_serial_team = NULL; 5988 __kmp_free(thread); 5989 5990 KMP_MB(); 5991 5992 } // __kmp_reap_thread 5993 5994 static void __kmp_internal_end(void) { 5995 int i; 5996 5997 /* First, unregister the library */ 5998 __kmp_unregister_library(); 5999 6000 #if KMP_OS_WINDOWS 6001 /* In Win static library, we can't tell when a root actually dies, so we 6002 reclaim the data structures for any root threads that have died but not 6003 unregistered themselves, in order to shut down cleanly. 6004 In Win dynamic library we also can't tell when a thread dies. */ 6005 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 6006 // dead roots 6007 #endif 6008 6009 for (i = 0; i < __kmp_threads_capacity; i++) 6010 if (__kmp_root[i]) 6011 if (__kmp_root[i]->r.r_active) 6012 break; 6013 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6014 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6015 6016 if (i < __kmp_threads_capacity) { 6017 #if KMP_USE_MONITOR 6018 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6019 KMP_MB(); /* Flush all pending memory write invalidates. */ 6020 6021 // Need to check that monitor was initialized before reaping it. If we are 6022 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6023 // __kmp_monitor will appear to contain valid data, but it is only valid in 6024 // the parent process, not the child. 6025 // New behavior (201008): instead of keying off of the flag 6026 // __kmp_init_parallel, the monitor thread creation is keyed off 6027 // of the new flag __kmp_init_monitor. 6028 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6029 if (TCR_4(__kmp_init_monitor)) { 6030 __kmp_reap_monitor(&__kmp_monitor); 6031 TCW_4(__kmp_init_monitor, 0); 6032 } 6033 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6034 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6035 #endif // KMP_USE_MONITOR 6036 } else { 6037 /* TODO move this to cleanup code */ 6038 #ifdef KMP_DEBUG 6039 /* make sure that everything has properly ended */ 6040 for (i = 0; i < __kmp_threads_capacity; i++) { 6041 if (__kmp_root[i]) { 6042 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6043 // there can be uber threads alive here 6044 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6045 } 6046 } 6047 #endif 6048 6049 KMP_MB(); 6050 6051 // Reap the worker threads. 6052 // This is valid for now, but be careful if threads are reaped sooner. 6053 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6054 // Get the next thread from the pool. 6055 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6056 __kmp_thread_pool = thread->th.th_next_pool; 6057 // Reap it. 6058 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6059 thread->th.th_next_pool = NULL; 6060 thread->th.th_in_pool = FALSE; 6061 __kmp_reap_thread(thread, 0); 6062 } 6063 __kmp_thread_pool_insert_pt = NULL; 6064 6065 // Reap teams. 6066 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6067 // Get the next team from the pool. 6068 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6069 __kmp_team_pool = team->t.t_next_pool; 6070 // Reap it. 6071 team->t.t_next_pool = NULL; 6072 __kmp_reap_team(team); 6073 } 6074 6075 __kmp_reap_task_teams(); 6076 6077 #if KMP_OS_UNIX 6078 // Threads that are not reaped should not access any resources since they 6079 // are going to be deallocated soon, so the shutdown sequence should wait 6080 // until all threads either exit the final spin-waiting loop or begin 6081 // sleeping after the given blocktime. 6082 for (i = 0; i < __kmp_threads_capacity; i++) { 6083 kmp_info_t *thr = __kmp_threads[i]; 6084 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6085 KMP_CPU_PAUSE(); 6086 } 6087 #endif 6088 6089 for (i = 0; i < __kmp_threads_capacity; ++i) { 6090 // TBD: Add some checking... 6091 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6092 } 6093 6094 /* Make sure all threadprivate destructors get run by joining with all 6095 worker threads before resetting this flag */ 6096 TCW_SYNC_4(__kmp_init_common, FALSE); 6097 6098 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6099 KMP_MB(); 6100 6101 #if KMP_USE_MONITOR 6102 // See note above: One of the possible fixes for CQ138434 / CQ140126 6103 // 6104 // FIXME: push both code fragments down and CSE them? 
6105 // push them into __kmp_cleanup() ? 6106 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6107 if (TCR_4(__kmp_init_monitor)) { 6108 __kmp_reap_monitor(&__kmp_monitor); 6109 TCW_4(__kmp_init_monitor, 0); 6110 } 6111 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6112 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6113 #endif 6114 } /* else !__kmp_global.t_active */ 6115 TCW_4(__kmp_init_gtid, FALSE); 6116 KMP_MB(); /* Flush all pending memory write invalidates. */ 6117 6118 __kmp_cleanup(); 6119 #if OMPT_SUPPORT 6120 ompt_fini(); 6121 #endif 6122 } 6123 6124 void __kmp_internal_end_library(int gtid_req) { 6125 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6126 /* this shouldn't be a race condition because __kmp_internal_end() is the 6127 only place to clear __kmp_serial_init */ 6128 /* we'll check this later too, after we get the lock */ 6129 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6130 // redundant, because the next check will work in any case. 6131 if (__kmp_global.g.g_abort) { 6132 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6133 /* TODO abort? */ 6134 return; 6135 } 6136 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6137 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6138 return; 6139 } 6140 6141 KMP_MB(); /* Flush all pending memory write invalidates. */ 6142 6143 /* find out who we are and what we should do */ 6144 { 6145 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6146 KA_TRACE( 6147 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6148 if (gtid == KMP_GTID_SHUTDOWN) { 6149 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6150 "already shutdown\n")); 6151 return; 6152 } else if (gtid == KMP_GTID_MONITOR) { 6153 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6154 "registered, or system shutdown\n")); 6155 return; 6156 } else if (gtid == KMP_GTID_DNE) { 6157 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6158 "shutdown\n")); 6159 /* we don't know who we are, but we may still shutdown the library */ 6160 } else if (KMP_UBER_GTID(gtid)) { 6161 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6162 if (__kmp_root[gtid]->r.r_active) { 6163 __kmp_global.g.g_abort = -1; 6164 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6165 KA_TRACE(10, 6166 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6167 gtid)); 6168 return; 6169 } else { 6170 KA_TRACE( 6171 10, 6172 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6173 __kmp_unregister_root_current_thread(gtid); 6174 } 6175 } else { 6176 /* worker threads may call this function through the atexit handler, if they 6177 * call exit() */ 6178 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6179 TODO: do a thorough shutdown instead */ 6180 #ifdef DUMP_DEBUG_ON_EXIT 6181 if (__kmp_debug_buf) 6182 __kmp_dump_debug_buffer(); 6183 #endif 6184 return; 6185 } 6186 } 6187 /* synchronize the termination process */ 6188 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6189 6190 /* have we already finished */ 6191 if (__kmp_global.g.g_abort) { 6192 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6193 /* TODO abort? 
*/ 6194 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6195 return; 6196 } 6197 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6198 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6199 return; 6200 } 6201 6202 /* We need this lock to enforce mutex between this reading of 6203 __kmp_threads_capacity and the writing by __kmp_register_root. 6204 Alternatively, we can use a counter of roots that is atomically updated by 6205 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6206 __kmp_internal_end_*. */ 6207 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6208 6209 /* now we can safely conduct the actual termination */ 6210 __kmp_internal_end(); 6211 6212 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6213 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6214 6215 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6216 6217 #ifdef DUMP_DEBUG_ON_EXIT 6218 if (__kmp_debug_buf) 6219 __kmp_dump_debug_buffer(); 6220 #endif 6221 6222 #if KMP_OS_WINDOWS 6223 __kmp_close_console(); 6224 #endif 6225 6226 __kmp_fini_allocator(); 6227 6228 } // __kmp_internal_end_library 6229 6230 void __kmp_internal_end_thread(int gtid_req) { 6231 int i; 6232 6233 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6234 /* this shouldn't be a race condition because __kmp_internal_end() is the 6235 * only place to clear __kmp_serial_init */ 6236 /* we'll check this later too, after we get the lock */ 6237 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6238 // redundant, because the next check will work in any case. 6239 if (__kmp_global.g.g_abort) { 6240 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6241 /* TODO abort? */ 6242 return; 6243 } 6244 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6245 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6246 return; 6247 } 6248 6249 KMP_MB(); /* Flush all pending memory write invalidates. */ 6250 6251 /* find out who we are and what we should do */ 6252 { 6253 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6254 KA_TRACE(10, 6255 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6256 if (gtid == KMP_GTID_SHUTDOWN) { 6257 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6258 "already shutdown\n")); 6259 return; 6260 } else if (gtid == KMP_GTID_MONITOR) { 6261 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6262 "registered, or system shutdown\n")); 6263 return; 6264 } else if (gtid == KMP_GTID_DNE) { 6265 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6266 "shutdown\n")); 6267 return; 6268 /* we don't know who we are */ 6269 } else if (KMP_UBER_GTID(gtid)) { 6270 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6271 if (__kmp_root[gtid]->r.r_active) { 6272 __kmp_global.g.g_abort = -1; 6273 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6274 KA_TRACE(10, 6275 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6276 gtid)); 6277 return; 6278 } else { 6279 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6280 gtid)); 6281 __kmp_unregister_root_current_thread(gtid); 6282 } 6283 } else { 6284 /* just a worker thread, let's leave */ 6285 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6286 6287 if (gtid >= 0) { 6288 __kmp_threads[gtid]->th.th_task_team = NULL; 6289 } 6290 6291 KA_TRACE(10, 6292 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6293 gtid)); 6294 return; 6295 } 6296 } 6297 #if KMP_DYNAMIC_LIB 6298 if (__kmp_pause_status != kmp_hard_paused) 6299 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6300 // because we will better shutdown later in the library destructor. 6301 { 6302 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6303 return; 6304 } 6305 #endif 6306 /* synchronize the termination process */ 6307 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6308 6309 /* have we already finished */ 6310 if (__kmp_global.g.g_abort) { 6311 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6312 /* TODO abort? */ 6313 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6314 return; 6315 } 6316 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6317 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6318 return; 6319 } 6320 6321 /* We need this lock to enforce mutex between this reading of 6322 __kmp_threads_capacity and the writing by __kmp_register_root. 6323 Alternatively, we can use a counter of roots that is atomically updated by 6324 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6325 __kmp_internal_end_*. */ 6326 6327 /* should we finish the run-time? are all siblings done? */ 6328 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6329 6330 for (i = 0; i < __kmp_threads_capacity; ++i) { 6331 if (KMP_UBER_GTID(i)) { 6332 KA_TRACE( 6333 10, 6334 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6335 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6336 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6337 return; 6338 } 6339 } 6340 6341 /* now we can safely conduct the actual termination */ 6342 6343 __kmp_internal_end(); 6344 6345 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6346 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6347 6348 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6349 6350 #ifdef DUMP_DEBUG_ON_EXIT 6351 if (__kmp_debug_buf) 6352 __kmp_dump_debug_buffer(); 6353 #endif 6354 } // __kmp_internal_end_thread 6355 6356 // ----------------------------------------------------------------------------- 6357 // Library registration stuff. 6358 6359 static long __kmp_registration_flag = 0; 6360 // Random value used to indicate library initialization. 6361 static char *__kmp_registration_str = NULL; 6362 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6363 6364 static inline char *__kmp_reg_status_name() { 6365 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6366 each thread. If registration and unregistration go in different threads 6367 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6368 env var can not be found, because the name will contain different pid. 
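   For reference (an illustrative example, not normative): in a process with
   pid 1234 the variable is named __KMP_REGISTERED_LIB_1234, and the value
   stored under it by __kmp_register_library_startup() below has the
   "%p-%lx-%s" form, i.e. "<address of __kmp_registration_flag>-<flag value
   in hex>-<library file name>".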
*/ 6369 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6370 } // __kmp_reg_status_name 6371 6372 void __kmp_register_library_startup(void) { 6373 6374 char *name = __kmp_reg_status_name(); // Name of the environment variable. 6375 int done = 0; 6376 union { 6377 double dtime; 6378 long ltime; 6379 } time; 6380 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6381 __kmp_initialize_system_tick(); 6382 #endif 6383 __kmp_read_system_time(&time.dtime); 6384 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6385 __kmp_registration_str = 6386 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6387 __kmp_registration_flag, KMP_LIBRARY_FILE); 6388 6389 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6390 __kmp_registration_str)); 6391 6392 while (!done) { 6393 6394 char *value = NULL; // Actual value of the environment variable. 6395 6396 // Set the environment variable, but do not overwrite it if it already exists. 6397 __kmp_env_set(name, __kmp_registration_str, 0); 6398 // Check that the variable was written. 6399 value = __kmp_env_get(name); 6400 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6401 6402 done = 1; // Ok, environment variable set successfully, exit the loop. 6403 6404 } else { 6405 6406 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6407 // Check whether it is alive or dead. 6408 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6409 char *tail = value; 6410 char *flag_addr_str = NULL; 6411 char *flag_val_str = NULL; 6412 char const *file_name = NULL; 6413 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6414 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6415 file_name = tail; 6416 if (tail != NULL) { 6417 long *flag_addr = 0; 6418 long flag_val = 0; 6419 KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr)); 6420 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6421 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6422 // First, check whether environment-encoded address is mapped into 6423 // addr space. 6424 // If so, dereference it to see if it still has the right value. 6425 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6426 neighbor = 1; 6427 } else { 6428 // If not, then we know the other copy of the library is no longer 6429 // running. 6430 neighbor = 2; 6431 } 6432 } 6433 } 6434 switch (neighbor) { 6435 case 0: // Cannot parse environment variable -- neighbor status unknown. 6436 // Assume it is the incompatible format of a future version of the 6437 // library. Assume the other library is alive. 6438 // WARN( ... ); // TODO: Issue a warning. 6439 file_name = "unknown library"; 6440 KMP_FALLTHROUGH(); 6441 // Attention! Falling through to the next case. That's intentional. 6442 case 1: { // Neighbor is alive. 6443 // Check whether it is allowed. 6444 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6445 if (!__kmp_str_match_true(duplicate_ok)) { 6446 // That's not allowed. Issue a fatal error. 6447 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6448 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6449 } 6450 KMP_INTERNAL_FREE(duplicate_ok); 6451 __kmp_duplicate_library_ok = 1; 6452 done = 1; // Exit the loop. 6453 } break; 6454 case 2: { // Neighbor is dead. 6455 // Clear the variable and try to register the library again.
6456 __kmp_env_unset(name); 6457 } break; 6458 default: { KMP_DEBUG_ASSERT(0); } break; 6459 } 6460 } 6461 KMP_INTERNAL_FREE((void *)value); 6462 } 6463 KMP_INTERNAL_FREE((void *)name); 6464 6465 } // func __kmp_register_library_startup 6466 6467 void __kmp_unregister_library(void) { 6468 6469 char *name = __kmp_reg_status_name(); 6470 char *value = __kmp_env_get(name); 6471 6472 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6473 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6474 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6475 // Ok, this is our variable. Delete it. 6476 __kmp_env_unset(name); 6477 } 6478 6479 KMP_INTERNAL_FREE(__kmp_registration_str); 6480 KMP_INTERNAL_FREE(value); 6481 KMP_INTERNAL_FREE(name); 6482 6483 __kmp_registration_flag = 0; 6484 __kmp_registration_str = NULL; 6485 6486 } // __kmp_unregister_library 6487 6488 // End of Library registration stuff. 6489 // ----------------------------------------------------------------------------- 6490 6491 #if KMP_MIC_SUPPORTED 6492 6493 static void __kmp_check_mic_type() { 6494 kmp_cpuid_t cpuid_state = {0}; 6495 kmp_cpuid_t *cs_p = &cpuid_state; 6496 __kmp_x86_cpuid(1, 0, cs_p); 6497 // We don't support mic1 at the moment 6498 if ((cs_p->eax & 0xff0) == 0xB10) { 6499 __kmp_mic_type = mic2; 6500 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6501 __kmp_mic_type = mic3; 6502 } else { 6503 __kmp_mic_type = non_mic; 6504 } 6505 } 6506 6507 #endif /* KMP_MIC_SUPPORTED */ 6508 6509 static void __kmp_do_serial_initialize(void) { 6510 int i, gtid; 6511 int size; 6512 6513 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6514 6515 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6516 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6517 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6518 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6519 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6520 6521 #if OMPT_SUPPORT 6522 ompt_pre_init(); 6523 #endif 6524 6525 __kmp_validate_locks(); 6526 6527 /* Initialize internal memory allocator */ 6528 __kmp_init_allocator(); 6529 6530 /* Register the library startup via an environment variable and check to see 6531 whether another copy of the library is already registered. 
*/ 6532 6533 __kmp_register_library_startup(); 6534 6535 /* TODO reinitialization of library */ 6536 if (TCR_4(__kmp_global.g.g_done)) { 6537 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6538 } 6539 6540 __kmp_global.g.g_abort = 0; 6541 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6542 6543 /* initialize the locks */ 6544 #if KMP_USE_ADAPTIVE_LOCKS 6545 #if KMP_DEBUG_ADAPTIVE_LOCKS 6546 __kmp_init_speculative_stats(); 6547 #endif 6548 #endif 6549 #if KMP_STATS_ENABLED 6550 __kmp_stats_init(); 6551 #endif 6552 __kmp_init_lock(&__kmp_global_lock); 6553 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6554 __kmp_init_lock(&__kmp_debug_lock); 6555 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6556 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6557 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6558 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6559 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6560 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6561 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6562 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6563 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6564 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6565 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6566 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6567 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6568 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6569 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6570 #if KMP_USE_MONITOR 6571 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6572 #endif 6573 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6574 6575 /* conduct initialization and initial setup of configuration */ 6576 6577 __kmp_runtime_initialize(); 6578 6579 #if KMP_MIC_SUPPORTED 6580 __kmp_check_mic_type(); 6581 #endif 6582 6583 // Some global variable initialization moved here from kmp_env_initialize() 6584 #ifdef KMP_DEBUG 6585 kmp_diag = 0; 6586 #endif 6587 __kmp_abort_delay = 0; 6588 6589 // From __kmp_init_dflt_team_nth() 6590 /* assume the entire machine will be used */ 6591 __kmp_dflt_team_nth_ub = __kmp_xproc; 6592 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6593 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6594 } 6595 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6596 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6597 } 6598 __kmp_max_nth = __kmp_sys_max_nth; 6599 __kmp_cg_max_nth = __kmp_sys_max_nth; 6600 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6601 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6602 __kmp_teams_max_nth = __kmp_sys_max_nth; 6603 } 6604 6605 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6606 // part 6607 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6608 #if KMP_USE_MONITOR 6609 __kmp_monitor_wakeups = 6610 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6611 __kmp_bt_intervals = 6612 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6613 #endif 6614 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6615 __kmp_library = library_throughput; 6616 // From KMP_SCHEDULE initialization 6617 __kmp_static = kmp_sch_static_balanced; 6618 // AC: do not use analytical here, because it is non-monotonous 6619 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6620 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6621 // need to repeat assignment 6622 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6623 // bit control and barrier method control parts 6624 #if KMP_FAST_REDUCTION_BARRIER 6625 #define kmp_reduction_barrier_gather_bb ((int)1) 6626 #define kmp_reduction_barrier_release_bb ((int)1) 6627 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6628 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6629 #endif // KMP_FAST_REDUCTION_BARRIER 6630 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6631 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6632 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6633 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6634 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6635 #if KMP_FAST_REDUCTION_BARRIER 6636 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6637 // lin_64 ): hyper,1 6638 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6639 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6640 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6641 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6642 } 6643 #endif // KMP_FAST_REDUCTION_BARRIER 6644 } 6645 #if KMP_FAST_REDUCTION_BARRIER 6646 #undef kmp_reduction_barrier_release_pat 6647 #undef kmp_reduction_barrier_gather_pat 6648 #undef kmp_reduction_barrier_release_bb 6649 #undef kmp_reduction_barrier_gather_bb 6650 #endif // KMP_FAST_REDUCTION_BARRIER 6651 #if KMP_MIC_SUPPORTED 6652 if (__kmp_mic_type == mic2) { // KNC 6653 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6654 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6655 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6656 1; // forkjoin release 6657 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6658 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6659 } 6660 #if KMP_FAST_REDUCTION_BARRIER 6661 if (__kmp_mic_type == mic2) { // KNC 6662 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6663 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6664 } 6665 #endif // KMP_FAST_REDUCTION_BARRIER 6666 #endif // KMP_MIC_SUPPORTED 6667 6668 // From KMP_CHECKS initialization 6669 #ifdef KMP_DEBUG 6670 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6671 #else 6672 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6673 #endif 6674 6675 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6676 __kmp_foreign_tp = TRUE; 6677 6678 __kmp_global.g.g_dynamic = FALSE; 6679 __kmp_global.g.g_dynamic_mode = dynamic_default; 6680 6681 __kmp_env_initialize(NULL); 6682 6683 // Print all messages in message catalog for testing purposes. 
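// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the runtime: the loop above installs the
// default gather/release branch bits and barrier patterns per barrier type
// (with a hyper,1 override for the reduction barrier and extra tuning for KNC).
// For the tree/hyper style barriers a branch-bits value b is assumed here to be
// consumed as a power-of-two fan-out of (1 << b); that mapping lives in the
// barrier code, not in this file.  Hypothetical helper:
#if 0
#include <cstdio>

static void toy_barrier_shape(int branch_bits, int nthreads) {
  int fanout = 1 << branch_bits; // e.g. bits=2 -> each parent gathers 4 children
  int rounds = 0;
  for (int span = 1; span < nthreads; span *= fanout)
    ++rounds;                    // roughly ceil(log_fanout(nthreads)) steps
  std::printf("bits=%d fanout=%d threads=%d rounds=%d\n", branch_bits, fanout,
              nthreads, rounds);
}

int main() {
  toy_barrier_shape(1, 240); // the KNC tuning above uses 1 for forkjoin release
  toy_barrier_shape(3, 240); // and 3 for the plain-barrier gather
  return 0;
}
#endif
// ---------------------------------------------------------------------------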
6684 #ifdef KMP_DEBUG 6685 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6686 if (__kmp_str_match_true(val)) { 6687 kmp_str_buf_t buffer; 6688 __kmp_str_buf_init(&buffer); 6689 __kmp_i18n_dump_catalog(&buffer); 6690 __kmp_printf("%s", buffer.str); 6691 __kmp_str_buf_free(&buffer); 6692 } 6693 __kmp_env_free(&val); 6694 #endif 6695 6696 __kmp_threads_capacity = 6697 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6698 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6699 __kmp_tp_capacity = __kmp_default_tp_capacity( 6700 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6701 6702 // If the library is shut down properly, both pools must be NULL. Just in 6703 // case, set them to NULL -- some memory may leak, but subsequent code will 6704 // work even if pools are not freed. 6705 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6706 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6707 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6708 __kmp_thread_pool = NULL; 6709 __kmp_thread_pool_insert_pt = NULL; 6710 __kmp_team_pool = NULL; 6711 6712 /* Allocate all of the variable sized records */ 6713 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6714 * expandable */ 6715 /* Since allocation is cache-aligned, just add extra padding at the end */ 6716 size = 6717 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6718 CACHE_LINE; 6719 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6720 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6721 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6722 6723 /* init thread counts */ 6724 KMP_DEBUG_ASSERT(__kmp_all_nth == 6725 0); // Asserts fail if the library is reinitializing and 6726 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6727 __kmp_all_nth = 0; 6728 __kmp_nth = 0; 6729 6730 /* setup the uber master thread and hierarchy */ 6731 gtid = __kmp_register_root(TRUE); 6732 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6733 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6734 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6735 6736 KMP_MB(); /* Flush all pending memory write invalidates. */ 6737 6738 __kmp_common_initialize(); 6739 6740 #if KMP_OS_UNIX 6741 /* invoke the child fork handler */ 6742 __kmp_register_atfork(); 6743 #endif 6744 6745 #if !KMP_DYNAMIC_LIB 6746 { 6747 /* Invoke the exit handler when the program finishes, only for static 6748 library. For dynamic library, we already have _fini and DllMain. */ 6749 int rc = atexit(__kmp_internal_end_atexit); 6750 if (rc != 0) { 6751 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6752 __kmp_msg_null); 6753 } 6754 } 6755 #endif 6756 6757 #if KMP_HANDLE_SIGNALS 6758 #if KMP_OS_UNIX 6759 /* NOTE: make sure that this is called before the user installs their own 6760 signal handlers so that the user handlers are called first. this way they 6761 can return false, not call our handler, avoid terminating the library, and 6762 continue execution where they left off. 
*/ 6763 __kmp_install_signals(FALSE); 6764 #endif /* KMP_OS_UNIX */ 6765 #if KMP_OS_WINDOWS 6766 __kmp_install_signals(TRUE); 6767 #endif /* KMP_OS_WINDOWS */ 6768 #endif 6769 6770 /* we have finished the serial initialization */ 6771 __kmp_init_counter++; 6772 6773 __kmp_init_serial = TRUE; 6774 6775 if (__kmp_settings) { 6776 __kmp_env_print(); 6777 } 6778 6779 if (__kmp_display_env || __kmp_display_env_verbose) { 6780 __kmp_env_print_2(); 6781 } 6782 6783 #if OMPT_SUPPORT 6784 ompt_post_init(); 6785 #endif 6786 6787 KMP_MB(); 6788 6789 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6790 } 6791 6792 void __kmp_serial_initialize(void) { 6793 if (__kmp_init_serial) { 6794 return; 6795 } 6796 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6797 if (__kmp_init_serial) { 6798 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6799 return; 6800 } 6801 __kmp_do_serial_initialize(); 6802 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6803 } 6804 6805 static void __kmp_do_middle_initialize(void) { 6806 int i, j; 6807 int prev_dflt_team_nth; 6808 6809 if (!__kmp_init_serial) { 6810 __kmp_do_serial_initialize(); 6811 } 6812 6813 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6814 6815 // Save the previous value for the __kmp_dflt_team_nth so that 6816 // we can avoid some reinitialization if it hasn't changed. 6817 prev_dflt_team_nth = __kmp_dflt_team_nth; 6818 6819 #if KMP_AFFINITY_SUPPORTED 6820 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6821 // number of cores on the machine. 6822 __kmp_affinity_initialize(); 6823 6824 // Run through the __kmp_threads array and set the affinity mask 6825 // for each root thread that is currently registered with the RTL. 6826 for (i = 0; i < __kmp_threads_capacity; i++) { 6827 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6828 __kmp_affinity_set_init_mask(i, TRUE); 6829 } 6830 } 6831 #endif /* KMP_AFFINITY_SUPPORTED */ 6832 6833 KMP_ASSERT(__kmp_xproc > 0); 6834 if (__kmp_avail_proc == 0) { 6835 __kmp_avail_proc = __kmp_xproc; 6836 } 6837 6838 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6839 // correct them now 6840 j = 0; 6841 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6842 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6843 __kmp_avail_proc; 6844 j++; 6845 } 6846 6847 if (__kmp_dflt_team_nth == 0) { 6848 #ifdef KMP_DFLT_NTH_CORES 6849 // Default #threads = #cores 6850 __kmp_dflt_team_nth = __kmp_ncores; 6851 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6852 "__kmp_ncores (%d)\n", 6853 __kmp_dflt_team_nth)); 6854 #else 6855 // Default #threads = #available OS procs 6856 __kmp_dflt_team_nth = __kmp_avail_proc; 6857 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6858 "__kmp_avail_proc(%d)\n", 6859 __kmp_dflt_team_nth)); 6860 #endif /* KMP_DFLT_NTH_CORES */ 6861 } 6862 6863 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 6864 __kmp_dflt_team_nth = KMP_MIN_NTH; 6865 } 6866 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 6867 __kmp_dflt_team_nth = __kmp_sys_max_nth; 6868 } 6869 6870 // There's no harm in continuing if the following check fails, 6871 // but it indicates an error in the previous logic. 
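// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the runtime: the fix-up loop earlier in this
// function treats a zero entry in the nested num_threads list as an empty slot
// (OMP_NUM_THREADS=,,2,3 parses to {0,0,2,3}) and fills the leading empty slots
// with the number of available processors; the real loop also updates
// __kmp_dflt_team_nth and its upper bound at the same time.  Toy version with
// hypothetical names:
#if 0
#include <cstdio>
#include <vector>

static void toy_fill_nested_nth(std::vector<int> &nth, int avail_proc) {
  for (size_t j = 0; j < nth.size() && nth[j] == 0; ++j)
    nth[j] = avail_proc; // only the leading empty slots are filled
}

int main() {
  std::vector<int> nth = {0, 0, 2, 3}; // parsed "OMP_NUM_THREADS=,,2,3"
  toy_fill_nested_nth(nth, 8);
  for (int v : nth)
    std::printf("%d ", v);             // prints: 8 8 2 3
  std::printf("\n");
  return 0;
}
#endif
// ---------------------------------------------------------------------------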
6872 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 6873 6874 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 6875 // Run through the __kmp_threads array and set the num threads icv for each 6876 // root thread that is currently registered with the RTL (which has not 6877 // already explicitly set its nthreads-var with a call to 6878 // omp_set_num_threads()). 6879 for (i = 0; i < __kmp_threads_capacity; i++) { 6880 kmp_info_t *thread = __kmp_threads[i]; 6881 if (thread == NULL) 6882 continue; 6883 if (thread->th.th_current_task->td_icvs.nproc != 0) 6884 continue; 6885 6886 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 6887 } 6888 } 6889 KA_TRACE( 6890 20, 6891 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 6892 __kmp_dflt_team_nth)); 6893 6894 #ifdef KMP_ADJUST_BLOCKTIME 6895 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 6896 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6897 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6898 if (__kmp_nth > __kmp_avail_proc) { 6899 __kmp_zero_bt = TRUE; 6900 } 6901 } 6902 #endif /* KMP_ADJUST_BLOCKTIME */ 6903 6904 /* we have finished middle initialization */ 6905 TCW_SYNC_4(__kmp_init_middle, TRUE); 6906 6907 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 6908 } 6909 6910 void __kmp_middle_initialize(void) { 6911 if (__kmp_init_middle) { 6912 return; 6913 } 6914 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6915 if (__kmp_init_middle) { 6916 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6917 return; 6918 } 6919 __kmp_do_middle_initialize(); 6920 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6921 } 6922 6923 void __kmp_parallel_initialize(void) { 6924 int gtid = __kmp_entry_gtid(); // this might be a new root 6925 6926 /* synchronize parallel initialization (for sibling) */ 6927 if (TCR_4(__kmp_init_parallel)) 6928 return; 6929 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6930 if (TCR_4(__kmp_init_parallel)) { 6931 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6932 return; 6933 } 6934 6935 /* TODO reinitialization after we have already shut down */ 6936 if (TCR_4(__kmp_global.g.g_done)) { 6937 KA_TRACE( 6938 10, 6939 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 6940 __kmp_infinite_loop(); 6941 } 6942 6943 /* jc: The lock __kmp_initz_lock is already held, so calling 6944 __kmp_serial_initialize would cause a deadlock. So we call 6945 __kmp_do_serial_initialize directly. */ 6946 if (!__kmp_init_middle) { 6947 __kmp_do_middle_initialize(); 6948 } 6949 __kmp_resume_if_hard_paused(); 6950 6951 /* begin initialization */ 6952 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 6953 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6954 6955 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6956 // Save the FP control regs. 6957 // Worker threads will set theirs to these values at thread startup. 
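// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the runtime: __kmp_serial_initialize,
// __kmp_middle_initialize and __kmp_parallel_initialize above all follow the
// same shape -- an unsynchronized fast-path check, then __kmp_initz_lock, then
// a second check under the lock so only one caller performs the real work.
// Portable toy version using std::atomic/std::mutex (the runtime itself uses
// its TCR_4/TCW_SYNC_4 macros and a bootstrap lock instead):
#if 0
#include <atomic>
#include <mutex>

static std::atomic<bool> toy_init_done{false}; // stands in for __kmp_init_*
static std::mutex toy_initz_lock;              // stands in for __kmp_initz_lock

static void toy_do_initialize() { /* expensive one-time setup */ }

static void toy_initialize() {
  if (toy_init_done.load(std::memory_order_acquire))
    return;                                    // fast path, no lock taken
  std::lock_guard<std::mutex> guard(toy_initz_lock);
  if (toy_init_done.load(std::memory_order_relaxed))
    return;                                    // somebody else won the race
  toy_do_initialize();
  toy_init_done.store(true, std::memory_order_release);
}
#endif
// ---------------------------------------------------------------------------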
6958 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 6959 __kmp_store_mxcsr(&__kmp_init_mxcsr); 6960 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 6961 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 6962 6963 #if KMP_OS_UNIX 6964 #if KMP_HANDLE_SIGNALS 6965 /* must be after __kmp_serial_initialize */ 6966 __kmp_install_signals(TRUE); 6967 #endif 6968 #endif 6969 6970 __kmp_suspend_initialize(); 6971 6972 #if defined(USE_LOAD_BALANCE) 6973 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6974 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 6975 } 6976 #else 6977 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6978 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 6979 } 6980 #endif 6981 6982 if (__kmp_version) { 6983 __kmp_print_version_2(); 6984 } 6985 6986 /* we have finished parallel initialization */ 6987 TCW_SYNC_4(__kmp_init_parallel, TRUE); 6988 6989 KMP_MB(); 6990 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 6991 6992 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6993 } 6994 6995 /* ------------------------------------------------------------------------ */ 6996 6997 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6998 kmp_team_t *team) { 6999 kmp_disp_t *dispatch; 7000 7001 KMP_MB(); 7002 7003 /* none of the threads have encountered any constructs, yet. */ 7004 this_thr->th.th_local.this_construct = 0; 7005 #if KMP_CACHE_MANAGE 7006 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 7007 #endif /* KMP_CACHE_MANAGE */ 7008 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 7009 KMP_DEBUG_ASSERT(dispatch); 7010 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7011 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7012 // this_thr->th.th_info.ds.ds_tid ] ); 7013 7014 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7015 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7016 if (__kmp_env_consistency_check) 7017 __kmp_push_parallel(gtid, team->t.t_ident); 7018 7019 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7020 } 7021 7022 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7023 kmp_team_t *team) { 7024 if (__kmp_env_consistency_check) 7025 __kmp_pop_parallel(gtid, team->t.t_ident); 7026 7027 __kmp_finish_implicit_task(this_thr); 7028 } 7029 7030 int __kmp_invoke_task_func(int gtid) { 7031 int rc; 7032 int tid = __kmp_tid_from_gtid(gtid); 7033 kmp_info_t *this_thr = __kmp_threads[gtid]; 7034 kmp_team_t *team = this_thr->th.th_team; 7035 7036 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7037 #if USE_ITT_BUILD 7038 if (__itt_stack_caller_create_ptr) { 7039 __kmp_itt_stack_callee_enter( 7040 (__itt_caller) 7041 team->t.t_stack_id); // inform ittnotify about entering user's code 7042 } 7043 #endif /* USE_ITT_BUILD */ 7044 #if INCLUDE_SSC_MARKS 7045 SSC_MARK_INVOKING(); 7046 #endif 7047 7048 #if OMPT_SUPPORT 7049 void *dummy; 7050 void **exit_frame_p; 7051 ompt_data_t *my_task_data; 7052 ompt_data_t *my_parallel_data; 7053 int ompt_team_size; 7054 7055 if (ompt_enabled.enabled) { 7056 exit_frame_p = &( 7057 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr); 7058 } else { 7059 exit_frame_p = &dummy; 7060 } 7061 7062 my_task_data = 7063 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7064 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7065 if (ompt_enabled.ompt_callback_implicit_task) { 7066 ompt_team_size = team->t.t_nproc; 7067 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7068 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7069 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7070 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7071 } 7072 #endif 7073 7074 #if KMP_STATS_ENABLED 7075 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7076 if (previous_state == stats_state_e::TEAMS_REGION) { 7077 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7078 } else { 7079 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7080 } 7081 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7082 #endif 7083 7084 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7085 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7086 #if OMPT_SUPPORT 7087 , 7088 exit_frame_p 7089 #endif 7090 ); 7091 #if OMPT_SUPPORT 7092 *exit_frame_p = NULL; 7093 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7094 #endif 7095 7096 #if KMP_STATS_ENABLED 7097 if (previous_state == stats_state_e::TEAMS_REGION) { 7098 KMP_SET_THREAD_STATE(previous_state); 7099 } 7100 KMP_POP_PARTITIONED_TIMER(); 7101 #endif 7102 7103 #if USE_ITT_BUILD 7104 if (__itt_stack_caller_create_ptr) { 7105 __kmp_itt_stack_callee_leave( 7106 (__itt_caller) 7107 team->t.t_stack_id); // inform ittnotify about leaving user's code 7108 } 7109 #endif /* USE_ITT_BUILD */ 7110 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7111 7112 return rc; 7113 } 7114 7115 void __kmp_teams_master(int gtid) { 7116 // This routine is called by all master threads in teams construct 7117 kmp_info_t *thr = __kmp_threads[gtid]; 7118 kmp_team_t *team = thr->th.th_team; 7119 ident_t *loc = team->t.t_ident; 7120 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7121 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7122 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7123 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7124 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7125 7126 // This thread is a new CG root. Set up the proper variables. 
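// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the runtime: the lines just below push a new
// contention-group root onto the thread's singly linked th_cg_roots list (new
// thread limit, counter of one active thread, `up` pointing at the previous
// root).  Toy model with hypothetical names; the runtime allocates the node
// with __kmp_allocate and frees it once its cg_nthreads count drops to zero:
#if 0
struct toy_cg_root {
  int thread_limit; // plays the role of cg_thread_limit
  int nthreads;     // plays the role of cg_nthreads
  toy_cg_root *up;  // previously active CG root for this thread
};

static toy_cg_root *toy_push_cg_root(toy_cg_root *current, int thread_limit) {
  return new toy_cg_root{thread_limit, /*nthreads=*/1, current};
}

static toy_cg_root *toy_pop_cg_root(toy_cg_root *current) {
  toy_cg_root *up = current->up;
  delete current;
  return up;
}
#endif
// ---------------------------------------------------------------------------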
7127 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7128 tmp->cg_root = thr; // Make thr the CG root 7129 // Init to thread limit that was stored when league masters were forked 7130 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7131 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7132 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7133 " cg_nthreads to 1\n", 7134 thr, tmp)); 7135 tmp->up = thr->th.th_cg_roots; 7136 thr->th.th_cg_roots = tmp; 7137 7138 // Launch league of teams now, but not let workers execute 7139 // (they hang on fork barrier until next parallel) 7140 #if INCLUDE_SSC_MARKS 7141 SSC_MARK_FORKING(); 7142 #endif 7143 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7144 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7145 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7146 #if INCLUDE_SSC_MARKS 7147 SSC_MARK_JOINING(); 7148 #endif 7149 // If the team size was reduced from the limit, set it to the new size 7150 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7151 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7152 // AC: last parameter "1" eliminates join barrier which won't work because 7153 // worker threads are in a fork barrier waiting for more parallel regions 7154 __kmp_join_call(loc, gtid 7155 #if OMPT_SUPPORT 7156 , 7157 fork_context_intel 7158 #endif 7159 , 7160 1); 7161 } 7162 7163 int __kmp_invoke_teams_master(int gtid) { 7164 kmp_info_t *this_thr = __kmp_threads[gtid]; 7165 kmp_team_t *team = this_thr->th.th_team; 7166 #if KMP_DEBUG 7167 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7168 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7169 (void *)__kmp_teams_master); 7170 #endif 7171 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7172 #if OMPT_SUPPORT 7173 int tid = __kmp_tid_from_gtid(gtid); 7174 ompt_data_t *task_data = 7175 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7176 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7177 if (ompt_enabled.ompt_callback_implicit_task) { 7178 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7179 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7180 ompt_task_initial); 7181 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7182 } 7183 #endif 7184 __kmp_teams_master(gtid); 7185 #if OMPT_SUPPORT 7186 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7187 #endif 7188 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7189 return 1; 7190 } 7191 7192 /* this sets the requested number of threads for the next parallel region 7193 encountered by this team. since this should be enclosed in the forkjoin 7194 critical section it should avoid race conditions with asymmetrical nested 7195 parallelism */ 7196 7197 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7198 kmp_info_t *thr = __kmp_threads[gtid]; 7199 7200 if (num_threads > 0) 7201 thr->th.th_set_nproc = num_threads; 7202 } 7203 7204 /* this sets the requested number of teams for the teams region and/or 7205 the number of threads for the next parallel region encountered */ 7206 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7207 int num_threads) { 7208 kmp_info_t *thr = __kmp_threads[gtid]; 7209 KMP_DEBUG_ASSERT(num_teams >= 0); 7210 KMP_DEBUG_ASSERT(num_threads >= 0); 7211 7212 if (num_teams == 0) 7213 num_teams = 1; // default number of teams is 1. 
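// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the runtime: when no thread_limit value is
// supplied, the code below derives the per-team thread count from
// avail_proc/num_teams and then caps it by nthreads-var, thread-limit-var and
// the global teams capacity.  Condensed model with hypothetical names:
#if 0
#include <algorithm>

static int toy_threads_per_team(int num_teams, int avail_proc, int nthreads_var,
                                int thread_limit_var, int teams_max_nth) {
  int nth = avail_proc / num_teams;          // start from an even split
  nth = std::min(nth, nthreads_var);         // honor nthreads-var ICV
  nth = std::min(nth, thread_limit_var);     // honor thread-limit-var ICV
  if (num_teams * nth > teams_max_nth)       // keep the whole league in bounds
    nth = teams_max_nth / num_teams;
  return nth;
}
// e.g. toy_threads_per_team(4, 64, 16, 12, 256) == 12
#endif
// ---------------------------------------------------------------------------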
7214 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7215 if (!__kmp_reserve_warn) { 7216 __kmp_reserve_warn = 1; 7217 __kmp_msg(kmp_ms_warning, 7218 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7219 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7220 } 7221 num_teams = __kmp_teams_max_nth; 7222 } 7223 // Set number of teams (number of threads in the outer "parallel" of the 7224 // teams) 7225 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7226 7227 // Remember the number of threads for inner parallel regions 7228 if (!TCR_4(__kmp_init_middle)) 7229 __kmp_middle_initialize(); // get internal globals calculated 7230 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7231 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7232 if (num_threads == 0) { 7233 num_threads = __kmp_avail_proc / num_teams; 7234 // adjust num_threads w/o warning as it is not user setting 7235 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7236 // no thread_limit clause specified - do not change thread-limit-var ICV 7237 if (num_threads > __kmp_dflt_team_nth) { 7238 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7239 } 7240 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7241 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7242 } // prevent team size to exceed thread-limit-var 7243 if (num_teams * num_threads > __kmp_teams_max_nth) { 7244 num_threads = __kmp_teams_max_nth / num_teams; 7245 } 7246 } else { 7247 // This thread will be the master of the league masters 7248 // Store new thread limit; old limit is saved in th_cg_roots list 7249 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7250 // num_threads = min(num_threads, nthreads-var) 7251 if (num_threads > __kmp_dflt_team_nth) { 7252 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7253 } 7254 if (num_teams * num_threads > __kmp_teams_max_nth) { 7255 int new_threads = __kmp_teams_max_nth / num_teams; 7256 if (!__kmp_reserve_warn) { // user asked for too many threads 7257 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7258 __kmp_msg(kmp_ms_warning, 7259 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7260 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7261 } 7262 num_threads = new_threads; 7263 } 7264 } 7265 thr->th.th_teams_size.nth = num_threads; 7266 } 7267 7268 // Set the proc_bind var to use in the following parallel region. 7269 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7270 kmp_info_t *thr = __kmp_threads[gtid]; 7271 thr->th.th_set_proc_bind = proc_bind; 7272 } 7273 7274 /* Launch the worker threads into the microtask. */ 7275 7276 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7277 kmp_info_t *this_thr = __kmp_threads[gtid]; 7278 7279 #ifdef KMP_DEBUG 7280 int f; 7281 #endif /* KMP_DEBUG */ 7282 7283 KMP_DEBUG_ASSERT(team); 7284 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7285 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7286 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7287 7288 team->t.t_construct = 0; /* no single directives seen yet */ 7289 team->t.t_ordered.dt.t_value = 7290 0; /* thread 0 enters the ordered section first */ 7291 7292 /* Reset the identifiers on the dispatch buffer */ 7293 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7294 if (team->t.t_max_nproc > 1) { 7295 int i; 7296 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7297 team->t.t_disp_buffer[i].buffer_index = i; 7298 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7299 } 7300 } else { 7301 team->t.t_disp_buffer[0].buffer_index = 0; 7302 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7303 } 7304 7305 KMP_MB(); /* Flush all pending memory write invalidates. */ 7306 KMP_ASSERT(this_thr->th.th_team == team); 7307 7308 #ifdef KMP_DEBUG 7309 for (f = 0; f < team->t.t_nproc; f++) { 7310 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7311 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7312 } 7313 #endif /* KMP_DEBUG */ 7314 7315 /* release the worker threads so they may begin working */ 7316 __kmp_fork_barrier(gtid, 0); 7317 } 7318 7319 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7320 kmp_info_t *this_thr = __kmp_threads[gtid]; 7321 7322 KMP_DEBUG_ASSERT(team); 7323 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7324 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7325 KMP_MB(); /* Flush all pending memory write invalidates. */ 7326 7327 /* Join barrier after fork */ 7328 7329 #ifdef KMP_DEBUG 7330 if (__kmp_threads[gtid] && 7331 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7332 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7333 __kmp_threads[gtid]); 7334 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7335 "team->t.t_nproc=%d\n", 7336 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7337 team->t.t_nproc); 7338 __kmp_print_structure(); 7339 } 7340 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7341 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7342 #endif /* KMP_DEBUG */ 7343 7344 __kmp_join_barrier(gtid); /* wait for everyone */ 7345 #if OMPT_SUPPORT 7346 if (ompt_enabled.enabled && 7347 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7348 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7349 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7350 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7351 #if OMPT_OPTIONAL 7352 void *codeptr = NULL; 7353 if (KMP_MASTER_TID(ds_tid) && 7354 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7355 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7356 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7357 7358 if (ompt_enabled.ompt_callback_sync_region_wait) { 7359 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7360 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7361 codeptr); 7362 } 7363 if (ompt_enabled.ompt_callback_sync_region) { 7364 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7365 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7366 codeptr); 7367 } 7368 #endif 7369 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7370 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7371 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7372 } 7373 } 7374 #endif 7375 7376 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7377 KMP_ASSERT(this_thr->th.th_team == team); 7378 } 7379 7380 /* ------------------------------------------------------------------------ */ 7381 7382 #ifdef USE_LOAD_BALANCE 7383 7384 // Return the worker threads actively spinning in the hot team, if we 7385 // are at the outermost level of parallelism. Otherwise, return 0. 7386 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7387 int i; 7388 int retval; 7389 kmp_team_t *hot_team; 7390 7391 if (root->r.r_active) { 7392 return 0; 7393 } 7394 hot_team = root->r.r_hot_team; 7395 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7396 return hot_team->t.t_nproc - 1; // Don't count master thread 7397 } 7398 7399 // Skip the master thread - it is accounted for elsewhere. 7400 retval = 0; 7401 for (i = 1; i < hot_team->t.t_nproc; i++) { 7402 if (hot_team->t.t_threads[i]->th.th_active) { 7403 retval++; 7404 } 7405 } 7406 return retval; 7407 } 7408 7409 // Perform an automatic adjustment to the number of 7410 // threads used by the next parallel region. 7411 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7412 int retval; 7413 int pool_active; 7414 int hot_team_active; 7415 int team_curr_active; 7416 int system_active; 7417 7418 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7419 set_nproc)); 7420 KMP_DEBUG_ASSERT(root); 7421 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7422 ->th.th_current_task->td_icvs.dynamic == TRUE); 7423 KMP_DEBUG_ASSERT(set_nproc > 1); 7424 7425 if (set_nproc == 1) { 7426 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7427 return 1; 7428 } 7429 7430 // Threads that are active in the thread pool, active in the hot team for this 7431 // particular root (if we are at the outer par level), and the currently 7432 // executing thread (to become the master) are available to add to the new 7433 // team, but are currently contributing to the system load, and must be 7434 // accounted for. 7435 pool_active = __kmp_thread_pool_active_nth; 7436 hot_team_active = __kmp_active_hot_team_nproc(root); 7437 team_curr_active = pool_active + hot_team_active + 1; 7438 7439 // Check the system load. 7440 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7441 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7442 "hot team active = %d\n", 7443 system_active, pool_active, hot_team_active)); 7444 7445 if (system_active < 0) { 7446 // There was an error reading the necessary info from /proc, so use the 7447 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7448 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7449 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7450 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7451 7452 // Make this call behave like the thread limit algorithm. 7453 retval = __kmp_avail_proc - __kmp_nth + 7454 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7455 if (retval > set_nproc) { 7456 retval = set_nproc; 7457 } 7458 if (retval < KMP_MIN_NTH) { 7459 retval = KMP_MIN_NTH; 7460 } 7461 7462 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7463 retval)); 7464 return retval; 7465 } 7466 7467 // There is a slight delay in the load balance algorithm in detecting new 7468 // running procs. The real system load at this instant should be at least as 7469 // large as the #active omp thread that are available to add to the team. 
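// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the runtime: the computation just below
// gives the new team the idle processors (avail_proc - system_active) plus the
// threads this team already accounts for, clamped to [KMP_MIN_NTH, set_nproc].
// Condensed model; KMP_MIN_NTH is assumed to be 1 here:
#if 0
#include <algorithm>

static int toy_load_balance_nproc(int avail_proc, int system_active,
                                  int team_curr_active, int set_nproc) {
  system_active = std::max(system_active, team_curr_active); // see comment above
  int n = avail_proc - system_active + team_curr_active;
  n = std::min(n, set_nproc);
  return std::max(n, 1 /* KMP_MIN_NTH assumed */);
}
// e.g. 16 procs, 10 active system-wide, 4 of them ours, 12 requested:
//      toy_load_balance_nproc(16, 10, 4, 12) == 10
#endif
// ---------------------------------------------------------------------------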
7470 if (system_active < team_curr_active) { 7471 system_active = team_curr_active; 7472 } 7473 retval = __kmp_avail_proc - system_active + team_curr_active; 7474 if (retval > set_nproc) { 7475 retval = set_nproc; 7476 } 7477 if (retval < KMP_MIN_NTH) { 7478 retval = KMP_MIN_NTH; 7479 } 7480 7481 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7482 return retval; 7483 } // __kmp_load_balance_nproc() 7484 7485 #endif /* USE_LOAD_BALANCE */ 7486 7487 /* ------------------------------------------------------------------------ */ 7488 7489 /* NOTE: this is called with the __kmp_init_lock held */ 7490 void __kmp_cleanup(void) { 7491 int f; 7492 7493 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7494 7495 if (TCR_4(__kmp_init_parallel)) { 7496 #if KMP_HANDLE_SIGNALS 7497 __kmp_remove_signals(); 7498 #endif 7499 TCW_4(__kmp_init_parallel, FALSE); 7500 } 7501 7502 if (TCR_4(__kmp_init_middle)) { 7503 #if KMP_AFFINITY_SUPPORTED 7504 __kmp_affinity_uninitialize(); 7505 #endif /* KMP_AFFINITY_SUPPORTED */ 7506 __kmp_cleanup_hierarchy(); 7507 TCW_4(__kmp_init_middle, FALSE); 7508 } 7509 7510 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7511 7512 if (__kmp_init_serial) { 7513 __kmp_runtime_destroy(); 7514 __kmp_init_serial = FALSE; 7515 } 7516 7517 __kmp_cleanup_threadprivate_caches(); 7518 7519 for (f = 0; f < __kmp_threads_capacity; f++) { 7520 if (__kmp_root[f] != NULL) { 7521 __kmp_free(__kmp_root[f]); 7522 __kmp_root[f] = NULL; 7523 } 7524 } 7525 __kmp_free(__kmp_threads); 7526 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7527 // there is no need in freeing __kmp_root. 7528 __kmp_threads = NULL; 7529 __kmp_root = NULL; 7530 __kmp_threads_capacity = 0; 7531 7532 #if KMP_USE_DYNAMIC_LOCK 7533 __kmp_cleanup_indirect_user_locks(); 7534 #else 7535 __kmp_cleanup_user_locks(); 7536 #endif 7537 7538 #if KMP_AFFINITY_SUPPORTED 7539 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7540 __kmp_cpuinfo_file = NULL; 7541 #endif /* KMP_AFFINITY_SUPPORTED */ 7542 7543 #if KMP_USE_ADAPTIVE_LOCKS 7544 #if KMP_DEBUG_ADAPTIVE_LOCKS 7545 __kmp_print_speculative_stats(); 7546 #endif 7547 #endif 7548 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7549 __kmp_nested_nth.nth = NULL; 7550 __kmp_nested_nth.size = 0; 7551 __kmp_nested_nth.used = 0; 7552 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7553 __kmp_nested_proc_bind.bind_types = NULL; 7554 __kmp_nested_proc_bind.size = 0; 7555 __kmp_nested_proc_bind.used = 0; 7556 if (__kmp_affinity_format) { 7557 KMP_INTERNAL_FREE(__kmp_affinity_format); 7558 __kmp_affinity_format = NULL; 7559 } 7560 7561 __kmp_i18n_catclose(); 7562 7563 #if KMP_USE_HIER_SCHED 7564 __kmp_hier_scheds.deallocate(); 7565 #endif 7566 7567 #if KMP_STATS_ENABLED 7568 __kmp_stats_fini(); 7569 #endif 7570 7571 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7572 } 7573 7574 /* ------------------------------------------------------------------------ */ 7575 7576 int __kmp_ignore_mppbeg(void) { 7577 char *env; 7578 7579 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7580 if (__kmp_str_match_false(env)) 7581 return FALSE; 7582 } 7583 // By default __kmpc_begin() is no-op. 7584 return TRUE; 7585 } 7586 7587 int __kmp_ignore_mppend(void) { 7588 char *env; 7589 7590 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7591 if (__kmp_str_match_false(env)) 7592 return FALSE; 7593 } 7594 // By default __kmpc_end() is no-op. 
7595 return TRUE; 7596 } 7597 7598 void __kmp_internal_begin(void) { 7599 int gtid; 7600 kmp_root_t *root; 7601 7602 /* this is a very important step as it will register new sibling threads 7603 and assign these new uber threads a new gtid */ 7604 gtid = __kmp_entry_gtid(); 7605 root = __kmp_threads[gtid]->th.th_root; 7606 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7607 7608 if (root->r.r_begin) 7609 return; 7610 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7611 if (root->r.r_begin) { 7612 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7613 return; 7614 } 7615 7616 root->r.r_begin = TRUE; 7617 7618 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7619 } 7620 7621 /* ------------------------------------------------------------------------ */ 7622 7623 void __kmp_user_set_library(enum library_type arg) { 7624 int gtid; 7625 kmp_root_t *root; 7626 kmp_info_t *thread; 7627 7628 /* first, make sure we are initialized so we can get our gtid */ 7629 7630 gtid = __kmp_entry_gtid(); 7631 thread = __kmp_threads[gtid]; 7632 7633 root = thread->th.th_root; 7634 7635 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7636 library_serial)); 7637 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7638 thread */ 7639 KMP_WARNING(SetLibraryIncorrectCall); 7640 return; 7641 } 7642 7643 switch (arg) { 7644 case library_serial: 7645 thread->th.th_set_nproc = 0; 7646 set__nproc(thread, 1); 7647 break; 7648 case library_turnaround: 7649 thread->th.th_set_nproc = 0; 7650 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7651 : __kmp_dflt_team_nth_ub); 7652 break; 7653 case library_throughput: 7654 thread->th.th_set_nproc = 0; 7655 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7656 : __kmp_dflt_team_nth_ub); 7657 break; 7658 default: 7659 KMP_FATAL(UnknownLibraryType, arg); 7660 } 7661 7662 __kmp_aux_set_library(arg); 7663 } 7664 7665 void __kmp_aux_set_stacksize(size_t arg) { 7666 if (!__kmp_init_serial) 7667 __kmp_serial_initialize(); 7668 7669 #if KMP_OS_DARWIN 7670 if (arg & (0x1000 - 1)) { 7671 arg &= ~(0x1000 - 1); 7672 if (arg + 0x1000) /* check for overflow if we round up */ 7673 arg += 0x1000; 7674 } 7675 #endif 7676 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7677 7678 /* only change the default stacksize before the first parallel region */ 7679 if (!TCR_4(__kmp_init_parallel)) { 7680 size_t value = arg; /* argument is in bytes */ 7681 7682 if (value < __kmp_sys_min_stksize) 7683 value = __kmp_sys_min_stksize; 7684 else if (value > KMP_MAX_STKSIZE) 7685 value = KMP_MAX_STKSIZE; 7686 7687 __kmp_stksize = value; 7688 7689 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7690 } 7691 7692 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7693 } 7694 7695 /* set the behaviour of the runtime library */ 7696 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7697 void __kmp_aux_set_library(enum library_type arg) { 7698 __kmp_library = arg; 7699 7700 switch (__kmp_library) { 7701 case library_serial: { 7702 KMP_INFORM(LibraryIsSerial); 7703 } break; 7704 case library_turnaround: 7705 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 7706 __kmp_use_yield = 2; // only yield when oversubscribed 7707 break; 7708 case library_throughput: 7709 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 7710 __kmp_dflt_blocktime = 200; 7711 break; 7712 default: 7713 KMP_FATAL(UnknownLibraryType, arg); 7714 } 7715 } 7716 7717 /* Getting team information common for all team API */ 7718 // Returns NULL if not in teams construct 7719 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 7720 kmp_info_t *thr = __kmp_entry_thread(); 7721 teams_serialized = 0; 7722 if (thr->th.th_teams_microtask) { 7723 kmp_team_t *team = thr->th.th_team; 7724 int tlevel = thr->th.th_teams_level; // the level of the teams construct 7725 int ii = team->t.t_level; 7726 teams_serialized = team->t.t_serialized; 7727 int level = tlevel + 1; 7728 KMP_DEBUG_ASSERT(ii >= tlevel); 7729 while (ii > level) { 7730 for (teams_serialized = team->t.t_serialized; 7731 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 7732 } 7733 if (team->t.t_serialized && (!teams_serialized)) { 7734 team = team->t.t_parent; 7735 continue; 7736 } 7737 if (ii > level) { 7738 team = team->t.t_parent; 7739 ii--; 7740 } 7741 } 7742 return team; 7743 } 7744 return NULL; 7745 } 7746 7747 int __kmp_aux_get_team_num() { 7748 int serialized; 7749 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7750 if (team) { 7751 if (serialized > 1) { 7752 return 0; // teams region is serialized ( 1 team of 1 thread ). 7753 } else { 7754 return team->t.t_master_tid; 7755 } 7756 } 7757 return 0; 7758 } 7759 7760 int __kmp_aux_get_num_teams() { 7761 int serialized; 7762 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7763 if (team) { 7764 if (serialized > 1) { 7765 return 1; 7766 } else { 7767 return team->t.t_parent->t.t_nproc; 7768 } 7769 } 7770 return 1; 7771 } 7772 7773 /* ------------------------------------------------------------------------ */ 7774 7775 /* 7776 * Affinity Format Parser 7777 * 7778 * Field is in form of: %[[[0].]size]type 7779 * % and type are required (%% means print a literal '%') 7780 * type is either single char or long name surrounded by {}, 7781 * e.g., N or {num_threads} 7782 * 0 => leading zeros 7783 * . => right justified when size is specified 7784 * by default output is left justified 7785 * size is the *minimum* field length 7786 * All other characters are printed as is 7787 * 7788 * Available field types: 7789 * L {thread_level} - omp_get_level() 7790 * n {thread_num} - omp_get_thread_num() 7791 * h {host} - name of host machine 7792 * P {process_id} - process id (integer) 7793 * T {thread_identifier} - native thread identifier (integer) 7794 * N {num_threads} - omp_get_num_threads() 7795 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 7796 * a {thread_affinity} - comma separated list of integers or integer ranges 7797 * (values of affinity mask) 7798 * 7799 * Implementation-specific field types can be added 7800 * If a type is unknown, print "undefined" 7801 */ 7802 7803 // Structure holding the short name, long name, and corresponding data type 7804 // for snprintf. A table of these will represent the entire valid keyword 7805 // field types. 
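// ---------------------------------------------------------------------------
// Illustrative sketch, not part of the runtime: the parser below lowers a field
// spec from the grammar above to an ordinary printf conversion -- '0' keeps the
// zero-pad flag, a '.' suppresses the default '-' (left justification), the
// digits become the minimum width, and the table entry supplies 'd' or 's'.
// Worked example with plain snprintf:
#if 0
#include <cstdio>

int main() {
  char out[32];
  std::snprintf(out, sizeof(out), "%04d", 7);        // "%0.4n" -> "0007"
  std::printf("[%s]\n", out);
  std::snprintf(out, sizeof(out), "%-4d", 7);        // "%4n"   -> "7   "
  std::printf("[%s]\n", out);
  std::snprintf(out, sizeof(out), "%-8s", "myhost"); // "%8H"   -> "myhost  "
  std::printf("[%s]\n", out);
  return 0;
}
#endif
// ---------------------------------------------------------------------------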
7806 typedef struct kmp_affinity_format_field_t { 7807 char short_name; // from spec e.g., L -> thread level 7808 const char *long_name; // from spec thread_level -> thread level 7809 char field_format; // data type for snprintf (typically 'd' or 's' 7810 // for integer or string) 7811 } kmp_affinity_format_field_t; 7812 7813 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 7814 #if KMP_AFFINITY_SUPPORTED 7815 {'A', "thread_affinity", 's'}, 7816 #endif 7817 {'t', "team_num", 'd'}, 7818 {'T', "num_teams", 'd'}, 7819 {'L', "nesting_level", 'd'}, 7820 {'n', "thread_num", 'd'}, 7821 {'N', "num_threads", 'd'}, 7822 {'a', "ancestor_tnum", 'd'}, 7823 {'H', "host", 's'}, 7824 {'P', "process_id", 'd'}, 7825 {'i', "native_thread_id", 'd'}}; 7826 7827 // Return the number of characters it takes to hold field 7828 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 7829 const char **ptr, 7830 kmp_str_buf_t *field_buffer) { 7831 int rc, format_index, field_value; 7832 const char *width_left, *width_right; 7833 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 7834 static const int FORMAT_SIZE = 20; 7835 char format[FORMAT_SIZE] = {0}; 7836 char absolute_short_name = 0; 7837 7838 KMP_DEBUG_ASSERT(gtid >= 0); 7839 KMP_DEBUG_ASSERT(th); 7840 KMP_DEBUG_ASSERT(**ptr == '%'); 7841 KMP_DEBUG_ASSERT(field_buffer); 7842 7843 __kmp_str_buf_clear(field_buffer); 7844 7845 // Skip the initial % 7846 (*ptr)++; 7847 7848 // Check for %% first 7849 if (**ptr == '%') { 7850 __kmp_str_buf_cat(field_buffer, "%", 1); 7851 (*ptr)++; // skip over the second % 7852 return 1; 7853 } 7854 7855 // Parse field modifiers if they are present 7856 pad_zeros = false; 7857 if (**ptr == '0') { 7858 pad_zeros = true; 7859 (*ptr)++; // skip over 0 7860 } 7861 right_justify = false; 7862 if (**ptr == '.') { 7863 right_justify = true; 7864 (*ptr)++; // skip over . 7865 } 7866 // Parse width of field: [width_left, width_right) 7867 width_left = width_right = NULL; 7868 if (**ptr >= '0' && **ptr <= '9') { 7869 width_left = *ptr; 7870 SKIP_DIGITS(*ptr); 7871 width_right = *ptr; 7872 } 7873 7874 // Create the format for KMP_SNPRINTF based on flags parsed above 7875 format_index = 0; 7876 format[format_index++] = '%'; 7877 if (!right_justify) 7878 format[format_index++] = '-'; 7879 if (pad_zeros) 7880 format[format_index++] = '0'; 7881 if (width_left && width_right) { 7882 int i = 0; 7883 // Only allow 8 digit number widths. 
7884 // This also prevents overflowing format variable 7885 while (i < 8 && width_left < width_right) { 7886 format[format_index++] = *width_left; 7887 width_left++; 7888 i++; 7889 } 7890 } 7891 7892 // Parse a name (long or short) 7893 // Canonicalize the name into absolute_short_name 7894 found_valid_name = false; 7895 parse_long_name = (**ptr == '{'); 7896 if (parse_long_name) 7897 (*ptr)++; // skip initial left brace 7898 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 7899 sizeof(__kmp_affinity_format_table[0]); 7900 ++i) { 7901 char short_name = __kmp_affinity_format_table[i].short_name; 7902 const char *long_name = __kmp_affinity_format_table[i].long_name; 7903 char field_format = __kmp_affinity_format_table[i].field_format; 7904 if (parse_long_name) { 7905 int length = KMP_STRLEN(long_name); 7906 if (strncmp(*ptr, long_name, length) == 0) { 7907 found_valid_name = true; 7908 (*ptr) += length; // skip the long name 7909 } 7910 } else if (**ptr == short_name) { 7911 found_valid_name = true; 7912 (*ptr)++; // skip the short name 7913 } 7914 if (found_valid_name) { 7915 format[format_index++] = field_format; 7916 format[format_index++] = '\0'; 7917 absolute_short_name = short_name; 7918 break; 7919 } 7920 } 7921 if (parse_long_name) { 7922 if (**ptr != '}') { 7923 absolute_short_name = 0; 7924 } else { 7925 (*ptr)++; // skip over the right brace 7926 } 7927 } 7928 7929 // Attempt to fill the buffer with the requested 7930 // value using snprintf within __kmp_str_buf_print() 7931 switch (absolute_short_name) { 7932 case 't': 7933 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 7934 break; 7935 case 'T': 7936 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 7937 break; 7938 case 'L': 7939 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 7940 break; 7941 case 'n': 7942 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 7943 break; 7944 case 'H': { 7945 static const int BUFFER_SIZE = 256; 7946 char buf[BUFFER_SIZE]; 7947 __kmp_expand_host_name(buf, BUFFER_SIZE); 7948 rc = __kmp_str_buf_print(field_buffer, format, buf); 7949 } break; 7950 case 'P': 7951 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 7952 break; 7953 case 'i': 7954 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 7955 break; 7956 case 'N': 7957 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 7958 break; 7959 case 'a': 7960 field_value = 7961 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 7962 rc = __kmp_str_buf_print(field_buffer, format, field_value); 7963 break; 7964 #if KMP_AFFINITY_SUPPORTED 7965 case 'A': { 7966 kmp_str_buf_t buf; 7967 __kmp_str_buf_init(&buf); 7968 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 7969 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 7970 __kmp_str_buf_free(&buf); 7971 } break; 7972 #endif 7973 default: 7974 // According to spec, If an implementation does not have info for field 7975 // type, then "undefined" is printed 7976 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 7977 // Skip the field 7978 if (parse_long_name) { 7979 SKIP_TOKEN(*ptr); 7980 if (**ptr == '}') 7981 (*ptr)++; 7982 } else { 7983 (*ptr)++; 7984 } 7985 } 7986 7987 KMP_ASSERT(format_index <= FORMAT_SIZE); 7988 return rc; 7989 } 7990 7991 /* 7992 * Return number of characters needed to hold the affinity string 7993 * (not including null byte character) 7994 * The resultant string is printed to buffer, which 
the caller can then 7995 * handle afterwards 7996 */ 7997 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 7998 kmp_str_buf_t *buffer) { 7999 const char *parse_ptr; 8000 size_t retval; 8001 const kmp_info_t *th; 8002 kmp_str_buf_t field; 8003 8004 KMP_DEBUG_ASSERT(buffer); 8005 KMP_DEBUG_ASSERT(gtid >= 0); 8006 8007 __kmp_str_buf_init(&field); 8008 __kmp_str_buf_clear(buffer); 8009 8010 th = __kmp_threads[gtid]; 8011 retval = 0; 8012 8013 // If format is NULL or zero-length string, then we use 8014 // affinity-format-var ICV 8015 parse_ptr = format; 8016 if (parse_ptr == NULL || *parse_ptr == '\0') { 8017 parse_ptr = __kmp_affinity_format; 8018 } 8019 KMP_DEBUG_ASSERT(parse_ptr); 8020 8021 while (*parse_ptr != '\0') { 8022 // Parse a field 8023 if (*parse_ptr == '%') { 8024 // Put field in the buffer 8025 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8026 __kmp_str_buf_catbuf(buffer, &field); 8027 retval += rc; 8028 } else { 8029 // Put literal character in buffer 8030 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8031 retval++; 8032 parse_ptr++; 8033 } 8034 } 8035 __kmp_str_buf_free(&field); 8036 return retval; 8037 } 8038 8039 // Displays the affinity string to stdout 8040 void __kmp_aux_display_affinity(int gtid, const char *format) { 8041 kmp_str_buf_t buf; 8042 __kmp_str_buf_init(&buf); 8043 __kmp_aux_capture_affinity(gtid, format, &buf); 8044 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8045 __kmp_str_buf_free(&buf); 8046 } 8047 8048 /* ------------------------------------------------------------------------ */ 8049 8050 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8051 int blocktime = arg; /* argument is in milliseconds */ 8052 #if KMP_USE_MONITOR 8053 int bt_intervals; 8054 #endif 8055 int bt_set; 8056 8057 __kmp_save_internal_controls(thread); 8058 8059 /* Normalize and set blocktime for the teams */ 8060 if (blocktime < KMP_MIN_BLOCKTIME) 8061 blocktime = KMP_MIN_BLOCKTIME; 8062 else if (blocktime > KMP_MAX_BLOCKTIME) 8063 blocktime = KMP_MAX_BLOCKTIME; 8064 8065 set__blocktime_team(thread->th.th_team, tid, blocktime); 8066 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8067 8068 #if KMP_USE_MONITOR 8069 /* Calculate and set blocktime intervals for the teams */ 8070 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8071 8072 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8073 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8074 #endif 8075 8076 /* Set whether blocktime has been set to "TRUE" */ 8077 bt_set = TRUE; 8078 8079 set__bt_set_team(thread->th.th_team, tid, bt_set); 8080 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8081 #if KMP_USE_MONITOR 8082 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8083 "bt_intervals=%d, monitor_updates=%d\n", 8084 __kmp_gtid_from_tid(tid, thread->th.th_team), 8085 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8086 __kmp_monitor_wakeups)); 8087 #else 8088 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8089 __kmp_gtid_from_tid(tid, thread->th.th_team), 8090 thread->th.th_team->t.t_id, tid, blocktime)); 8091 #endif 8092 } 8093 8094 void __kmp_aux_set_defaults(char const *str, int len) { 8095 if (!__kmp_init_serial) { 8096 __kmp_serial_initialize(); 8097 } 8098 __kmp_env_initialize(str); 8099 8100 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8101 __kmp_env_print(); 8102 } 8103 } // __kmp_aux_set_defaults 8104 8105 /* 
------------------------------------------------------------------------ */ 8106 /* internal fast reduction routines */ 8107 8108 PACKED_REDUCTION_METHOD_T 8109 __kmp_determine_reduction_method( 8110 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8111 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8112 kmp_critical_name *lck) { 8113 8114 // Default reduction method: critical construct ( lck != NULL, like in current 8115 // PAROPT ) 8116 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8117 // can be selected by RTL 8118 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8119 // can be selected by RTL 8120 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8121 // among generated by PAROPT. 8122 8123 PACKED_REDUCTION_METHOD_T retval; 8124 8125 int team_size; 8126 8127 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8128 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8129 8130 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8131 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8132 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8133 8134 retval = critical_reduce_block; 8135 8136 // another choice of getting a team size (with 1 dynamic deference) is slower 8137 team_size = __kmp_get_team_num_threads(global_tid); 8138 if (team_size == 1) { 8139 8140 retval = empty_reduce_block; 8141 8142 } else { 8143 8144 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8145 8146 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8147 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8148 8149 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8150 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8151 8152 int teamsize_cutoff = 4; 8153 8154 #if KMP_MIC_SUPPORTED 8155 if (__kmp_mic_type != non_mic) { 8156 teamsize_cutoff = 8; 8157 } 8158 #endif 8159 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8160 if (tree_available) { 8161 if (team_size <= teamsize_cutoff) { 8162 if (atomic_available) { 8163 retval = atomic_reduce_block; 8164 } 8165 } else { 8166 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8167 } 8168 } else if (atomic_available) { 8169 retval = atomic_reduce_block; 8170 } 8171 #else 8172 #error "Unknown or unsupported OS" 8173 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8174 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8175 8176 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8177 8178 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8179 8180 // basic tuning 8181 8182 if (atomic_available) { 8183 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}

// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}
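
// Worked example (illustrative, derived from the heuristic above, assuming an
// x86_64 Linux build with no forced method):
//   team_size == 1                               -> empty_reduce_block
//   team_size == 4,  atomic + tree generated     -> atomic_reduce_block
//   team_size == 16, tree generated              -> TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER
//   nothing generated by the compiler            -> critical_reduce_block
// A specific method can also be forced (typically via the KMP_FORCE_REDUCTION
// environment variable, which sets __kmp_force_reduction_method); the switch
// above then verifies that the requested method was actually generated and
// otherwise falls back to the critical-section method.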
// Soft resume sets __kmp_pause_status to kmp_not_paused and wakes up all
// sleeping threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
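
// Illustrative sketch (not part of the runtime): __kmp_pause_resource is
// reached from user code through the OpenMP 5.0 pause-resource API declared
// in omp.h; __kmpc_pause_resource is the compiler-visible entry point noted
// in the comment above. Rough usage:
//
//   #include <omp.h>
//
//   void release_runtime_resources() {
//     // Soft pause: worker threads go to sleep but the runtime stays
//     // initialized; returns 0 on success, nonzero otherwise.
//     if (omp_pause_resource_all(omp_pause_soft) != 0) {
//       // request rejected, e.g. the runtime is already paused
//     }
//     // A subsequent parallel region resumes a soft-paused runtime; a hard
//     // pause would instead shut the runtime down completely.
//   }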