/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] =
    KMP_VERSION_PREFIX "API version: 5.0 (201611)";

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_unregister_library(void); // called by __kmp_internal_end()
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid.
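
   The function below tries three lookup modes, in order of preference: native
   thread-local data (__kmp_gtid_mode >= 3), keyed TLS via
   __kmp_gtid_get_specific() (__kmp_gtid_mode >= 2), and finally a search that
   locates which registered thread's stack contains the address of a local
   variable. A rough sketch of that last containment test (illustrative only,
   not the exact code below):

     char *addr = (char *)&some_local;
     // the stack grows down, so ds_stackbase is its highest address
     if (addr <= stack_base && (size_t)(stack_base - addr) <= stack_size)
       return i; // addr lies inside thread i's registered stack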
*/ 99 int __kmp_get_global_thread_id() { 100 int i; 101 kmp_info_t **other_threads; 102 size_t stack_data; 103 char *stack_addr; 104 size_t stack_size; 105 char *stack_base; 106 107 KA_TRACE( 108 1000, 109 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 110 __kmp_nth, __kmp_all_nth)); 111 112 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 113 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 114 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 115 __kmp_init_gtid for this to work. */ 116 117 if (!TCR_4(__kmp_init_gtid)) 118 return KMP_GTID_DNE; 119 120 #ifdef KMP_TDATA_GTID 121 if (TCR_4(__kmp_gtid_mode) >= 3) { 122 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 123 return __kmp_gtid; 124 } 125 #endif 126 if (TCR_4(__kmp_gtid_mode) >= 2) { 127 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 128 return __kmp_gtid_get_specific(); 129 } 130 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 131 132 stack_addr = (char *)&stack_data; 133 other_threads = __kmp_threads; 134 135 /* ATT: The code below is a source of potential bugs due to unsynchronized 136 access to __kmp_threads array. For example: 137 1. Current thread loads other_threads[i] to thr and checks it, it is 138 non-NULL. 139 2. Current thread is suspended by OS. 140 3. Another thread unregisters and finishes (debug versions of free() 141 may fill memory with something like 0xEF). 142 4. Current thread is resumed. 143 5. Current thread reads junk from *thr. 144 TODO: Fix it. --ln */ 145 146 for (i = 0; i < __kmp_threads_capacity; i++) { 147 148 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 149 if (!thr) 150 continue; 151 152 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 153 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 154 155 /* stack grows down -- search through all of the active threads */ 156 157 if (stack_addr <= stack_base) { 158 size_t stack_diff = stack_base - stack_addr; 159 160 if (stack_diff <= stack_size) { 161 /* The only way we can be closer than the allocated */ 162 /* stack size is if we are running on this thread. */ 163 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 164 return i; 165 } 166 } 167 } 168 169 /* get specific to try and determine our gtid */ 170 KA_TRACE(1000, 171 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ?
"initial" : "actual"); 275 } 276 } 277 278 /* No point in checking ubermaster threads since they use refinement and 279 * cannot overlap */ 280 gtid = __kmp_gtid_from_thread(th); 281 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 282 KA_TRACE(10, 283 ("__kmp_check_stack_overlap: performing extensive checking\n")); 284 if (stack_beg == NULL) { 285 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 286 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 287 } 288 289 for (f = 0; f < __kmp_threads_capacity; f++) { 290 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 291 292 if (f_th && f_th != th) { 293 char *other_stack_end = 294 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 295 char *other_stack_beg = 296 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 297 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 298 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 299 300 /* Print the other stack values before the abort */ 301 if (__kmp_storage_map) 302 __kmp_print_storage_map_gtid( 303 -1, other_stack_beg, other_stack_end, 304 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 305 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 306 307 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 308 __kmp_msg_null); 309 } 310 } 311 } 312 } 313 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 314 } 315 316 /* ------------------------------------------------------------------------ */ 317 318 void __kmp_infinite_loop(void) { 319 static int done = FALSE; 320 321 while (!done) { 322 KMP_YIELD(TRUE); 323 } 324 } 325 326 #define MAX_MESSAGE 512 327 328 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 329 char const *format, ...) { 330 char buffer[MAX_MESSAGE]; 331 va_list ap; 332 333 va_start(ap, format); 334 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 335 p2, (unsigned long)size, format); 336 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 337 __kmp_vprintf(kmp_err, buffer, ap); 338 #if KMP_PRINT_DATA_PLACEMENT 339 int node; 340 if (gtid >= 0) { 341 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 342 if (__kmp_storage_map_verbose) { 343 node = __kmp_get_host_node(p1); 344 if (node < 0) /* doesn't work, so don't try this next time */ 345 __kmp_storage_map_verbose = FALSE; 346 else { 347 char *last; 348 int lastNode; 349 int localProc = __kmp_get_cpu_from_gtid(gtid); 350 351 const int page_size = KMP_GET_PAGE_SIZE(); 352 353 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 354 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 355 if (localProc >= 0) 356 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 357 localProc >> 1); 358 else 359 __kmp_printf_no_lock(" GTID %d\n", gtid); 360 #if KMP_USE_PRCTL 361 /* The more elaborate format is disabled for now because of the prctl 362 * hanging bug. */ 363 do { 364 last = p1; 365 lastNode = node; 366 /* This loop collates adjacent pages with the same host node. 
*/ 367 do { 368 (char *)p1 += page_size; 369 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 370 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 371 lastNode); 372 } while (p1 <= p2); 373 #else 374 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 375 (char *)p1 + (page_size - 1), 376 __kmp_get_host_node(p1)); 377 if (p1 < p2) { 378 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 379 (char *)p2 + (page_size - 1), 380 __kmp_get_host_node(p2)); 381 } 382 #endif 383 } 384 } 385 } else 386 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 387 } 388 #endif /* KMP_PRINT_DATA_PLACEMENT */ 389 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 390 } 391 392 void __kmp_warn(char const *format, ...) { 393 char buffer[MAX_MESSAGE]; 394 va_list ap; 395 396 if (__kmp_generate_warnings == kmp_warnings_off) { 397 return; 398 } 399 400 va_start(ap, format); 401 402 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 403 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 404 __kmp_vprintf(kmp_err, buffer, ap); 405 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 406 407 va_end(ap); 408 } 409 410 void __kmp_abort_process() { 411 // Later threads may stall here, but that's ok because abort() will kill them. 412 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 413 414 if (__kmp_debug_buf) { 415 __kmp_dump_debug_buffer(); 416 } 417 418 if (KMP_OS_WINDOWS) { 419 // Let other threads know of abnormal termination and prevent deadlock 420 // if abort happened during library initialization or shutdown 421 __kmp_global.g.g_abort = SIGABRT; 422 423 /* On Windows* OS by default abort() causes pop-up error box, which stalls 424 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 425 boxes. _set_abort_behavior() works well, but this function is not 426 available in VS7 (this is not problem for DLL, but it is a problem for 427 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 428 help, at least in some versions of MS C RTL. 429 430 It seems following sequence is the only way to simulate abort() and 431 avoid pop-up error box. */ 432 raise(SIGABRT); 433 _exit(3); // Just in case, if signal ignored, exit anyway. 434 } else { 435 abort(); 436 } 437 438 __kmp_infinite_loop(); 439 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 440 441 } // __kmp_abort_process 442 443 void __kmp_abort_thread(void) { 444 // TODO: Eliminate g_abort global variable and this function. 445 // In case of abort just call abort(), it will kill all the threads. 446 __kmp_infinite_loop(); 447 } // __kmp_abort_thread 448 449 /* Print out the storage map for the major kmp_info_t thread data structures 450 that are allocated together. 
*/ 451 452 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 453 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 454 gtid); 455 456 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 457 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 458 459 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 460 sizeof(kmp_local_t), "th_%d.th_local", gtid); 461 462 __kmp_print_storage_map_gtid( 463 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 464 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 465 466 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 467 &thr->th.th_bar[bs_plain_barrier + 1], 468 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 469 gtid); 470 471 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 472 &thr->th.th_bar[bs_forkjoin_barrier + 1], 473 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 474 gtid); 475 476 #if KMP_FAST_REDUCTION_BARRIER 477 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 478 &thr->th.th_bar[bs_reduction_barrier + 1], 479 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 480 gtid); 481 #endif // KMP_FAST_REDUCTION_BARRIER 482 } 483 484 /* Print out the storage map for the major kmp_team_t team data structures 485 that are allocated together. */ 486 487 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 488 int team_id, int num_thr) { 489 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 490 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 491 header, team_id); 492 493 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 494 &team->t.t_bar[bs_last_barrier], 495 sizeof(kmp_balign_team_t) * bs_last_barrier, 496 "%s_%d.t_bar", header, team_id); 497 498 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 499 &team->t.t_bar[bs_plain_barrier + 1], 500 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 501 header, team_id); 502 503 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 504 &team->t.t_bar[bs_forkjoin_barrier + 1], 505 sizeof(kmp_balign_team_t), 506 "%s_%d.t_bar[forkjoin]", header, team_id); 507 508 #if KMP_FAST_REDUCTION_BARRIER 509 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 510 &team->t.t_bar[bs_reduction_barrier + 1], 511 sizeof(kmp_balign_team_t), 512 "%s_%d.t_bar[reduction]", header, team_id); 513 #endif // KMP_FAST_REDUCTION_BARRIER 514 515 __kmp_print_storage_map_gtid( 516 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 517 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 518 519 __kmp_print_storage_map_gtid( 520 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 521 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 522 523 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 524 &team->t.t_disp_buffer[num_disp_buff], 525 sizeof(dispatch_shared_info_t) * num_disp_buff, 526 "%s_%d.t_disp_buffer", header, team_id); 527 } 528 529 static void __kmp_init_allocator() { __kmp_init_memkind(); } 530 static void __kmp_fini_allocator() { __kmp_fini_memkind(); } 531 532 /* ------------------------------------------------------------------------ */ 533 534 #if KMP_DYNAMIC_LIB 535 #if KMP_OS_WINDOWS 536 537 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) { 538 // TODO: Change to __kmp_break_bootstrap_lock(). 
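  // Re-initializing the lock is used here as a stand-in for a real "break
  // lock" primitive: whatever state a terminated thread left the lock in, it
  // ends up in the released state.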
  __kmp_init_bootstrap_lock(lck); // leave the lock in the released state
}

static void __kmp_reset_locks_on_process_detach(int gtid_req) {
  int i;
  int thread_count;

  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). The OS terminates the other threads
  // (except the one calling ProcessExit or FreeLibrary), so it might appear
  // safe to access __kmp_threads[] without taking the forkjoin_lock. However,
  // some threads may in fact still be alive here, although they are about to
  // be terminated; the entries with ds_thread==0 are the most suspicious. So
  // accessing __kmp_threads[] may not actually be safe.

  // TODO: does it make sense to check __kmp_roots[] ?

  // Check that no other threads registered with the OpenMP library are still
  // alive.
  while (1) {
    thread_count = 0;
    for (i = 0; i < __kmp_threads_capacity; ++i) {
      if (!__kmp_threads)
        continue;
      kmp_info_t *th = __kmp_threads[i];
      if (th == NULL)
        continue;
      int gtid = th->th.th_info.ds.ds_gtid;
      if (gtid == gtid_req)
        continue;
      if (gtid < 0)
        continue;
      DWORD exit_val;
      int alive = __kmp_is_thread_alive(th, &exit_val);
      if (alive) {
        ++thread_count;
      }
    }
    if (thread_count == 0)
      break; // success
  }

  // Assume that I'm alone. Now it might be safe to check and reset locks.
  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
  __kmp_reset_lock(&__kmp_forkjoin_lock);
#ifdef KMP_DEBUG
  __kmp_reset_lock(&__kmp_stdio_lock);
#endif // KMP_DEBUG
}

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    if (lpReserved != NULL) {
      // lpReserved is used for telling the difference:
      // lpReserved == NULL when FreeLibrary() was called,
      // lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive, so they
      // release the forkjoin lock by themselves. When the process terminates,
      // worker threads disappear, triggering the problem of an unreleased
      // forkjoin lock described below.

      // A worker thread may hold the forkjoin lock. The problem arises if that
      // worker is killed before it releases the lock: the forkjoin lock
      // remains taken, the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below tries to
      // take the forkjoin lock and always fails, and the application never
      // finishes [normally]. This scenario is possible if __kmpc_end() has not
      // been executed. It is not just a corner case; it happens in common
      // situations:
      // - the main function was compiled by an alternative compiler;
      // - the main function was compiled by icl but without /Qopenmp
      //   (application with plugins);
      // - the application terminates by calling C exit(), Fortran CALL EXIT()
      //   or Fortran STOP;
      // - an alive foreign thread prevented __kmpc_end from doing cleanup.
      //
      // This is a hack to work around the problem.
      // TODO: !!! figure out something better.
628 __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific()); 629 } 630 631 __kmp_internal_end_library(__kmp_gtid_get_specific()); 632 633 return TRUE; 634 635 case DLL_THREAD_ATTACH: 636 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 637 638 /* if we want to register new siblings all the time here call 639 * __kmp_get_gtid(); */ 640 return TRUE; 641 642 case DLL_THREAD_DETACH: 643 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 644 645 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 646 return TRUE; 647 } 648 649 return TRUE; 650 } 651 652 #endif /* KMP_OS_WINDOWS */ 653 #endif /* KMP_DYNAMIC_LIB */ 654 655 /* __kmp_parallel_deo -- Wait until it's our turn. */ 656 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 657 int gtid = *gtid_ref; 658 #ifdef BUILD_PARALLEL_ORDERED 659 kmp_team_t *team = __kmp_team_from_gtid(gtid); 660 #endif /* BUILD_PARALLEL_ORDERED */ 661 662 if (__kmp_env_consistency_check) { 663 if (__kmp_threads[gtid]->th.th_root->r.r_active) 664 #if KMP_USE_DYNAMIC_LOCK 665 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 666 #else 667 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 668 #endif 669 } 670 #ifdef BUILD_PARALLEL_ORDERED 671 if (!team->t.t_serialized) { 672 KMP_MB(); 673 KMP_WAIT(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), KMP_EQ, 674 NULL); 675 KMP_MB(); 676 } 677 #endif /* BUILD_PARALLEL_ORDERED */ 678 } 679 680 /* __kmp_parallel_dxo -- Signal the next task. */ 681 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 682 int gtid = *gtid_ref; 683 #ifdef BUILD_PARALLEL_ORDERED 684 int tid = __kmp_tid_from_gtid(gtid); 685 kmp_team_t *team = __kmp_team_from_gtid(gtid); 686 #endif /* BUILD_PARALLEL_ORDERED */ 687 688 if (__kmp_env_consistency_check) { 689 if (__kmp_threads[gtid]->th.th_root->r.r_active) 690 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 691 } 692 #ifdef BUILD_PARALLEL_ORDERED 693 if (!team->t.t_serialized) { 694 KMP_MB(); /* Flush all pending memory write invalidates. */ 695 696 /* use the tid of the next thread in this team */ 697 /* TODO replace with general release procedure */ 698 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 699 700 KMP_MB(); /* Flush all pending memory write invalidates. */ 701 } 702 #endif /* BUILD_PARALLEL_ORDERED */ 703 } 704 705 /* ------------------------------------------------------------------------ */ 706 /* The BARRIER for a SINGLE process section is always explicit */ 707 708 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 709 int status; 710 kmp_info_t *th; 711 kmp_team_t *team; 712 713 if (!TCR_4(__kmp_init_parallel)) 714 __kmp_parallel_initialize(); 715 __kmp_resume_if_soft_paused(); 716 717 th = __kmp_threads[gtid]; 718 team = th->th.th_team; 719 status = 0; 720 721 th->th.th_ident = id_ref; 722 723 if (team->t.t_serialized) { 724 status = 1; 725 } else { 726 kmp_int32 old_this = th->th.th_local.this_construct; 727 728 ++th->th.th_local.this_construct; 729 /* try to set team count to thread count--success means thread got the 730 single block */ 731 /* TODO: Should this be acquire or release? 
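       (The CAS below, __kmp_atomic_compare_store_acq, currently uses acquire
       semantics.) Whichever thread's compare-and-swap on team->t.t_construct
       succeeds claims the SINGLE block; the losing threads observe the already
       advanced value and skip it.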
*/ 732 if (team->t.t_construct == old_this) { 733 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 734 th->th.th_local.this_construct); 735 } 736 #if USE_ITT_BUILD 737 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 738 KMP_MASTER_GTID(gtid) && th->th.th_teams_microtask == NULL && 739 team->t.t_active_level == 740 1) { // Only report metadata by master of active team at level 1 741 __kmp_itt_metadata_single(id_ref); 742 } 743 #endif /* USE_ITT_BUILD */ 744 } 745 746 if (__kmp_env_consistency_check) { 747 if (status && push_ws) { 748 __kmp_push_workshare(gtid, ct_psingle, id_ref); 749 } else { 750 __kmp_check_workshare(gtid, ct_psingle, id_ref); 751 } 752 } 753 #if USE_ITT_BUILD 754 if (status) { 755 __kmp_itt_single_start(gtid); 756 } 757 #endif /* USE_ITT_BUILD */ 758 return status; 759 } 760 761 void __kmp_exit_single(int gtid) { 762 #if USE_ITT_BUILD 763 __kmp_itt_single_end(gtid); 764 #endif /* USE_ITT_BUILD */ 765 if (__kmp_env_consistency_check) 766 __kmp_pop_workshare(gtid, ct_psingle, NULL); 767 } 768 769 /* determine if we can go parallel or must use a serialized parallel region and 770 * how many threads we can use 771 * set_nproc is the number of threads requested for the team 772 * returns 0 if we should serialize or only use one thread, 773 * otherwise the number of threads to use 774 * The forkjoin lock is held by the caller. */ 775 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 776 int master_tid, int set_nthreads, 777 int enter_teams) { 778 int capacity; 779 int new_nthreads; 780 KMP_DEBUG_ASSERT(__kmp_init_serial); 781 KMP_DEBUG_ASSERT(root && parent_team); 782 kmp_info_t *this_thr = parent_team->t.t_threads[master_tid]; 783 784 // If dyn-var is set, dynamically adjust the number of desired threads, 785 // according to the method specified by dynamic_mode. 786 new_nthreads = set_nthreads; 787 if (!get__dynamic_2(parent_team, master_tid)) { 788 ; 789 } 790 #ifdef USE_LOAD_BALANCE 791 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 792 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 793 if (new_nthreads == 1) { 794 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 795 "reservation to 1 thread\n", 796 master_tid)); 797 return 1; 798 } 799 if (new_nthreads < set_nthreads) { 800 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 801 "reservation to %d threads\n", 802 master_tid, new_nthreads)); 803 } 804 } 805 #endif /* USE_LOAD_BALANCE */ 806 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 807 new_nthreads = __kmp_avail_proc - __kmp_nth + 808 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc); 809 if (new_nthreads <= 1) { 810 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 811 "reservation to 1 thread\n", 812 master_tid)); 813 return 1; 814 } 815 if (new_nthreads < set_nthreads) { 816 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 817 "reservation to %d threads\n", 818 master_tid, new_nthreads)); 819 } else { 820 new_nthreads = set_nthreads; 821 } 822 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 823 if (set_nthreads > 2) { 824 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 825 new_nthreads = (new_nthreads % set_nthreads) + 1; 826 if (new_nthreads == 1) { 827 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 828 "reservation to 1 thread\n", 829 master_tid)); 830 return 1; 831 } 832 if (new_nthreads < set_nthreads) { 833 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 834 "reservation to %d threads\n", 835 master_tid, new_nthreads)); 836 } 837 } 838 } else { 839 KMP_ASSERT(0); 840 } 841 842 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 843 if (__kmp_nth + new_nthreads - 844 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 845 __kmp_max_nth) { 846 int tl_nthreads = __kmp_max_nth - __kmp_nth + 847 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 848 if (tl_nthreads <= 0) { 849 tl_nthreads = 1; 850 } 851 852 // If dyn-var is false, emit a 1-time warning. 853 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 854 __kmp_reserve_warn = 1; 855 __kmp_msg(kmp_ms_warning, 856 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 857 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 858 } 859 if (tl_nthreads == 1) { 860 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 861 "reduced reservation to 1 thread\n", 862 master_tid)); 863 return 1; 864 } 865 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 866 "reservation to %d threads\n", 867 master_tid, tl_nthreads)); 868 new_nthreads = tl_nthreads; 869 } 870 871 // Respect OMP_THREAD_LIMIT 872 int cg_nthreads = this_thr->th.th_cg_roots->cg_nthreads; 873 int max_cg_threads = this_thr->th.th_cg_roots->cg_thread_limit; 874 if (cg_nthreads + new_nthreads - 875 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 876 max_cg_threads) { 877 int tl_nthreads = max_cg_threads - cg_nthreads + 878 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 879 if (tl_nthreads <= 0) { 880 tl_nthreads = 1; 881 } 882 883 // If dyn-var is false, emit a 1-time warning. 884 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 885 __kmp_reserve_warn = 1; 886 __kmp_msg(kmp_ms_warning, 887 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 888 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 889 } 890 if (tl_nthreads == 1) { 891 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 892 "reduced reservation to 1 thread\n", 893 master_tid)); 894 return 1; 895 } 896 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 897 "reservation to %d threads\n", 898 master_tid, tl_nthreads)); 899 new_nthreads = tl_nthreads; 900 } 901 902 // Check if the threads array is large enough, or needs expanding. 903 // See comment in __kmp_register_root() about the adjustment if 904 // __kmp_threads[0] == NULL. 905 capacity = __kmp_threads_capacity; 906 if (TCR_PTR(__kmp_threads[0]) == NULL) { 907 --capacity; 908 } 909 if (__kmp_nth + new_nthreads - 910 (root->r.r_active ? 
1 : root->r.r_hot_team->t.t_nproc) > 911 capacity) { 912 // Expand the threads array. 913 int slotsRequired = __kmp_nth + new_nthreads - 914 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 915 capacity; 916 int slotsAdded = __kmp_expand_threads(slotsRequired); 917 if (slotsAdded < slotsRequired) { 918 // The threads array was not expanded enough. 919 new_nthreads -= (slotsRequired - slotsAdded); 920 KMP_ASSERT(new_nthreads >= 1); 921 922 // If dyn-var is false, emit a 1-time warning. 923 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 924 __kmp_reserve_warn = 1; 925 if (__kmp_tp_cached) { 926 __kmp_msg(kmp_ms_warning, 927 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 928 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 929 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 930 } else { 931 __kmp_msg(kmp_ms_warning, 932 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 933 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 934 } 935 } 936 } 937 } 938 939 #ifdef KMP_DEBUG 940 if (new_nthreads == 1) { 941 KC_TRACE(10, 942 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 943 "dead roots and rechecking; requested %d threads\n", 944 __kmp_get_gtid(), set_nthreads)); 945 } else { 946 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 947 " %d threads\n", 948 __kmp_get_gtid(), new_nthreads, set_nthreads)); 949 } 950 #endif // KMP_DEBUG 951 return new_nthreads; 952 } 953 954 /* Allocate threads from the thread pool and assign them to the new team. We are 955 assured that there are enough threads available, because we checked on that 956 earlier within critical section forkjoin */ 957 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 958 kmp_info_t *master_th, int master_gtid) { 959 int i; 960 int use_hot_team; 961 962 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 963 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 964 KMP_MB(); 965 966 /* first, let's setup the master thread */ 967 master_th->th.th_info.ds.ds_tid = 0; 968 master_th->th.th_team = team; 969 master_th->th.th_team_nproc = team->t.t_nproc; 970 master_th->th.th_team_master = master_th; 971 master_th->th.th_team_serialized = FALSE; 972 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 973 974 /* make sure we are not the optimized hot team */ 975 #if KMP_NESTED_HOT_TEAMS 976 use_hot_team = 0; 977 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 978 if (hot_teams) { // hot teams array is not allocated if 979 // KMP_HOT_TEAMS_MAX_LEVEL=0 980 int level = team->t.t_active_level - 1; // index in array of hot teams 981 if (master_th->th.th_teams_microtask) { // are we inside the teams? 
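      // Inside a teams construct the active-level counter has not yet been
      // bumped for the team of masters (nor, before the nested parallel, for
      // the team of workers), so the hot-team index is adjusted below.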
982 if (master_th->th.th_teams_size.nteams > 1) { 983 ++level; // level was not increased in teams construct for 984 // team_of_masters 985 } 986 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 987 master_th->th.th_teams_level == team->t.t_level) { 988 ++level; // level was not increased in teams construct for 989 // team_of_workers before the parallel 990 } // team->t.t_level will be increased inside parallel 991 } 992 if (level < __kmp_hot_teams_max_level) { 993 if (hot_teams[level].hot_team) { 994 // hot team has already been allocated for given level 995 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 996 use_hot_team = 1; // the team is ready to use 997 } else { 998 use_hot_team = 0; // AC: threads are not allocated yet 999 hot_teams[level].hot_team = team; // remember new hot team 1000 hot_teams[level].hot_team_nth = team->t.t_nproc; 1001 } 1002 } else { 1003 use_hot_team = 0; 1004 } 1005 } 1006 #else 1007 use_hot_team = team == root->r.r_hot_team; 1008 #endif 1009 if (!use_hot_team) { 1010 1011 /* install the master thread */ 1012 team->t.t_threads[0] = master_th; 1013 __kmp_initialize_info(master_th, team, 0, master_gtid); 1014 1015 /* now, install the worker threads */ 1016 for (i = 1; i < team->t.t_nproc; i++) { 1017 1018 /* fork or reallocate a new thread and install it in team */ 1019 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 1020 team->t.t_threads[i] = thr; 1021 KMP_DEBUG_ASSERT(thr); 1022 KMP_DEBUG_ASSERT(thr->th.th_team == team); 1023 /* align team and thread arrived states */ 1024 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 1025 "T#%d(%d:%d) join =%llu, plain=%llu\n", 1026 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 1027 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 1028 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 1029 team->t.t_bar[bs_plain_barrier].b_arrived)); 1030 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 1031 thr->th.th_teams_level = master_th->th.th_teams_level; 1032 thr->th.th_teams_size = master_th->th.th_teams_size; 1033 { // Initialize threads' barrier data. 1034 int b; 1035 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 1036 for (b = 0; b < bs_last_barrier; ++b) { 1037 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 1038 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 1039 #if USE_DEBUGGER 1040 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1041 #endif 1042 } 1043 } 1044 } 1045 1046 #if KMP_AFFINITY_SUPPORTED 1047 __kmp_partition_places(team); 1048 #endif 1049 } 1050 1051 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1052 for (i = 0; i < team->t.t_nproc; i++) { 1053 kmp_info_t *thr = team->t.t_threads[i]; 1054 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1055 thr->th.th_prev_level != team->t.t_level) { 1056 team->t.t_display_affinity = 1; 1057 break; 1058 } 1059 } 1060 } 1061 1062 KMP_MB(); 1063 } 1064 1065 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1066 // Propagate any changes to the floating point control registers out to the team 1067 // We try to avoid unnecessary writes to the relevant cache line in the team 1068 // structure, so we don't make changes unless they are needed. 
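// A minimal sketch (not compiled) of the conditional-update idiom used below
// via KMP_CHECK_UPDATE: write to the shared team field only when the value
// actually differs, so an unchanged field never dirties the cache line.
#if 0
template <typename T> static inline void check_update_sketch(T &dst, T src) {
  if (dst != src) // read-only comparison in the common case
    dst = src; // single write, issued only when the value really changed
}
#endif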
1069 inline static void propagateFPControl(kmp_team_t *team) { 1070 if (__kmp_inherit_fp_control) { 1071 kmp_int16 x87_fpu_control_word; 1072 kmp_uint32 mxcsr; 1073 1074 // Get master values of FPU control flags (both X87 and vector) 1075 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1076 __kmp_store_mxcsr(&mxcsr); 1077 mxcsr &= KMP_X86_MXCSR_MASK; 1078 1079 // There is no point looking at t_fp_control_saved here. 1080 // If it is TRUE, we still have to update the values if they are different 1081 // from those we now have. If it is FALSE we didn't save anything yet, but 1082 // our objective is the same. We have to ensure that the values in the team 1083 // are the same as those we have. 1084 // So, this code achieves what we need whether or not t_fp_control_saved is 1085 // true. By checking whether the value needs updating we avoid unnecessary 1086 // writes that would put the cache-line into a written state, causing all 1087 // threads in the team to have to read it again. 1088 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1089 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1090 // Although we don't use this value, other code in the runtime wants to know 1091 // whether it should restore them. So we must ensure it is correct. 1092 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1093 } else { 1094 // Similarly here. Don't write to this cache-line in the team structure 1095 // unless we have to. 1096 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1097 } 1098 } 1099 1100 // Do the opposite, setting the hardware registers to the updated values from 1101 // the team. 1102 inline static void updateHWFPControl(kmp_team_t *team) { 1103 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1104 // Only reset the fp control regs if they have been changed in the team. 1105 // the parallel region that we are exiting. 1106 kmp_int16 x87_fpu_control_word; 1107 kmp_uint32 mxcsr; 1108 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1109 __kmp_store_mxcsr(&mxcsr); 1110 mxcsr &= KMP_X86_MXCSR_MASK; 1111 1112 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1113 __kmp_clear_x87_fpu_status_word(); 1114 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1115 } 1116 1117 if (team->t.t_mxcsr != mxcsr) { 1118 __kmp_load_mxcsr(&team->t.t_mxcsr); 1119 } 1120 } 1121 } 1122 #else 1123 #define propagateFPControl(x) ((void)0) 1124 #define updateHWFPControl(x) ((void)0) 1125 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1126 1127 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1128 int realloc); // forward declaration 1129 1130 /* Run a parallel region that has been serialized, so runs only in a team of the 1131 single master thread. 
*/ 1132 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1133 kmp_info_t *this_thr; 1134 kmp_team_t *serial_team; 1135 1136 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1137 1138 /* Skip all this code for autopar serialized loops since it results in 1139 unacceptable overhead */ 1140 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1141 return; 1142 1143 if (!TCR_4(__kmp_init_parallel)) 1144 __kmp_parallel_initialize(); 1145 __kmp_resume_if_soft_paused(); 1146 1147 this_thr = __kmp_threads[global_tid]; 1148 serial_team = this_thr->th.th_serial_team; 1149 1150 /* utilize the serialized team held by this thread */ 1151 KMP_DEBUG_ASSERT(serial_team); 1152 KMP_MB(); 1153 1154 if (__kmp_tasking_mode != tskm_immediate_exec) { 1155 KMP_DEBUG_ASSERT( 1156 this_thr->th.th_task_team == 1157 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1158 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1159 NULL); 1160 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1161 "team %p, new task_team = NULL\n", 1162 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1163 this_thr->th.th_task_team = NULL; 1164 } 1165 1166 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1167 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1168 proc_bind = proc_bind_false; 1169 } else if (proc_bind == proc_bind_default) { 1170 // No proc_bind clause was specified, so use the current value 1171 // of proc-bind-var for this parallel region. 1172 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1173 } 1174 // Reset for next parallel region 1175 this_thr->th.th_set_proc_bind = proc_bind_default; 1176 1177 #if OMPT_SUPPORT 1178 ompt_data_t ompt_parallel_data = ompt_data_none; 1179 ompt_data_t *implicit_task_data; 1180 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1181 if (ompt_enabled.enabled && 1182 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1183 1184 ompt_task_info_t *parent_task_info; 1185 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1186 1187 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1188 if (ompt_enabled.ompt_callback_parallel_begin) { 1189 int team_size = 1; 1190 1191 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1192 &(parent_task_info->task_data), &(parent_task_info->frame), 1193 &ompt_parallel_data, team_size, 1194 ompt_parallel_invoker_program | ompt_parallel_team, codeptr); 1195 } 1196 } 1197 #endif // OMPT_SUPPORT 1198 1199 if (this_thr->th.th_team != serial_team) { 1200 // Nested level will be an index in the nested nthreads array 1201 int level = this_thr->th.th_team->t.t_level; 1202 1203 if (serial_team->t.t_serialized) { 1204 /* this serial team was already used 1205 TODO increase performance by making this locks more specific */ 1206 kmp_team_t *new_team; 1207 1208 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1209 1210 new_team = 1211 __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1212 #if OMPT_SUPPORT 1213 ompt_parallel_data, 1214 #endif 1215 proc_bind, &this_thr->th.th_current_task->td_icvs, 1216 0 USE_NESTED_HOT_ARG(NULL)); 1217 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1218 KMP_ASSERT(new_team); 1219 1220 /* setup new serialized team and install it */ 1221 new_team->t.t_threads[0] = this_thr; 1222 new_team->t.t_parent = this_thr->th.th_team; 1223 serial_team = new_team; 1224 this_thr->th.th_serial_team = serial_team; 1225 1226 KF_TRACE( 1227 10, 1228 
("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1229 global_tid, serial_team)); 1230 1231 /* TODO the above breaks the requirement that if we run out of resources, 1232 then we can still guarantee that serialized teams are ok, since we may 1233 need to allocate a new one */ 1234 } else { 1235 KF_TRACE( 1236 10, 1237 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1238 global_tid, serial_team)); 1239 } 1240 1241 /* we have to initialize this serial team */ 1242 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1243 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1244 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1245 serial_team->t.t_ident = loc; 1246 serial_team->t.t_serialized = 1; 1247 serial_team->t.t_nproc = 1; 1248 serial_team->t.t_parent = this_thr->th.th_team; 1249 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1250 this_thr->th.th_team = serial_team; 1251 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1252 1253 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1254 this_thr->th.th_current_task)); 1255 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1256 this_thr->th.th_current_task->td_flags.executing = 0; 1257 1258 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1259 1260 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1261 implicit task for each serialized task represented by 1262 team->t.t_serialized? */ 1263 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1264 &this_thr->th.th_current_task->td_parent->td_icvs); 1265 1266 // Thread value exists in the nested nthreads array for the next nested 1267 // level 1268 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1269 this_thr->th.th_current_task->td_icvs.nproc = 1270 __kmp_nested_nth.nth[level + 1]; 1271 } 1272 1273 if (__kmp_nested_proc_bind.used && 1274 (level + 1 < __kmp_nested_proc_bind.used)) { 1275 this_thr->th.th_current_task->td_icvs.proc_bind = 1276 __kmp_nested_proc_bind.bind_types[level + 1]; 1277 } 1278 1279 #if USE_DEBUGGER 1280 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 
1281 #endif 1282 this_thr->th.th_info.ds.ds_tid = 0; 1283 1284 /* set thread cache values */ 1285 this_thr->th.th_team_nproc = 1; 1286 this_thr->th.th_team_master = this_thr; 1287 this_thr->th.th_team_serialized = 1; 1288 1289 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1290 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1291 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1292 1293 propagateFPControl(serial_team); 1294 1295 /* check if we need to allocate dispatch buffers stack */ 1296 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1297 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1298 serial_team->t.t_dispatch->th_disp_buffer = 1299 (dispatch_private_info_t *)__kmp_allocate( 1300 sizeof(dispatch_private_info_t)); 1301 } 1302 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1303 1304 KMP_MB(); 1305 1306 } else { 1307 /* this serialized team is already being used, 1308 * that's fine, just add another nested level */ 1309 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1310 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1311 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1312 ++serial_team->t.t_serialized; 1313 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1314 1315 // Nested level will be an index in the nested nthreads array 1316 int level = this_thr->th.th_team->t.t_level; 1317 // Thread value exists in the nested nthreads array for the next nested 1318 // level 1319 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1320 this_thr->th.th_current_task->td_icvs.nproc = 1321 __kmp_nested_nth.nth[level + 1]; 1322 } 1323 serial_team->t.t_level++; 1324 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1325 "of serial team %p to %d\n", 1326 global_tid, serial_team, serial_team->t.t_level)); 1327 1328 /* allocate/push dispatch buffers stack */ 1329 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1330 { 1331 dispatch_private_info_t *disp_buffer = 1332 (dispatch_private_info_t *)__kmp_allocate( 1333 sizeof(dispatch_private_info_t)); 1334 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1335 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1336 } 1337 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1338 1339 KMP_MB(); 1340 } 1341 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1342 1343 // Perform the display affinity functionality for 1344 // serialized parallel regions 1345 if (__kmp_display_affinity) { 1346 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1347 this_thr->th.th_prev_num_threads != 1) { 1348 // NULL means use the affinity-format-var ICV 1349 __kmp_aux_display_affinity(global_tid, NULL); 1350 this_thr->th.th_prev_level = serial_team->t.t_level; 1351 this_thr->th.th_prev_num_threads = 1; 1352 } 1353 } 1354 1355 if (__kmp_env_consistency_check) 1356 __kmp_push_parallel(global_tid, NULL); 1357 #if OMPT_SUPPORT 1358 serial_team->t.ompt_team_info.master_return_address = codeptr; 1359 if (ompt_enabled.enabled && 1360 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1361 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1362 1363 ompt_lw_taskteam_t lw_taskteam; 1364 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1365 &ompt_parallel_data, codeptr); 1366 1367 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1368 // don't use lw_taskteam after linking. 
content was swapped.

    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid),
          ompt_task_implicit); // TODO: Can this be ompt_task_initial?
      OMPT_CUR_TASK_INFO(this_thr)
          ->thread_num = __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = ompt_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr =
        OMPT_GET_FRAME_ADDRESS(0);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
                    kmp_va_list ap) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
  int active_level;
  int teams_level;
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing.
*/ 1418 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1419 /* These 2 lines below are so this does not get optimized out */ 1420 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1421 __kmp_stkpadding += (short)((kmp_int64)dummy); 1422 } 1423 1424 /* initialize if needed */ 1425 KMP_DEBUG_ASSERT( 1426 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1427 if (!TCR_4(__kmp_init_parallel)) 1428 __kmp_parallel_initialize(); 1429 __kmp_resume_if_soft_paused(); 1430 1431 /* setup current data */ 1432 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1433 // shutdown 1434 parent_team = master_th->th.th_team; 1435 master_tid = master_th->th.th_info.ds.ds_tid; 1436 master_this_cons = master_th->th.th_local.this_construct; 1437 root = master_th->th.th_root; 1438 master_active = root->r.r_active; 1439 master_set_numthreads = master_th->th.th_set_nproc; 1440 1441 #if OMPT_SUPPORT 1442 ompt_data_t ompt_parallel_data = ompt_data_none; 1443 ompt_data_t *parent_task_data; 1444 ompt_frame_t *ompt_frame; 1445 ompt_data_t *implicit_task_data; 1446 void *return_address = NULL; 1447 1448 if (ompt_enabled.enabled) { 1449 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1450 NULL, NULL); 1451 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1452 } 1453 #endif 1454 1455 // Nested level will be an index in the nested nthreads array 1456 level = parent_team->t.t_level; 1457 // used to launch non-serial teams even if nested is not allowed 1458 active_level = parent_team->t.t_active_level; 1459 // needed to check nesting inside the teams 1460 teams_level = master_th->th.th_teams_level; 1461 #if KMP_NESTED_HOT_TEAMS 1462 p_hot_teams = &master_th->th.th_hot_teams; 1463 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1464 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1465 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1466 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1467 // it is either actual or not needed (when active_level > 0) 1468 (*p_hot_teams)[0].hot_team_nth = 1; 1469 } 1470 #endif 1471 1472 #if OMPT_SUPPORT 1473 if (ompt_enabled.enabled) { 1474 if (ompt_enabled.ompt_callback_parallel_begin) { 1475 int team_size = master_set_numthreads 1476 ? master_set_numthreads 1477 : get__nproc_2(parent_team, master_tid); 1478 int flags = OMPT_INVOKER(call_context) | 1479 ((microtask == (microtask_t)__kmp_teams_master) 1480 ? ompt_parallel_league 1481 : ompt_parallel_team); 1482 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1483 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, flags, 1484 return_address); 1485 } 1486 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1487 } 1488 #endif 1489 1490 master_th->th.th_ident = loc; 1491 1492 if (master_th->th.th_teams_microtask && ap && 1493 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1494 // AC: This is start of parallel that is nested inside teams construct. 1495 // The team is actual (hot), all workers are ready at the fork barrier. 1496 // No lock needed to initialize the team a bit, then free workers. 
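      // Reuse the existing (hot) parent team directly: stash the outlined
      // body's arguments into parent_team->t.t_argv via the va_arg loop below,
      // bump the nesting counters, and then release the waiting workers.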
1497 parent_team->t.t_ident = loc; 1498 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1499 parent_team->t.t_argc = argc; 1500 argv = (void **)parent_team->t.t_argv; 1501 for (i = argc - 1; i >= 0; --i) 1502 *argv++ = va_arg(kmp_va_deref(ap), void *); 1503 // Increment our nested depth levels, but not increase the serialization 1504 if (parent_team == master_th->th.th_serial_team) { 1505 // AC: we are in serialized parallel 1506 __kmpc_serialized_parallel(loc, gtid); 1507 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1508 1509 #if OMPT_SUPPORT 1510 void *dummy; 1511 void **exit_frame_p; 1512 1513 ompt_lw_taskteam_t lw_taskteam; 1514 1515 if (ompt_enabled.enabled) { 1516 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1517 &ompt_parallel_data, return_address); 1518 exit_frame_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1519 1520 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1521 // don't use lw_taskteam after linking. content was swaped 1522 1523 /* OMPT implicit task begin */ 1524 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1525 if (ompt_enabled.ompt_callback_implicit_task) { 1526 OMPT_CUR_TASK_INFO(master_th) 1527 ->thread_num = __kmp_tid_from_gtid(gtid); 1528 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1529 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1530 implicit_task_data, 1, 1531 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1532 } 1533 1534 /* OMPT state */ 1535 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1536 } else { 1537 exit_frame_p = &dummy; 1538 } 1539 #endif 1540 // AC: need to decrement t_serialized for enquiry functions to work 1541 // correctly, will restore at join time 1542 parent_team->t.t_serialized--; 1543 1544 { 1545 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1546 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1547 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1548 #if OMPT_SUPPORT 1549 , 1550 exit_frame_p 1551 #endif 1552 ); 1553 } 1554 1555 #if OMPT_SUPPORT 1556 if (ompt_enabled.enabled) { 1557 *exit_frame_p = NULL; 1558 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1559 if (ompt_enabled.ompt_callback_implicit_task) { 1560 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1561 ompt_scope_end, NULL, implicit_task_data, 1, 1562 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 1563 } 1564 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1565 __ompt_lw_taskteam_unlink(master_th); 1566 if (ompt_enabled.ompt_callback_parallel_end) { 1567 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1568 &ompt_parallel_data, OMPT_CUR_TASK_DATA(master_th), 1569 OMPT_INVOKER(call_context) | ompt_parallel_team, 1570 return_address); 1571 } 1572 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1573 } 1574 #endif 1575 return TRUE; 1576 } 1577 1578 parent_team->t.t_pkfn = microtask; 1579 parent_team->t.t_invoke = invoker; 1580 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1581 parent_team->t.t_active_level++; 1582 parent_team->t.t_level++; 1583 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1584 1585 #if OMPT_SUPPORT 1586 if (ompt_enabled.enabled) { 1587 ompt_lw_taskteam_t lw_taskteam; 1588 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1589 &ompt_parallel_data, return_address); 1590 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 1, true); 1591 } 1592 #endif 1593 1594 /* Change number of threads in the team if requested */ 1595 if (master_set_numthreads) { // The parallel has 
num_threads clause 1596 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1597 // AC: only can reduce number of threads dynamically, can't increase 1598 kmp_info_t **other_threads = parent_team->t.t_threads; 1599 parent_team->t.t_nproc = master_set_numthreads; 1600 for (i = 0; i < master_set_numthreads; ++i) { 1601 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1602 } 1603 // Keep extra threads hot in the team for possible next parallels 1604 } 1605 master_th->th.th_set_nproc = 0; 1606 } 1607 1608 #if USE_DEBUGGER 1609 if (__kmp_debugging) { // Let debugger override number of threads. 1610 int nth = __kmp_omp_num_threads(loc); 1611 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1612 master_set_numthreads = nth; 1613 } 1614 } 1615 #endif 1616 1617 #if USE_ITT_BUILD 1618 if (((__itt_frame_submit_v3_ptr && __itt_get_timestamp_ptr) || 1619 KMP_ITT_DEBUG) && 1620 __kmp_forkjoin_frames_mode == 3 && 1621 parent_team->t.t_active_level == 1 // only report frames at level 1 1622 && master_th->th.th_teams_size.nteams == 1) { 1623 kmp_uint64 tmp_time = __itt_get_timestamp(); 1624 master_th->th.th_frame_time = tmp_time; 1625 parent_team->t.t_region_time = tmp_time; 1626 } 1627 if (__itt_stack_caller_create_ptr) { 1628 // create new stack stitching id before entering fork barrier 1629 parent_team->t.t_stack_id = __kmp_itt_stack_caller_create(); 1630 } 1631 #endif /* USE_ITT_BUILD */ 1632 1633 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1634 "master_th=%p, gtid=%d\n", 1635 root, parent_team, master_th, gtid)); 1636 __kmp_internal_fork(loc, gtid, parent_team); 1637 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1638 "master_th=%p, gtid=%d\n", 1639 root, parent_team, master_th, gtid)); 1640 1641 /* Invoke microtask for MASTER thread */ 1642 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1643 parent_team->t.t_id, parent_team->t.t_pkfn)); 1644 1645 if (!parent_team->t.t_invoke(gtid)) { 1646 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 1647 } 1648 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1649 parent_team->t.t_id, parent_team->t.t_pkfn)); 1650 KMP_MB(); /* Flush all pending memory write invalidates. */ 1651 1652 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1653 1654 return TRUE; 1655 } // Parallel closely nested in teams construct 1656 1657 #if KMP_DEBUG 1658 if (__kmp_tasking_mode != tskm_immediate_exec) { 1659 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1660 parent_team->t.t_task_team[master_th->th.th_task_state]); 1661 } 1662 #endif 1663 1664 if (parent_team->t.t_active_level >= 1665 master_th->th.th_current_task->td_icvs.max_active_levels) { 1666 nthreads = 1; 1667 } else { 1668 int enter_teams = ((ap == NULL && active_level == 0) || 1669 (ap && teams_level > 0 && teams_level == level)); 1670 nthreads = 1671 master_set_numthreads 1672 ? master_set_numthreads 1673 : get__nproc_2( 1674 parent_team, 1675 master_tid); // TODO: get nproc directly from current task 1676 1677 // Check if we need to take forkjoin lock? (no need for serialized 1678 // parallel out of teams construct). This code moved here from 1679 // __kmp_reserve_threads() to speedup nested serialized parallels. 
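    /* Illustrative note (user-level view, not runtime code): at this point
       nthreads is either the value supplied by a num_threads clause
       (master_set_numthreads) or the nproc ICV obtained via get__nproc_2().
       For example, with OMP_NUM_THREADS=8:

         #pragma omp parallel                 // arrives here with nthreads == 8
         { ... }
         #pragma omp parallel num_threads(4)  // arrives here with nthreads == 4
         { ... }

       The checks that follow may still drop nthreads to 1 (serialized
       execution), e.g. for a nested region when max-active-levels is 1,
       when KMP_LIBRARY=serial was selected, or when __kmp_reserve_threads()
       cannot reserve more than one thread. */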
1680 if (nthreads > 1) { 1681 if ((get__max_active_levels(master_th) == 1 && 1682 (root->r.r_in_parallel && !enter_teams)) || 1683 (__kmp_library == library_serial)) { 1684 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1685 " threads\n", 1686 gtid, nthreads)); 1687 nthreads = 1; 1688 } 1689 } 1690 if (nthreads > 1) { 1691 /* determine how many new threads we can use */ 1692 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1693 /* AC: If we execute teams from parallel region (on host), then teams 1694 should be created but each can only have 1 thread if nesting is 1695 disabled. If teams called from serial region, then teams and their 1696 threads should be created regardless of the nesting setting. */ 1697 nthreads = __kmp_reserve_threads(root, parent_team, master_tid, 1698 nthreads, enter_teams); 1699 if (nthreads == 1) { 1700 // Free lock for single thread execution here; for multi-thread 1701 // execution it will be freed later after team of threads created 1702 // and initialized 1703 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1704 } 1705 } 1706 } 1707 KMP_DEBUG_ASSERT(nthreads > 0); 1708 1709 // If we temporarily changed the set number of threads then restore it now 1710 master_th->th.th_set_nproc = 0; 1711 1712 /* create a serialized parallel region? */ 1713 if (nthreads == 1) { 1714 /* josh todo: hypothetical question: what do we do for OS X*? */ 1715 #if KMP_OS_LINUX && \ 1716 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1717 void *args[argc]; 1718 #else 1719 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1720 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1721 KMP_ARCH_AARCH64) */ 1722 1723 KA_TRACE(20, 1724 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1725 1726 __kmpc_serialized_parallel(loc, gtid); 1727 1728 if (call_context == fork_context_intel) { 1729 /* TODO this sucks, use the compiler itself to pass args! :) */ 1730 master_th->th.th_serial_team->t.t_ident = loc; 1731 if (!ap) { 1732 // revert change made in __kmpc_serialized_parallel() 1733 master_th->th.th_serial_team->t.t_level--; 1734 // Get args from parent team for teams construct 1735 1736 #if OMPT_SUPPORT 1737 void *dummy; 1738 void **exit_frame_p; 1739 ompt_task_info_t *task_info; 1740 1741 ompt_lw_taskteam_t lw_taskteam; 1742 1743 if (ompt_enabled.enabled) { 1744 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1745 &ompt_parallel_data, return_address); 1746 1747 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1748 // don't use lw_taskteam after linking. 
content was swaped 1749 1750 task_info = OMPT_CUR_TASK_INFO(master_th); 1751 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1752 if (ompt_enabled.ompt_callback_implicit_task) { 1753 OMPT_CUR_TASK_INFO(master_th) 1754 ->thread_num = __kmp_tid_from_gtid(gtid); 1755 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1756 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1757 &(task_info->task_data), 1, 1758 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1759 ompt_task_implicit); 1760 } 1761 1762 /* OMPT state */ 1763 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1764 } else { 1765 exit_frame_p = &dummy; 1766 } 1767 #endif 1768 1769 { 1770 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1771 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1772 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1773 parent_team->t.t_argv 1774 #if OMPT_SUPPORT 1775 , 1776 exit_frame_p 1777 #endif 1778 ); 1779 } 1780 1781 #if OMPT_SUPPORT 1782 if (ompt_enabled.enabled) { 1783 *exit_frame_p = NULL; 1784 if (ompt_enabled.ompt_callback_implicit_task) { 1785 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1786 ompt_scope_end, NULL, &(task_info->task_data), 1, 1787 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1788 ompt_task_implicit); 1789 } 1790 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1791 __ompt_lw_taskteam_unlink(master_th); 1792 if (ompt_enabled.ompt_callback_parallel_end) { 1793 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1794 &ompt_parallel_data, parent_task_data, 1795 OMPT_INVOKER(call_context) | ompt_parallel_team, 1796 return_address); 1797 } 1798 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1799 } 1800 #endif 1801 } else if (microtask == (microtask_t)__kmp_teams_master) { 1802 KMP_DEBUG_ASSERT(master_th->th.th_team == 1803 master_th->th.th_serial_team); 1804 team = master_th->th.th_team; 1805 // team->t.t_pkfn = microtask; 1806 team->t.t_invoke = invoker; 1807 __kmp_alloc_argv_entries(argc, team, TRUE); 1808 team->t.t_argc = argc; 1809 argv = (void **)team->t.t_argv; 1810 if (ap) { 1811 for (i = argc - 1; i >= 0; --i) 1812 *argv++ = va_arg(kmp_va_deref(ap), void *); 1813 } else { 1814 for (i = 0; i < argc; ++i) 1815 // Get args from parent team for teams construct 1816 argv[i] = parent_team->t.t_argv[i]; 1817 } 1818 // AC: revert change made in __kmpc_serialized_parallel() 1819 // because initial code in teams should have level=0 1820 team->t.t_level--; 1821 // AC: call special invoker for outer "parallel" of teams construct 1822 invoker(gtid); 1823 #if OMPT_SUPPORT 1824 if (ompt_enabled.enabled) { 1825 ompt_task_info_t *task_info = OMPT_CUR_TASK_INFO(master_th); 1826 if (ompt_enabled.ompt_callback_implicit_task) { 1827 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1828 ompt_scope_end, NULL, &(task_info->task_data), 0, 1829 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_initial); 1830 } 1831 if (ompt_enabled.ompt_callback_parallel_end) { 1832 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1833 &ompt_parallel_data, parent_task_data, 1834 OMPT_INVOKER(call_context) | ompt_parallel_league, 1835 return_address); 1836 } 1837 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1838 } 1839 #endif 1840 } else { 1841 argv = args; 1842 for (i = argc - 1; i >= 0; --i) 1843 *argv++ = va_arg(kmp_va_deref(ap), void *); 1844 KMP_MB(); 1845 1846 #if OMPT_SUPPORT 1847 void *dummy; 1848 void **exit_frame_p; 1849 ompt_task_info_t *task_info; 1850 1851 ompt_lw_taskteam_t lw_taskteam; 1852 1853 if (ompt_enabled.enabled) { 1854 
__ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1855 &ompt_parallel_data, return_address); 1856 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1857 // don't use lw_taskteam after linking. content was swaped 1858 task_info = OMPT_CUR_TASK_INFO(master_th); 1859 exit_frame_p = &(task_info->frame.exit_frame.ptr); 1860 1861 /* OMPT implicit task begin */ 1862 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1863 if (ompt_enabled.ompt_callback_implicit_task) { 1864 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1865 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1866 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), 1867 ompt_task_implicit); 1868 OMPT_CUR_TASK_INFO(master_th) 1869 ->thread_num = __kmp_tid_from_gtid(gtid); 1870 } 1871 1872 /* OMPT state */ 1873 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1874 } else { 1875 exit_frame_p = &dummy; 1876 } 1877 #endif 1878 1879 { 1880 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1881 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1882 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1883 #if OMPT_SUPPORT 1884 , 1885 exit_frame_p 1886 #endif 1887 ); 1888 } 1889 1890 #if OMPT_SUPPORT 1891 if (ompt_enabled.enabled) { 1892 *exit_frame_p = NULL; 1893 if (ompt_enabled.ompt_callback_implicit_task) { 1894 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1895 ompt_scope_end, NULL, &(task_info->task_data), 1, 1896 OMPT_CUR_TASK_INFO(master_th)->thread_num, 1897 ompt_task_implicit); 1898 } 1899 1900 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1901 __ompt_lw_taskteam_unlink(master_th); 1902 if (ompt_enabled.ompt_callback_parallel_end) { 1903 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1904 &ompt_parallel_data, parent_task_data, 1905 OMPT_INVOKER(call_context) | ompt_parallel_team, 1906 return_address); 1907 } 1908 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1909 } 1910 #endif 1911 } 1912 } else if (call_context == fork_context_gnu) { 1913 #if OMPT_SUPPORT 1914 ompt_lw_taskteam_t lwt; 1915 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1916 return_address); 1917 1918 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1919 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1920 // don't use lw_taskteam after linking. 
content was swaped 1921 #endif 1922 1923 // we were called from GNU native code 1924 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1925 return FALSE; 1926 } else { 1927 KMP_ASSERT2(call_context < fork_context_last, 1928 "__kmp_fork_call: unknown fork_context parameter"); 1929 } 1930 1931 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1932 KMP_MB(); 1933 return FALSE; 1934 } // if (nthreads == 1) 1935 1936 // GEH: only modify the executing flag in the case when not serialized 1937 // serialized case is handled in kmpc_serialized_parallel 1938 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1939 "curtask=%p, curtask_max_aclevel=%d\n", 1940 parent_team->t.t_active_level, master_th, 1941 master_th->th.th_current_task, 1942 master_th->th.th_current_task->td_icvs.max_active_levels)); 1943 // TODO: GEH - cannot do this assertion because root thread not set up as 1944 // executing 1945 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1946 master_th->th.th_current_task->td_flags.executing = 0; 1947 1948 if (!master_th->th.th_teams_microtask || level > teams_level) { 1949 /* Increment our nested depth level */ 1950 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1951 } 1952 1953 // See if we need to make a copy of the ICVs. 1954 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1955 if ((level + 1 < __kmp_nested_nth.used) && 1956 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1957 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1958 } else { 1959 nthreads_icv = 0; // don't update 1960 } 1961 1962 // Figure out the proc_bind_policy for the new team. 1963 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1964 kmp_proc_bind_t proc_bind_icv = 1965 proc_bind_default; // proc_bind_default means don't update 1966 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1967 proc_bind = proc_bind_false; 1968 } else { 1969 if (proc_bind == proc_bind_default) { 1970 // No proc_bind clause specified; use current proc-bind-var for this 1971 // parallel region 1972 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1973 } 1974 /* else: The proc_bind policy was specified explicitly on parallel clause. 1975 This overrides proc-bind-var for this parallel region, but does not 1976 change proc-bind-var. */ 1977 // Figure the value of proc-bind-var for the child threads. 
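    // Illustrative example (a hedged reading of how an OMP_PROC_BIND list is
    // assumed to populate __kmp_nested_proc_bind): with
    //
    //   OMP_PROC_BIND=spread,close
    //
    // bind_types[0] == proc_bind_spread governs this region, and
    // bind_types[level + 1] == proc_bind_close is what the check below copies
    // into proc_bind_icv, so the child threads inherit "close" as their
    // proc-bind-var for the next nesting level.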
1978 if ((level + 1 < __kmp_nested_proc_bind.used) && 1979 (__kmp_nested_proc_bind.bind_types[level + 1] != 1980 master_th->th.th_current_task->td_icvs.proc_bind)) { 1981 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1982 } 1983 } 1984 1985 // Reset for next parallel region 1986 master_th->th.th_set_proc_bind = proc_bind_default; 1987 1988 if ((nthreads_icv > 0) || (proc_bind_icv != proc_bind_default)) { 1989 kmp_internal_control_t new_icvs; 1990 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 1991 new_icvs.next = NULL; 1992 if (nthreads_icv > 0) { 1993 new_icvs.nproc = nthreads_icv; 1994 } 1995 if (proc_bind_icv != proc_bind_default) { 1996 new_icvs.proc_bind = proc_bind_icv; 1997 } 1998 1999 /* allocate a new parallel team */ 2000 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2001 team = __kmp_allocate_team(root, nthreads, nthreads, 2002 #if OMPT_SUPPORT 2003 ompt_parallel_data, 2004 #endif 2005 proc_bind, &new_icvs, 2006 argc USE_NESTED_HOT_ARG(master_th)); 2007 } else { 2008 /* allocate a new parallel team */ 2009 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2010 team = __kmp_allocate_team(root, nthreads, nthreads, 2011 #if OMPT_SUPPORT 2012 ompt_parallel_data, 2013 #endif 2014 proc_bind, 2015 &master_th->th.th_current_task->td_icvs, 2016 argc USE_NESTED_HOT_ARG(master_th)); 2017 } 2018 KF_TRACE( 2019 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2020 2021 /* setup the new team */ 2022 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2023 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2024 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2025 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2026 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2027 #if OMPT_SUPPORT 2028 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2029 return_address); 2030 #endif 2031 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2032 // TODO: parent_team->t.t_level == INT_MAX ??? 2033 if (!master_th->th.th_teams_microtask || level > teams_level) { 2034 int new_level = parent_team->t.t_level + 1; 2035 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2036 new_level = parent_team->t.t_active_level + 1; 2037 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2038 } else { 2039 // AC: Do not increase parallel level at start of the teams construct 2040 int new_level = parent_team->t.t_level; 2041 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2042 new_level = parent_team->t.t_active_level; 2043 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2044 } 2045 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2046 // set master's schedule as new run-time schedule 2047 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2048 2049 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2050 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2051 2052 // Update the floating point rounding in the team if required. 2053 propagateFPControl(team); 2054 2055 if (__kmp_tasking_mode != tskm_immediate_exec) { 2056 // Set master's task team to team's task team. Unless this is hot team, it 2057 // should be NULL. 
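    // The task-state bookkeeping below pushes the master's th_task_state onto
    // th_task_state_memo_stack, doubling the stack when it is full. A minimal
    // standalone sketch of the same push-with-growth pattern (hypothetical
    // names, not runtime code):
    //
    //   void push_state(kmp_uint8 **stack, kmp_uint32 *size, kmp_uint32 *top,
    //                   kmp_uint8 state) {
    //     if (*top >= *size) {
    //       kmp_uint32 new_size = 2 * (*size);
    //       kmp_uint8 *new_stack = (kmp_uint8 *)__kmp_allocate(new_size);
    //       for (kmp_uint32 i = 0; i < *size; ++i)
    //         new_stack[i] = (*stack)[i];          // keep the saved states
    //       for (kmp_uint32 i = *size; i < new_size; ++i)
    //         new_stack[i] = 0;                    // zero-init the rest
    //       __kmp_free(*stack);
    //       *stack = new_stack;
    //       *size = new_size;
    //     }
    //     (*stack)[(*top)++] = state;              // push, then advance top
    //   }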
2058 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2059 parent_team->t.t_task_team[master_th->th.th_task_state]); 2060 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2061 "%p, new task_team %p / team %p\n", 2062 __kmp_gtid_from_thread(master_th), 2063 master_th->th.th_task_team, parent_team, 2064 team->t.t_task_team[master_th->th.th_task_state], team)); 2065 2066 if (active_level || master_th->th.th_task_team) { 2067 // Take a memo of master's task_state 2068 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2069 if (master_th->th.th_task_state_top >= 2070 master_th->th.th_task_state_stack_sz) { // increase size 2071 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2072 kmp_uint8 *old_stack, *new_stack; 2073 kmp_uint32 i; 2074 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2075 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2076 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2077 } 2078 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2079 ++i) { // zero-init rest of stack 2080 new_stack[i] = 0; 2081 } 2082 old_stack = master_th->th.th_task_state_memo_stack; 2083 master_th->th.th_task_state_memo_stack = new_stack; 2084 master_th->th.th_task_state_stack_sz = new_size; 2085 __kmp_free(old_stack); 2086 } 2087 // Store master's task_state on stack 2088 master_th->th 2089 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2090 master_th->th.th_task_state; 2091 master_th->th.th_task_state_top++; 2092 #if KMP_NESTED_HOT_TEAMS 2093 if (master_th->th.th_hot_teams && 2094 active_level < __kmp_hot_teams_max_level && 2095 team == master_th->th.th_hot_teams[active_level].hot_team) { 2096 // Restore master's nested state if nested hot team 2097 master_th->th.th_task_state = 2098 master_th->th 2099 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2100 } else { 2101 #endif 2102 master_th->th.th_task_state = 0; 2103 #if KMP_NESTED_HOT_TEAMS 2104 } 2105 #endif 2106 } 2107 #if !KMP_NESTED_HOT_TEAMS 2108 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2109 (team == root->r.r_hot_team)); 2110 #endif 2111 } 2112 2113 KA_TRACE( 2114 20, 2115 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2116 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2117 team->t.t_nproc)); 2118 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2119 (team->t.t_master_tid == 0 && 2120 (team->t.t_parent == root->r.r_root_team || 2121 team->t.t_parent->t.t_serialized))); 2122 KMP_MB(); 2123 2124 /* now, setup the arguments */ 2125 argv = (void **)team->t.t_argv; 2126 if (ap) { 2127 for (i = argc - 1; i >= 0; --i) { 2128 void *new_argv = va_arg(kmp_va_deref(ap), void *); 2129 KMP_CHECK_UPDATE(*argv, new_argv); 2130 argv++; 2131 } 2132 } else { 2133 for (i = 0; i < argc; ++i) { 2134 // Get args from parent team for teams construct 2135 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2136 } 2137 } 2138 2139 /* now actually fork the threads */ 2140 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2141 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2142 root->r.r_active = TRUE; 2143 2144 __kmp_fork_team_threads(root, team, master_th, gtid); 2145 __kmp_setup_icv_copy(team, nthreads, 2146 &master_th->th.th_current_task->td_icvs, loc); 2147 2148 #if OMPT_SUPPORT 2149 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2150 #endif 2151 2152 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2153 2154 #if USE_ITT_BUILD 2155 if 
(team->t.t_active_level == 1 // only report frames at level 1 2156 && !master_th->th.th_teams_microtask) { // not in teams construct 2157 #if USE_ITT_NOTIFY 2158 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2159 (__kmp_forkjoin_frames_mode == 3 || 2160 __kmp_forkjoin_frames_mode == 1)) { 2161 kmp_uint64 tmp_time = 0; 2162 if (__itt_get_timestamp_ptr) 2163 tmp_time = __itt_get_timestamp(); 2164 // Internal fork - report frame begin 2165 master_th->th.th_frame_time = tmp_time; 2166 if (__kmp_forkjoin_frames_mode == 3) 2167 team->t.t_region_time = tmp_time; 2168 } else 2169 // only one notification scheme (either "submit" or "forking/joined", not both) 2170 #endif /* USE_ITT_NOTIFY */ 2171 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2172 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2173 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 2174 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2175 } 2176 } 2177 #endif /* USE_ITT_BUILD */ 2178 2179 /* now go on and do the work */ 2180 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2181 KMP_MB(); 2182 KF_TRACE(10, 2183 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2184 root, team, master_th, gtid)); 2185 2186 #if USE_ITT_BUILD 2187 if (__itt_stack_caller_create_ptr) { 2188 team->t.t_stack_id = 2189 __kmp_itt_stack_caller_create(); // create new stack stitching id 2190 // before entering fork barrier 2191 } 2192 #endif /* USE_ITT_BUILD */ 2193 2194 // AC: skip __kmp_internal_fork at teams construct, let only master 2195 // threads execute 2196 if (ap) { 2197 __kmp_internal_fork(loc, gtid, team); 2198 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2199 "master_th=%p, gtid=%d\n", 2200 root, team, master_th, gtid)); 2201 } 2202 2203 if (call_context == fork_context_gnu) { 2204 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2205 return TRUE; 2206 } 2207 2208 /* Invoke microtask for MASTER thread */ 2209 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2210 team->t.t_id, team->t.t_pkfn)); 2211 } // END of timer KMP_fork_call block 2212 2213 #if KMP_STATS_ENABLED 2214 // If beginning a teams construct, then change thread state 2215 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 2216 if (!ap) { 2217 KMP_SET_THREAD_STATE(stats_state_e::TEAMS_REGION); 2218 } 2219 #endif 2220 2221 if (!team->t.t_invoke(gtid)) { 2222 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2223 } 2224 2225 #if KMP_STATS_ENABLED 2226 // If was beginning of a teams construct, then reset thread state 2227 if (!ap) { 2228 KMP_SET_THREAD_STATE(previous_state); 2229 } 2230 #endif 2231 2232 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2233 team->t.t_id, team->t.t_pkfn)); 2234 KMP_MB(); /* Flush all pending memory write invalidates. */ 2235 2236 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2237 2238 #if OMPT_SUPPORT 2239 if (ompt_enabled.enabled) { 2240 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2241 } 2242 #endif 2243 2244 return TRUE; 2245 } 2246 2247 #if OMPT_SUPPORT 2248 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2249 kmp_team_t *team) { 2250 // restore state outside the region 2251 thread->th.ompt_thread_info.state = 2252 ((team->t.t_serialized) ? 
ompt_state_work_serial 2253 : ompt_state_work_parallel); 2254 } 2255 2256 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2257 kmp_team_t *team, ompt_data_t *parallel_data, 2258 int flags, void *codeptr) { 2259 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2260 if (ompt_enabled.ompt_callback_parallel_end) { 2261 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2262 parallel_data, &(task_info->task_data), flags, codeptr); 2263 } 2264 2265 task_info->frame.enter_frame = ompt_data_none; 2266 __kmp_join_restore_state(thread, team); 2267 } 2268 #endif 2269 2270 void __kmp_join_call(ident_t *loc, int gtid 2271 #if OMPT_SUPPORT 2272 , 2273 enum fork_context_e fork_context 2274 #endif 2275 , 2276 int exit_teams) { 2277 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2278 kmp_team_t *team; 2279 kmp_team_t *parent_team; 2280 kmp_info_t *master_th; 2281 kmp_root_t *root; 2282 int master_active; 2283 2284 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2285 2286 /* setup current data */ 2287 master_th = __kmp_threads[gtid]; 2288 root = master_th->th.th_root; 2289 team = master_th->th.th_team; 2290 parent_team = team->t.t_parent; 2291 2292 master_th->th.th_ident = loc; 2293 2294 #if OMPT_SUPPORT 2295 void *team_microtask = (void *)team->t.t_pkfn; 2296 if (ompt_enabled.enabled) { 2297 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2298 } 2299 #endif 2300 2301 #if KMP_DEBUG 2302 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2303 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2304 "th_task_team = %p\n", 2305 __kmp_gtid_from_thread(master_th), team, 2306 team->t.t_task_team[master_th->th.th_task_state], 2307 master_th->th.th_task_team)); 2308 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2309 team->t.t_task_team[master_th->th.th_task_state]); 2310 } 2311 #endif 2312 2313 if (team->t.t_serialized) { 2314 if (master_th->th.th_teams_microtask) { 2315 // We are in teams construct 2316 int level = team->t.t_level; 2317 int tlevel = master_th->th.th_teams_level; 2318 if (level == tlevel) { 2319 // AC: we haven't incremented it earlier at start of teams construct, 2320 // so do it here - at the end of teams construct 2321 team->t.t_level++; 2322 } else if (level == tlevel + 1) { 2323 // AC: we are exiting parallel inside teams, need to increment 2324 // serialization in order to restore it in the next call to 2325 // __kmpc_end_serialized_parallel 2326 team->t.t_serialized++; 2327 } 2328 } 2329 __kmpc_end_serialized_parallel(loc, gtid); 2330 2331 #if OMPT_SUPPORT 2332 if (ompt_enabled.enabled) { 2333 __kmp_join_restore_state(master_th, parent_team); 2334 } 2335 #endif 2336 2337 return; 2338 } 2339 2340 master_active = team->t.t_master_active; 2341 2342 if (!exit_teams) { 2343 // AC: No barrier for internal teams at exit from teams construct. 2344 // But there is barrier for external team (league). 
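    // Sketch of this branch (restating the surrounding code, no new behavior):
    //
    //   if (!exit_teams)
    //     __kmp_internal_join(loc, gtid, team);  // join barrier with workers
    //   else
    //     master_th->th.th_task_state = 0;       // teams exit: skip the join
    //                                            // barrier, no tasking outside
    //                                            // any parallel region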
2345 __kmp_internal_join(loc, gtid, team); 2346 } else { 2347 master_th->th.th_task_state = 2348 0; // AC: no tasking in teams (out of any parallel) 2349 } 2350 2351 KMP_MB(); 2352 2353 #if OMPT_SUPPORT 2354 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2355 void *codeptr = team->t.ompt_team_info.master_return_address; 2356 #endif 2357 2358 #if USE_ITT_BUILD 2359 if (__itt_stack_caller_create_ptr) { 2360 // destroy the stack stitching id after join barrier 2361 __kmp_itt_stack_caller_destroy((__itt_caller)team->t.t_stack_id); 2362 } 2363 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2364 if (team->t.t_active_level == 1 && 2365 (!master_th->th.th_teams_microtask || /* not in teams construct */ 2366 master_th->th.th_teams_size.nteams == 1)) { 2367 master_th->th.th_ident = loc; 2368 // only one notification scheme (either "submit" or "forking/joined", not 2369 // both) 2370 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2371 __kmp_forkjoin_frames_mode == 3) 2372 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2373 master_th->th.th_frame_time, 0, loc, 2374 master_th->th.th_team_nproc, 1); 2375 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2376 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2377 __kmp_itt_region_joined(gtid); 2378 } // active_level == 1 2379 #endif /* USE_ITT_BUILD */ 2380 2381 if (master_th->th.th_teams_microtask && !exit_teams && 2382 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2383 team->t.t_level == master_th->th.th_teams_level + 1) { 2384 // AC: We need to leave the team structure intact at the end of parallel 2385 // inside the teams construct, so that at the next parallel same (hot) team 2386 // works, only adjust nesting levels 2387 #if OMPT_SUPPORT 2388 ompt_data_t ompt_parallel_data = ompt_data_none; 2389 if (ompt_enabled.enabled) { 2390 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2391 if (ompt_enabled.ompt_callback_implicit_task) { 2392 int ompt_team_size = team->t.t_nproc; 2393 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2394 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2395 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); 2396 } 2397 task_info->frame.exit_frame = ompt_data_none; 2398 task_info->task_data = ompt_data_none; 2399 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 2400 __ompt_lw_taskteam_unlink(master_th); 2401 } 2402 #endif 2403 /* Decrement our nested depth level */ 2404 team->t.t_level--; 2405 team->t.t_active_level--; 2406 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2407 2408 // Restore number of threads in the team if needed. This code relies on 2409 // the proper adjustment of th_teams_size.nth after the fork in 2410 // __kmp_teams_master on each teams master in the case that 2411 // __kmp_reserve_threads reduced it. 2412 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2413 int old_num = master_th->th.th_team_nproc; 2414 int new_num = master_th->th.th_teams_size.nth; 2415 kmp_info_t **other_threads = team->t.t_threads; 2416 team->t.t_nproc = new_num; 2417 for (int i = 0; i < old_num; ++i) { 2418 other_threads[i]->th.th_team_nproc = new_num; 2419 } 2420 // Adjust states of non-used threads of the team 2421 for (int i = old_num; i < new_num; ++i) { 2422 // Re-initialize thread's barrier data. 
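        // Illustrative scenario (hypothetical numbers): if the inner parallel
        // ran with num_threads(4) while each team of the league was created
        // with th_teams_size.nth == 8, the fork kept threads 4..7 hot and
        // dropped t_nproc to 4. Here old_num == 4 and new_num == 8, and this
        // loop resynchronizes the b_arrived counters of threads 4..7 with the
        // team's current barrier state (and their th_task_state, if tasking
        // is enabled) so they can take part in the next fork/join cycle.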
2423 KMP_DEBUG_ASSERT(other_threads[i]); 2424 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2425 for (int b = 0; b < bs_last_barrier; ++b) { 2426 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2427 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2428 #if USE_DEBUGGER 2429 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2430 #endif 2431 } 2432 if (__kmp_tasking_mode != tskm_immediate_exec) { 2433 // Synchronize thread's task state 2434 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2435 } 2436 } 2437 } 2438 2439 #if OMPT_SUPPORT 2440 if (ompt_enabled.enabled) { 2441 __kmp_join_ompt(gtid, master_th, parent_team, &ompt_parallel_data, 2442 OMPT_INVOKER(fork_context) | ompt_parallel_team, codeptr); 2443 } 2444 #endif 2445 2446 return; 2447 } 2448 2449 /* do cleanup and restore the parent team */ 2450 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2451 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2452 2453 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2454 2455 /* jc: The following lock has instructions with REL and ACQ semantics, 2456 separating the parallel user code called in this parallel region 2457 from the serial user code called after this function returns. */ 2458 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2459 2460 if (!master_th->th.th_teams_microtask || 2461 team->t.t_level > master_th->th.th_teams_level) { 2462 /* Decrement our nested depth level */ 2463 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2464 } 2465 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2466 2467 #if OMPT_SUPPORT 2468 if (ompt_enabled.enabled) { 2469 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2470 if (ompt_enabled.ompt_callback_implicit_task) { 2471 int flags = (team_microtask == (void *)__kmp_teams_master) 2472 ? ompt_task_initial 2473 : ompt_task_implicit; 2474 int ompt_team_size = (flags == ompt_task_initial) ? 0 : team->t.t_nproc; 2475 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2476 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2477 OMPT_CUR_TASK_INFO(master_th)->thread_num, flags); 2478 } 2479 task_info->frame.exit_frame = ompt_data_none; 2480 task_info->task_data = ompt_data_none; 2481 } 2482 #endif 2483 2484 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2485 master_th, team)); 2486 __kmp_pop_current_task_from_thread(master_th); 2487 2488 #if KMP_AFFINITY_SUPPORTED 2489 // Restore master thread's partition. 2490 master_th->th.th_first_place = team->t.t_first_place; 2491 master_th->th.th_last_place = team->t.t_last_place; 2492 #endif // KMP_AFFINITY_SUPPORTED 2493 master_th->th.th_def_allocator = team->t.t_def_allocator; 2494 2495 updateHWFPControl(team); 2496 2497 if (root->r.r_active != master_active) 2498 root->r.r_active = master_active; 2499 2500 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2501 master_th)); // this will free worker threads 2502 2503 /* this race was fun to find. make sure the following is in the critical 2504 region otherwise assertions may fail occasionally since the old team may be 2505 reallocated and the hierarchy appears inconsistent. it is actually safe to 2506 run and won't cause any bugs, but will cause those assertion failures. 
it's 2507 only one deref&assign so might as well put this in the critical region */ 2508 master_th->th.th_team = parent_team; 2509 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2510 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2511 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2512 2513 /* restore serialized team, if need be */ 2514 if (parent_team->t.t_serialized && 2515 parent_team != master_th->th.th_serial_team && 2516 parent_team != root->r.r_root_team) { 2517 __kmp_free_team(root, 2518 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2519 master_th->th.th_serial_team = parent_team; 2520 } 2521 2522 if (__kmp_tasking_mode != tskm_immediate_exec) { 2523 if (master_th->th.th_task_state_top > 2524 0) { // Restore task state from memo stack 2525 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2526 // Remember master's state if we re-use this nested hot team 2527 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2528 master_th->th.th_task_state; 2529 --master_th->th.th_task_state_top; // pop 2530 // Now restore state at this level 2531 master_th->th.th_task_state = 2532 master_th->th 2533 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2534 } 2535 // Copy the task team from the parent team to the master thread 2536 master_th->th.th_task_team = 2537 parent_team->t.t_task_team[master_th->th.th_task_state]; 2538 KA_TRACE(20, 2539 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2540 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2541 parent_team)); 2542 } 2543 2544 // TODO: GEH - cannot do this assertion because root thread not set up as 2545 // executing 2546 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2547 master_th->th.th_current_task->td_flags.executing = 1; 2548 2549 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2550 2551 #if OMPT_SUPPORT 2552 int flags = 2553 OMPT_INVOKER(fork_context) | 2554 ((team_microtask == (void *)__kmp_teams_master) ? ompt_parallel_league 2555 : ompt_parallel_team); 2556 if (ompt_enabled.enabled) { 2557 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, flags, 2558 codeptr); 2559 } 2560 #endif 2561 2562 KMP_MB(); 2563 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2564 } 2565 2566 /* Check whether we should push an internal control record onto the 2567 serial team stack. If so, do it. 
*/ 2568 void __kmp_save_internal_controls(kmp_info_t *thread) { 2569 2570 if (thread->th.th_team != thread->th.th_serial_team) { 2571 return; 2572 } 2573 if (thread->th.th_team->t.t_serialized > 1) { 2574 int push = 0; 2575 2576 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2577 push = 1; 2578 } else { 2579 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2580 thread->th.th_team->t.t_serialized) { 2581 push = 1; 2582 } 2583 } 2584 if (push) { /* push a record on the serial team's stack */ 2585 kmp_internal_control_t *control = 2586 (kmp_internal_control_t *)__kmp_allocate( 2587 sizeof(kmp_internal_control_t)); 2588 2589 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2590 2591 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2592 2593 control->next = thread->th.th_team->t.t_control_stack_top; 2594 thread->th.th_team->t.t_control_stack_top = control; 2595 } 2596 } 2597 } 2598 2599 /* Changes set_nproc */ 2600 void __kmp_set_num_threads(int new_nth, int gtid) { 2601 kmp_info_t *thread; 2602 kmp_root_t *root; 2603 2604 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2605 KMP_DEBUG_ASSERT(__kmp_init_serial); 2606 2607 if (new_nth < 1) 2608 new_nth = 1; 2609 else if (new_nth > __kmp_max_nth) 2610 new_nth = __kmp_max_nth; 2611 2612 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2613 thread = __kmp_threads[gtid]; 2614 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2615 return; // nothing to do 2616 2617 __kmp_save_internal_controls(thread); 2618 2619 set__nproc(thread, new_nth); 2620 2621 // If this omp_set_num_threads() call will cause the hot team size to be 2622 // reduced (in the absence of a num_threads clause), then reduce it now, 2623 // rather than waiting for the next parallel region. 2624 root = thread->th.th_root; 2625 if (__kmp_init_parallel && (!root->r.r_active) && 2626 (root->r.r_hot_team->t.t_nproc > new_nth) 2627 #if KMP_NESTED_HOT_TEAMS 2628 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2629 #endif 2630 ) { 2631 kmp_team_t *hot_team = root->r.r_hot_team; 2632 int f; 2633 2634 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2635 2636 // Release the extra threads we don't need any more. 2637 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2638 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2639 if (__kmp_tasking_mode != tskm_immediate_exec) { 2640 // When decreasing team size, threads no longer in the team should unref 2641 // task team. 2642 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2643 } 2644 __kmp_free_thread(hot_team->t.t_threads[f]); 2645 hot_team->t.t_threads[f] = NULL; 2646 } 2647 hot_team->t.t_nproc = new_nth; 2648 #if KMP_NESTED_HOT_TEAMS 2649 if (thread->th.th_hot_teams) { 2650 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2651 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2652 } 2653 #endif 2654 2655 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2656 2657 // Update the t_nproc field in the threads that are still active. 
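    // Illustrative user-level example (not runtime code; assumes the usual
    // omp_set_num_threads() -> __kmp_set_num_threads() entry path): calling
    //
    //   omp_set_num_threads(2);   // outside any parallel region
    //
    // on a root whose hot team last ran with 8 threads releases threads 2..7
    // above, sets t_nproc to 2, and the loop below updates th_team_nproc for
    // the two remaining threads, so the next parallel region starts with the
    // reduced hot-team size already in place.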
2658 for (f = 0; f < new_nth; f++) { 2659 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2660 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2661 } 2662 // Special flag in case omp_set_num_threads() call 2663 hot_team->t.t_size_changed = -1; 2664 } 2665 } 2666 2667 /* Changes max_active_levels */ 2668 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2669 kmp_info_t *thread; 2670 2671 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2672 "%d = (%d)\n", 2673 gtid, max_active_levels)); 2674 KMP_DEBUG_ASSERT(__kmp_init_serial); 2675 2676 // validate max_active_levels 2677 if (max_active_levels < 0) { 2678 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2679 // We ignore this call if the user has specified a negative value. 2680 // The current setting won't be changed. The last valid setting will be 2681 // used. A warning will be issued (if warnings are allowed as controlled by 2682 // the KMP_WARNINGS env var). 2683 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2684 "max_active_levels for thread %d = (%d)\n", 2685 gtid, max_active_levels)); 2686 return; 2687 } 2688 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2689 // it's OK, the max_active_levels is within the valid range: [ 0; 2690 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2691 // We allow a zero value. (implementation defined behavior) 2692 } else { 2693 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2694 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2695 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2696 // Current upper limit is MAX_INT. (implementation defined behavior) 2697 // If the input exceeds the upper limit, we correct the input to be the 2698 // upper limit. (implementation defined behavior) 2699 // Actually, the flow should never get here until we use MAX_INT limit. 2700 } 2701 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2702 "max_active_levels for thread %d = (%d)\n", 2703 gtid, max_active_levels)); 2704 2705 thread = __kmp_threads[gtid]; 2706 2707 __kmp_save_internal_controls(thread); 2708 2709 set__max_active_levels(thread, max_active_levels); 2710 } 2711 2712 /* Gets max_active_levels */ 2713 int __kmp_get_max_active_levels(int gtid) { 2714 kmp_info_t *thread; 2715 2716 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2717 KMP_DEBUG_ASSERT(__kmp_init_serial); 2718 2719 thread = __kmp_threads[gtid]; 2720 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2721 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2722 "curtask_maxaclevel=%d\n", 2723 gtid, thread->th.th_current_task, 2724 thread->th.th_current_task->td_icvs.max_active_levels)); 2725 return thread->th.th_current_task->td_icvs.max_active_levels; 2726 } 2727 2728 KMP_BUILD_ASSERT(sizeof(kmp_sched_t) == sizeof(int)); 2729 KMP_BUILD_ASSERT(sizeof(enum sched_type) == sizeof(int)); 2730 2731 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2732 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2733 kmp_info_t *thread; 2734 kmp_sched_t orig_kind; 2735 // kmp_team_t *team; 2736 2737 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2738 gtid, (int)kind, chunk)); 2739 KMP_DEBUG_ASSERT(__kmp_init_serial); 2740 2741 // Check if the kind parameter is valid, correct if needed. 
2742 // Valid parameters should fit in one of two intervals - standard or extended: 2743 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2744 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2745 orig_kind = kind; 2746 kind = __kmp_sched_without_mods(kind); 2747 2748 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2749 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2750 // TODO: Hint needs attention in case we change the default schedule. 2751 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2752 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2753 __kmp_msg_null); 2754 kind = kmp_sched_default; 2755 chunk = 0; // ignore chunk value in case of bad kind 2756 } 2757 2758 thread = __kmp_threads[gtid]; 2759 2760 __kmp_save_internal_controls(thread); 2761 2762 if (kind < kmp_sched_upper_std) { 2763 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2764 // differ static chunked vs. unchunked: chunk should be invalid to 2765 // indicate unchunked schedule (which is the default) 2766 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2767 } else { 2768 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2769 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2770 } 2771 } else { 2772 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2773 // kmp_sched_lower - 2 ]; 2774 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2775 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2776 kmp_sched_lower - 2]; 2777 } 2778 __kmp_sched_apply_mods_intkind( 2779 orig_kind, &(thread->th.th_current_task->td_icvs.sched.r_sched_type)); 2780 if (kind == kmp_sched_auto || chunk < 1) { 2781 // ignore parameter chunk for schedule auto 2782 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2783 } else { 2784 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2785 } 2786 } 2787 2788 /* Gets def_sched_var ICV values */ 2789 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2790 kmp_info_t *thread; 2791 enum sched_type th_type; 2792 2793 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2794 KMP_DEBUG_ASSERT(__kmp_init_serial); 2795 2796 thread = __kmp_threads[gtid]; 2797 2798 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2799 switch (SCHEDULE_WITHOUT_MODIFIERS(th_type)) { 2800 case kmp_sch_static: 2801 case kmp_sch_static_greedy: 2802 case kmp_sch_static_balanced: 2803 *kind = kmp_sched_static; 2804 __kmp_sched_apply_mods_stdkind(kind, th_type); 2805 *chunk = 0; // chunk was not set, try to show this fact via zero value 2806 return; 2807 case kmp_sch_static_chunked: 2808 *kind = kmp_sched_static; 2809 break; 2810 case kmp_sch_dynamic_chunked: 2811 *kind = kmp_sched_dynamic; 2812 break; 2813 case kmp_sch_guided_chunked: 2814 case kmp_sch_guided_iterative_chunked: 2815 case kmp_sch_guided_analytical_chunked: 2816 *kind = kmp_sched_guided; 2817 break; 2818 case kmp_sch_auto: 2819 *kind = kmp_sched_auto; 2820 break; 2821 case kmp_sch_trapezoidal: 2822 *kind = kmp_sched_trapezoidal; 2823 break; 2824 #if KMP_STATIC_STEAL_ENABLED 2825 case kmp_sch_static_steal: 2826 *kind = kmp_sched_static_steal; 2827 break; 2828 #endif 2829 default: 2830 KMP_FATAL(UnknownSchedulingType, th_type); 2831 } 2832 2833 __kmp_sched_apply_mods_stdkind(kind, th_type); 2834 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2835 } 2836 2837 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2838 2839 int ii, dd; 2840 kmp_team_t *team; 2841 
kmp_info_t *thr; 2842 2843 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2844 KMP_DEBUG_ASSERT(__kmp_init_serial); 2845 2846 // validate level 2847 if (level == 0) 2848 return 0; 2849 if (level < 0) 2850 return -1; 2851 thr = __kmp_threads[gtid]; 2852 team = thr->th.th_team; 2853 ii = team->t.t_level; 2854 if (level > ii) 2855 return -1; 2856 2857 if (thr->th.th_teams_microtask) { 2858 // AC: we are in teams region where multiple nested teams have same level 2859 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2860 if (level <= 2861 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2862 KMP_DEBUG_ASSERT(ii >= tlevel); 2863 // AC: As we need to pass by the teams league, we need to artificially 2864 // increase ii 2865 if (ii == tlevel) { 2866 ii += 2; // three teams have same level 2867 } else { 2868 ii++; // two teams have same level 2869 } 2870 } 2871 } 2872 2873 if (ii == level) 2874 return __kmp_tid_from_gtid(gtid); 2875 2876 dd = team->t.t_serialized; 2877 level++; 2878 while (ii > level) { 2879 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2880 } 2881 if ((team->t.t_serialized) && (!dd)) { 2882 team = team->t.t_parent; 2883 continue; 2884 } 2885 if (ii > level) { 2886 team = team->t.t_parent; 2887 dd = team->t.t_serialized; 2888 ii--; 2889 } 2890 } 2891 2892 return (dd > 1) ? (0) : (team->t.t_master_tid); 2893 } 2894 2895 int __kmp_get_team_size(int gtid, int level) { 2896 2897 int ii, dd; 2898 kmp_team_t *team; 2899 kmp_info_t *thr; 2900 2901 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2902 KMP_DEBUG_ASSERT(__kmp_init_serial); 2903 2904 // validate level 2905 if (level == 0) 2906 return 1; 2907 if (level < 0) 2908 return -1; 2909 thr = __kmp_threads[gtid]; 2910 team = thr->th.th_team; 2911 ii = team->t.t_level; 2912 if (level > ii) 2913 return -1; 2914 2915 if (thr->th.th_teams_microtask) { 2916 // AC: we are in teams region where multiple nested teams have same level 2917 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2918 if (level <= 2919 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2920 KMP_DEBUG_ASSERT(ii >= tlevel); 2921 // AC: As we need to pass by the teams league, we need to artificially 2922 // increase ii 2923 if (ii == tlevel) { 2924 ii += 2; // three teams have same level 2925 } else { 2926 ii++; // two teams have same level 2927 } 2928 } 2929 } 2930 2931 while (ii > level) { 2932 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2933 } 2934 if (team->t.t_serialized && (!dd)) { 2935 team = team->t.t_parent; 2936 continue; 2937 } 2938 if (ii > level) { 2939 team = team->t.t_parent; 2940 ii--; 2941 } 2942 } 2943 2944 return team->t.t_nproc; 2945 } 2946 2947 kmp_r_sched_t __kmp_get_schedule_global() { 2948 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2949 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2950 // independently. So one can get the updated schedule here. 2951 2952 kmp_r_sched_t r_sched; 2953 2954 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2955 // __kmp_guided. 
__kmp_sched should keep original value, so that user can set 2956 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2957 // different roots (even in OMP 2.5) 2958 enum sched_type s = SCHEDULE_WITHOUT_MODIFIERS(__kmp_sched); 2959 enum sched_type sched_modifiers = SCHEDULE_GET_MODIFIERS(__kmp_sched); 2960 if (s == kmp_sch_static) { 2961 // replace STATIC with more detailed schedule (balanced or greedy) 2962 r_sched.r_sched_type = __kmp_static; 2963 } else if (s == kmp_sch_guided_chunked) { 2964 // replace GUIDED with more detailed schedule (iterative or analytical) 2965 r_sched.r_sched_type = __kmp_guided; 2966 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2967 r_sched.r_sched_type = __kmp_sched; 2968 } 2969 SCHEDULE_SET_MODIFIERS(r_sched.r_sched_type, sched_modifiers); 2970 2971 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2972 // __kmp_chunk may be wrong here (if it was not ever set) 2973 r_sched.chunk = KMP_DEFAULT_CHUNK; 2974 } else { 2975 r_sched.chunk = __kmp_chunk; 2976 } 2977 2978 return r_sched; 2979 } 2980 2981 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2982 at least argc number of *t_argv entries for the requested team. */ 2983 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2984 2985 KMP_DEBUG_ASSERT(team); 2986 if (!realloc || argc > team->t.t_max_argc) { 2987 2988 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 2989 "current entries=%d\n", 2990 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 2991 /* if previously allocated heap space for args, free them */ 2992 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 2993 __kmp_free((void *)team->t.t_argv); 2994 2995 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 2996 /* use unused space in the cache line for arguments */ 2997 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 2998 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 2999 "argv entries\n", 3000 team->t.t_id, team->t.t_max_argc)); 3001 team->t.t_argv = &team->t.t_inline_argv[0]; 3002 if (__kmp_storage_map) { 3003 __kmp_print_storage_map_gtid( 3004 -1, &team->t.t_inline_argv[0], 3005 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3006 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3007 team->t.t_id); 3008 } 3009 } else { 3010 /* allocate space for arguments in the heap */ 3011 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3012 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3013 : 2 * argc; 3014 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3015 "argv entries\n", 3016 team->t.t_id, team->t.t_max_argc)); 3017 team->t.t_argv = 3018 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3019 if (__kmp_storage_map) { 3020 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3021 &team->t.t_argv[team->t.t_max_argc], 3022 sizeof(void *) * team->t.t_max_argc, 3023 "team_%d.t_argv", team->t.t_id); 3024 } 3025 } 3026 } 3027 } 3028 3029 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3030 int i; 3031 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3032 team->t.t_threads = 3033 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3034 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3035 sizeof(dispatch_shared_info_t) * num_disp_buff); 3036 team->t.t_dispatch = 3037 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3038 team->t.t_implicit_task_taskdata = 3039 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3040 team->t.t_max_nproc = max_nth; 3041 3042 /* setup dispatch buffers */ 3043 for (i = 0; i < num_disp_buff; ++i) { 3044 team->t.t_disp_buffer[i].buffer_index = i; 3045 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3046 } 3047 } 3048 3049 static void __kmp_free_team_arrays(kmp_team_t *team) { 3050 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3051 int i; 3052 for (i = 0; i < team->t.t_max_nproc; ++i) { 3053 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3054 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3055 team->t.t_dispatch[i].th_disp_buffer = NULL; 3056 } 3057 } 3058 #if KMP_USE_HIER_SCHED 3059 __kmp_dispatch_free_hierarchies(team); 3060 #endif 3061 __kmp_free(team->t.t_threads); 3062 __kmp_free(team->t.t_disp_buffer); 3063 __kmp_free(team->t.t_dispatch); 3064 __kmp_free(team->t.t_implicit_task_taskdata); 3065 team->t.t_threads = NULL; 3066 team->t.t_disp_buffer = NULL; 3067 team->t.t_dispatch = NULL; 3068 team->t.t_implicit_task_taskdata = 0; 3069 } 3070 3071 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3072 kmp_info_t **oldThreads = team->t.t_threads; 3073 3074 __kmp_free(team->t.t_disp_buffer); 3075 __kmp_free(team->t.t_dispatch); 3076 __kmp_free(team->t.t_implicit_task_taskdata); 3077 __kmp_allocate_team_arrays(team, max_nth); 3078 3079 KMP_MEMCPY(team->t.t_threads, oldThreads, 3080 team->t.t_nproc * sizeof(kmp_info_t *)); 3081 3082 __kmp_free(oldThreads); 3083 } 3084 3085 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3086 3087 kmp_r_sched_t r_sched = 3088 __kmp_get_schedule_global(); // get current state of scheduling globals 3089 3090 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3091 3092 kmp_internal_control_t g_icvs = { 3093 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3094 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3095 // adjustment of threads (per thread) 3096 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3097 // whether blocktime is explicitly set 3098 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3099 #if KMP_USE_MONITOR 3100 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3101 // intervals 3102 #endif 3103 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3104 // next parallel region (per thread) 3105 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3106 __kmp_cg_max_nth, // int thread_limit; 3107 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3108 // for max_active_levels 3109 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3110 // {sched,chunk} pair 3111 __kmp_nested_proc_bind.bind_types[0], 3112 __kmp_default_device, 3113 NULL // struct kmp_internal_control *next; 3114 }; 3115 3116 return g_icvs; 3117 } 3118 3119 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3120 3121 kmp_internal_control_t gx_icvs; 3122 gx_icvs.serial_nesting_level = 3123 0; // probably =team->t.t_serial 
like in save_inter_controls 3124 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3125 gx_icvs.next = NULL; 3126 3127 return gx_icvs; 3128 } 3129 3130 static void __kmp_initialize_root(kmp_root_t *root) { 3131 int f; 3132 kmp_team_t *root_team; 3133 kmp_team_t *hot_team; 3134 int hot_team_max_nth; 3135 kmp_r_sched_t r_sched = 3136 __kmp_get_schedule_global(); // get current state of scheduling globals 3137 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3138 KMP_DEBUG_ASSERT(root); 3139 KMP_ASSERT(!root->r.r_begin); 3140 3141 /* setup the root state structure */ 3142 __kmp_init_lock(&root->r.r_begin_lock); 3143 root->r.r_begin = FALSE; 3144 root->r.r_active = FALSE; 3145 root->r.r_in_parallel = 0; 3146 root->r.r_blocktime = __kmp_dflt_blocktime; 3147 3148 /* setup the root team for this task */ 3149 /* allocate the root team structure */ 3150 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3151 3152 root_team = 3153 __kmp_allocate_team(root, 3154 1, // new_nproc 3155 1, // max_nproc 3156 #if OMPT_SUPPORT 3157 ompt_data_none, // root parallel id 3158 #endif 3159 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3160 0 // argc 3161 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3162 ); 3163 #if USE_DEBUGGER 3164 // Non-NULL value should be assigned to make the debugger display the root 3165 // team. 3166 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3167 #endif 3168 3169 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3170 3171 root->r.r_root_team = root_team; 3172 root_team->t.t_control_stack_top = NULL; 3173 3174 /* initialize root team */ 3175 root_team->t.t_threads[0] = NULL; 3176 root_team->t.t_nproc = 1; 3177 root_team->t.t_serialized = 1; 3178 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3179 root_team->t.t_sched.sched = r_sched.sched; 3180 KA_TRACE( 3181 20, 3182 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3183 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3184 3185 /* setup the hot team for this task */ 3186 /* allocate the hot team structure */ 3187 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3188 3189 hot_team = 3190 __kmp_allocate_team(root, 3191 1, // new_nproc 3192 __kmp_dflt_team_nth_ub * 2, // max_nproc 3193 #if OMPT_SUPPORT 3194 ompt_data_none, // root parallel id 3195 #endif 3196 __kmp_nested_proc_bind.bind_types[0], &r_icvs, 3197 0 // argc 3198 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3199 ); 3200 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3201 3202 root->r.r_hot_team = hot_team; 3203 root_team->t.t_control_stack_top = NULL; 3204 3205 /* first-time initialization */ 3206 hot_team->t.t_parent = root_team; 3207 3208 /* initialize hot team */ 3209 hot_team_max_nth = hot_team->t.t_max_nproc; 3210 for (f = 0; f < hot_team_max_nth; ++f) { 3211 hot_team->t.t_threads[f] = NULL; 3212 } 3213 hot_team->t.t_nproc = 1; 3214 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3215 hot_team->t.t_sched.sched = r_sched.sched; 3216 hot_team->t.t_size_changed = 0; 3217 } 3218 3219 #ifdef KMP_DEBUG 3220 3221 typedef struct kmp_team_list_item { 3222 kmp_team_p const *entry; 3223 struct kmp_team_list_item *next; 3224 } kmp_team_list_item_t; 3225 typedef kmp_team_list_item_t *kmp_team_list_t; 3226 3227 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3228 kmp_team_list_t list, // List of teams. 
3229 kmp_team_p const *team // Team to add. 3230 ) { 3231 3232 // List must terminate with item where both entry and next are NULL. 3233 // Team is added to the list only once. 3234 // List is sorted in ascending order by team id. 3235 // Team id is *not* a key. 3236 3237 kmp_team_list_t l; 3238 3239 KMP_DEBUG_ASSERT(list != NULL); 3240 if (team == NULL) { 3241 return; 3242 } 3243 3244 __kmp_print_structure_team_accum(list, team->t.t_parent); 3245 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3246 3247 // Search list for the team. 3248 l = list; 3249 while (l->next != NULL && l->entry != team) { 3250 l = l->next; 3251 } 3252 if (l->next != NULL) { 3253 return; // Team has been added before, exit. 3254 } 3255 3256 // Team is not found. Search list again for insertion point. 3257 l = list; 3258 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3259 l = l->next; 3260 } 3261 3262 // Insert team. 3263 { 3264 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3265 sizeof(kmp_team_list_item_t)); 3266 *item = *l; 3267 l->entry = team; 3268 l->next = item; 3269 } 3270 } 3271 3272 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3273 3274 ) { 3275 __kmp_printf("%s", title); 3276 if (team != NULL) { 3277 __kmp_printf("%2x %p\n", team->t.t_id, team); 3278 } else { 3279 __kmp_printf(" - (nil)\n"); 3280 } 3281 } 3282 3283 static void __kmp_print_structure_thread(char const *title, 3284 kmp_info_p const *thread) { 3285 __kmp_printf("%s", title); 3286 if (thread != NULL) { 3287 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3288 } else { 3289 __kmp_printf(" - (nil)\n"); 3290 } 3291 } 3292 3293 void __kmp_print_structure(void) { 3294 3295 kmp_team_list_t list; 3296 3297 // Initialize list of teams. 3298 list = 3299 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3300 list->entry = NULL; 3301 list->next = NULL; 3302 3303 __kmp_printf("\n------------------------------\nGlobal Thread " 3304 "Table\n------------------------------\n"); 3305 { 3306 int gtid; 3307 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3308 __kmp_printf("%2d", gtid); 3309 if (__kmp_threads != NULL) { 3310 __kmp_printf(" %p", __kmp_threads[gtid]); 3311 } 3312 if (__kmp_root != NULL) { 3313 __kmp_printf(" %p", __kmp_root[gtid]); 3314 } 3315 __kmp_printf("\n"); 3316 } 3317 } 3318 3319 // Print out __kmp_threads array. 
3320 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3321 "----------\n"); 3322 if (__kmp_threads != NULL) { 3323 int gtid; 3324 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3325 kmp_info_t const *thread = __kmp_threads[gtid]; 3326 if (thread != NULL) { 3327 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3328 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3329 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3330 __kmp_print_structure_team(" Serial Team: ", 3331 thread->th.th_serial_team); 3332 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3333 __kmp_print_structure_thread(" Master: ", 3334 thread->th.th_team_master); 3335 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3336 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3337 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3338 __kmp_print_structure_thread(" Next in pool: ", 3339 thread->th.th_next_pool); 3340 __kmp_printf("\n"); 3341 __kmp_print_structure_team_accum(list, thread->th.th_team); 3342 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3343 } 3344 } 3345 } else { 3346 __kmp_printf("Threads array is not allocated.\n"); 3347 } 3348 3349 // Print out __kmp_root array. 3350 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3351 "--------\n"); 3352 if (__kmp_root != NULL) { 3353 int gtid; 3354 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3355 kmp_root_t const *root = __kmp_root[gtid]; 3356 if (root != NULL) { 3357 __kmp_printf("GTID %2d %p:\n", gtid, root); 3358 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3359 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3360 __kmp_print_structure_thread(" Uber Thread: ", 3361 root->r.r_uber_thread); 3362 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3363 __kmp_printf(" In Parallel: %2d\n", 3364 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3365 __kmp_printf("\n"); 3366 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3367 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3368 } 3369 } 3370 } else { 3371 __kmp_printf("Ubers array is not allocated.\n"); 3372 } 3373 3374 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3375 "--------\n"); 3376 while (list->next != NULL) { 3377 kmp_team_p const *team = list->entry; 3378 int i; 3379 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3380 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3381 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3382 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3383 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3384 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3385 for (i = 0; i < team->t.t_nproc; ++i) { 3386 __kmp_printf(" Thread %2d: ", i); 3387 __kmp_print_structure_thread("", team->t.t_threads[i]); 3388 } 3389 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3390 __kmp_printf("\n"); 3391 list = list->next; 3392 } 3393 3394 // Print out __kmp_thread_pool and __kmp_team_pool. 3395 __kmp_printf("\n------------------------------\nPools\n----------------------" 3396 "--------\n"); 3397 __kmp_print_structure_thread("Thread pool: ", 3398 CCAST(kmp_info_t *, __kmp_thread_pool)); 3399 __kmp_print_structure_team("Team pool: ", 3400 CCAST(kmp_team_t *, __kmp_team_pool)); 3401 __kmp_printf("\n"); 3402 3403 // Free team list. 
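// The accumulator list built above always ends with a sentinel item whose entry and
// next are both NULL; the loop below releases every node, the sentinel included.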
3404 while (list != NULL) { 3405 kmp_team_list_item_t *item = list; 3406 list = list->next; 3407 KMP_INTERNAL_FREE(item); 3408 } 3409 } 3410 3411 #endif 3412 3413 //--------------------------------------------------------------------------- 3414 // Stuff for per-thread fast random number generator 3415 // Table of primes 3416 static const unsigned __kmp_primes[] = { 3417 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3418 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3419 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3420 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3421 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3422 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3423 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3424 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3425 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3426 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3427 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3428 3429 //--------------------------------------------------------------------------- 3430 // __kmp_get_random: Get a random number using a linear congruential method. 3431 unsigned short __kmp_get_random(kmp_info_t *thread) { 3432 unsigned x = thread->th.th_x; 3433 unsigned short r = x >> 16; 3434 3435 thread->th.th_x = x * thread->th.th_a + 1; 3436 3437 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3438 thread->th.th_info.ds.ds_tid, r)); 3439 3440 return r; 3441 } 3442 //-------------------------------------------------------- 3443 // __kmp_init_random: Initialize a random number generator 3444 void __kmp_init_random(kmp_info_t *thread) { 3445 unsigned seed = thread->th.th_info.ds.ds_tid; 3446 3447 thread->th.th_a = 3448 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3449 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3450 KA_TRACE(30, 3451 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3452 } 3453 3454 #if KMP_OS_WINDOWS 3455 /* reclaim array entries for root threads that are already dead, returns number 3456 * reclaimed */ 3457 static int __kmp_reclaim_dead_roots(void) { 3458 int i, r = 0; 3459 3460 for (i = 0; i < __kmp_threads_capacity; ++i) { 3461 if (KMP_UBER_GTID(i) && 3462 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3463 !__kmp_root[i] 3464 ->r.r_active) { // AC: reclaim only roots died in non-active state 3465 r += __kmp_unregister_root_other_thread(i); 3466 } 3467 } 3468 return r; 3469 } 3470 #endif 3471 3472 /* This function attempts to create free entries in __kmp_threads and 3473 __kmp_root, and returns the number of free entries generated. 3474 3475 For Windows* OS static library, the first mechanism used is to reclaim array 3476 entries for root threads that are already dead. 3477 3478 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3479 __kmp_root, with appropriate update to __kmp_threads_capacity. Array 3480 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3481 threadprivate cache array has been created. Synchronization with 3482 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 
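   As a worked example (hypothetical numbers): starting from __kmp_threads_capacity == 32
   with nNeed == 40, the doubling loop grows the capacity 32 -> 64 -> 128 and stops,
   producing 96 new free slots, assuming __kmp_sys_max_nth permits that much growth.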
3483 3484 After any dead root reclamation, if the clipping value allows array expansion 3485 to result in the generation of a total of nNeed free slots, the function does 3486 that expansion. If not, nothing is done beyond the possible initial root 3487 thread reclamation. 3488 3489 If any argument is negative, the behavior is undefined. */ 3490 static int __kmp_expand_threads(int nNeed) { 3491 int added = 0; 3492 int minimumRequiredCapacity; 3493 int newCapacity; 3494 kmp_info_t **newThreads; 3495 kmp_root_t **newRoot; 3496 3497 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3498 // resizing __kmp_threads does not need additional protection if foreign 3499 // threads are present 3500 3501 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3502 /* only for Windows static library */ 3503 /* reclaim array entries for root threads that are already dead */ 3504 added = __kmp_reclaim_dead_roots(); 3505 3506 if (nNeed) { 3507 nNeed -= added; 3508 if (nNeed < 0) 3509 nNeed = 0; 3510 } 3511 #endif 3512 if (nNeed <= 0) 3513 return added; 3514 3515 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3516 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3517 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3518 // > __kmp_max_nth in one of two ways: 3519 // 3520 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3521 // may not be reused by another thread, so we may need to increase 3522 // __kmp_threads_capacity to __kmp_max_nth + 1. 3523 // 3524 // 2) New foreign root(s) are encountered. We always register new foreign 3525 // roots. This may cause a smaller # of threads to be allocated at 3526 // subsequent parallel regions, but the worker threads hang around (and 3527 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3528 // 3529 // Anyway, that is the reason for moving the check to see if 3530 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3531 // instead of having it performed here. -BB 3532 3533 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3534 3535 /* compute expansion headroom to check if we can expand */ 3536 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3537 /* possible expansion too small -- give up */ 3538 return added; 3539 } 3540 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3541 3542 newCapacity = __kmp_threads_capacity; 3543 do { 3544 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3545 : __kmp_sys_max_nth; 3546 } while (newCapacity < minimumRequiredCapacity); 3547 newThreads = (kmp_info_t **)__kmp_allocate( 3548 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3549 newRoot = 3550 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3551 KMP_MEMCPY(newThreads, __kmp_threads, 3552 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3553 KMP_MEMCPY(newRoot, __kmp_root, 3554 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3555 3556 kmp_info_t **temp_threads = __kmp_threads; 3557 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3558 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3559 __kmp_free(temp_threads); 3560 added += newCapacity - __kmp_threads_capacity; 3561 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3562 3563 if (newCapacity > __kmp_tp_capacity) { 3564 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3565 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3566 __kmp_threadprivate_resize_cache(newCapacity); 3567 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3568 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3569 } 3570 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3571 } 3572 3573 return added; 3574 } 3575 3576 /* Register the current thread as a root thread and obtain our gtid. We must 3577 have the __kmp_initz_lock held at this point. Argument TRUE only if we are the 3578 thread that calls from __kmp_do_serial_initialize() */ 3579 int __kmp_register_root(int initial_thread) { 3580 kmp_info_t *root_thread; 3581 kmp_root_t *root; 3582 int gtid; 3583 int capacity; 3584 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3585 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3586 KMP_MB(); 3587 3588 /* 2007-03-02: 3589 If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an 3590 initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3591 work as expected -- it may return false (that means there is at least one 3592 empty slot in the __kmp_threads array), but it is possible the only free slot 3593 is #0, which is reserved for the initial thread and so cannot be used for this 3594 one. The following code works around this bug. 3595 3596 However, the right solution seems to be not reserving slot #0 for the initial 3597 thread because: 3598 (1) there is no magic in slot #0, 3599 (2) we cannot detect the initial thread reliably (the first thread which does 3600 serial initialization may not be a real initial thread). 3601 */ 3602 capacity = __kmp_threads_capacity; 3603 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3604 --capacity; 3605 } 3606 3607 /* see if there are too many threads */ 3608 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3609 if (__kmp_tp_cached) { 3610 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3611 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3612 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3613 } else { 3614 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3615 __kmp_msg_null); 3616 } 3617 } 3618 3619 /* find an available thread slot */ 3620 /* Don't reassign the zero slot since we need that to only be used by the initial 3621 thread */ 3622 for (gtid = (initial_thread ?
0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; 3623 gtid++) 3624 ; 3625 KA_TRACE(1, 3626 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3627 KMP_ASSERT(gtid < __kmp_threads_capacity); 3628 3629 /* update global accounting */ 3630 __kmp_all_nth++; 3631 TCW_4(__kmp_nth, __kmp_nth + 1); 3632 3633 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3634 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3635 if (__kmp_adjust_gtid_mode) { 3636 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3637 if (TCR_4(__kmp_gtid_mode) != 2) { 3638 TCW_4(__kmp_gtid_mode, 2); 3639 } 3640 } else { 3641 if (TCR_4(__kmp_gtid_mode) != 1) { 3642 TCW_4(__kmp_gtid_mode, 1); 3643 } 3644 } 3645 } 3646 3647 #ifdef KMP_ADJUST_BLOCKTIME 3648 /* Adjust blocktime to zero if necessary */ 3649 /* Middle initialization might not have occurred yet */ 3650 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3651 if (__kmp_nth > __kmp_avail_proc) { 3652 __kmp_zero_bt = TRUE; 3653 } 3654 } 3655 #endif /* KMP_ADJUST_BLOCKTIME */ 3656 3657 /* setup this new hierarchy */ 3658 if (!(root = __kmp_root[gtid])) { 3659 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3660 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3661 } 3662 3663 #if KMP_STATS_ENABLED 3664 // Initialize stats as soon as possible (right after gtid assignment). 3665 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3666 __kmp_stats_thread_ptr->startLife(); 3667 KMP_SET_THREAD_STATE(SERIAL_REGION); 3668 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3669 #endif 3670 __kmp_initialize_root(root); 3671 3672 /* setup new root thread structure */ 3673 if (root->r.r_uber_thread) { 3674 root_thread = root->r.r_uber_thread; 3675 } else { 3676 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3677 if (__kmp_storage_map) { 3678 __kmp_print_thread_storage_map(root_thread, gtid); 3679 } 3680 root_thread->th.th_info.ds.ds_gtid = gtid; 3681 #if OMPT_SUPPORT 3682 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3683 #endif 3684 root_thread->th.th_root = root; 3685 if (__kmp_env_consistency_check) { 3686 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3687 } 3688 #if USE_FAST_MEMORY 3689 __kmp_initialize_fast_memory(root_thread); 3690 #endif /* USE_FAST_MEMORY */ 3691 3692 #if KMP_USE_BGET 3693 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3694 __kmp_initialize_bget(root_thread); 3695 #endif 3696 __kmp_init_random(root_thread); // Initialize random number generator 3697 } 3698 3699 /* setup the serial team held in reserve by the root thread */ 3700 if (!root_thread->th.th_serial_team) { 3701 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3702 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3703 root_thread->th.th_serial_team = __kmp_allocate_team( 3704 root, 1, 1, 3705 #if OMPT_SUPPORT 3706 ompt_data_none, // root parallel id 3707 #endif 3708 proc_bind_default, &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3709 } 3710 KMP_ASSERT(root_thread->th.th_serial_team); 3711 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3712 root_thread->th.th_serial_team)); 3713 3714 /* drop root_thread into place */ 3715 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3716 3717 root->r.r_root_team->t.t_threads[0] = root_thread; 3718 root->r.r_hot_team->t.t_threads[0] = root_thread; 3719 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3720 // AC: the team created in reserve, not for execution (it is unused for now). 
3721 root_thread->th.th_serial_team->t.t_serialized = 0; 3722 root->r.r_uber_thread = root_thread; 3723 3724 /* initialize the thread, get it ready to go */ 3725 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3726 TCW_4(__kmp_init_gtid, TRUE); 3727 3728 /* prepare the master thread for get_gtid() */ 3729 __kmp_gtid_set_specific(gtid); 3730 3731 #if USE_ITT_BUILD 3732 __kmp_itt_thread_name(gtid); 3733 #endif /* USE_ITT_BUILD */ 3734 3735 #ifdef KMP_TDATA_GTID 3736 __kmp_gtid = gtid; 3737 #endif 3738 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3739 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3740 3741 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3742 "plain=%u\n", 3743 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3744 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3745 KMP_INIT_BARRIER_STATE)); 3746 { // Initialize barrier data. 3747 int b; 3748 for (b = 0; b < bs_last_barrier; ++b) { 3749 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3750 #if USE_DEBUGGER 3751 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3752 #endif 3753 } 3754 } 3755 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3756 KMP_INIT_BARRIER_STATE); 3757 3758 #if KMP_AFFINITY_SUPPORTED 3759 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3760 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3761 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3762 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3763 if (TCR_4(__kmp_init_middle)) { 3764 __kmp_affinity_set_init_mask(gtid, TRUE); 3765 } 3766 #endif /* KMP_AFFINITY_SUPPORTED */ 3767 root_thread->th.th_def_allocator = __kmp_def_allocator; 3768 root_thread->th.th_prev_level = 0; 3769 root_thread->th.th_prev_num_threads = 1; 3770 3771 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 3772 tmp->cg_root = root_thread; 3773 tmp->cg_thread_limit = __kmp_cg_max_nth; 3774 tmp->cg_nthreads = 1; 3775 KA_TRACE(100, ("__kmp_register_root: Thread %p created node %p with" 3776 " cg_nthreads init to 1\n", 3777 root_thread, tmp)); 3778 tmp->up = NULL; 3779 root_thread->th.th_cg_roots = tmp; 3780 3781 __kmp_root_counter++; 3782 3783 #if OMPT_SUPPORT 3784 if (!initial_thread && ompt_enabled.enabled) { 3785 3786 kmp_info_t *root_thread = ompt_get_thread(); 3787 3788 ompt_set_thread_state(root_thread, ompt_state_overhead); 3789 3790 if (ompt_enabled.ompt_callback_thread_begin) { 3791 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3792 ompt_thread_initial, __ompt_get_thread_data_internal()); 3793 } 3794 ompt_data_t *task_data; 3795 ompt_data_t *parallel_data; 3796 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL); 3797 if (ompt_enabled.ompt_callback_implicit_task) { 3798 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3799 ompt_scope_begin, parallel_data, task_data, 1, 1, ompt_task_initial); 3800 } 3801 3802 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3803 } 3804 #endif 3805 3806 KMP_MB(); 3807 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3808 3809 return gtid; 3810 } 3811 3812 #if KMP_NESTED_HOT_TEAMS 3813 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3814 const int max_level) { 3815 int i, n, nth; 3816 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3817 if (!hot_teams || !hot_teams[level].hot_team) { 3818 return 0; 3819 } 3820 KMP_DEBUG_ASSERT(level < max_level); 3821 kmp_team_t *team =
hot_teams[level].hot_team; 3822 nth = hot_teams[level].hot_team_nth; 3823 n = nth - 1; // master is not freed 3824 if (level < max_level - 1) { 3825 for (i = 0; i < nth; ++i) { 3826 kmp_info_t *th = team->t.t_threads[i]; 3827 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3828 if (i > 0 && th->th.th_hot_teams) { 3829 __kmp_free(th->th.th_hot_teams); 3830 th->th.th_hot_teams = NULL; 3831 } 3832 } 3833 } 3834 __kmp_free_team(root, team, NULL); 3835 return n; 3836 } 3837 #endif 3838 3839 // Resets a root thread and clears its root and hot teams. 3840 // Returns the number of __kmp_threads entries directly and indirectly freed. 3841 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3842 kmp_team_t *root_team = root->r.r_root_team; 3843 kmp_team_t *hot_team = root->r.r_hot_team; 3844 int n = hot_team->t.t_nproc; 3845 int i; 3846 3847 KMP_DEBUG_ASSERT(!root->r.r_active); 3848 3849 root->r.r_root_team = NULL; 3850 root->r.r_hot_team = NULL; 3851 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3852 // before the call to __kmp_free_team(). 3853 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3854 #if KMP_NESTED_HOT_TEAMS 3855 if (__kmp_hot_teams_max_level > 3856 0) { // need to free nested hot teams and their threads if any 3857 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3858 kmp_info_t *th = hot_team->t.t_threads[i]; 3859 if (__kmp_hot_teams_max_level > 1) { 3860 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3861 } 3862 if (th->th.th_hot_teams) { 3863 __kmp_free(th->th.th_hot_teams); 3864 th->th.th_hot_teams = NULL; 3865 } 3866 } 3867 } 3868 #endif 3869 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3870 3871 // Before we can reap the thread, we need to make certain that all other 3872 // threads in the teams that had this root as ancestor have stopped trying to 3873 // steal tasks. 3874 if (__kmp_tasking_mode != tskm_immediate_exec) { 3875 __kmp_wait_to_unref_task_teams(); 3876 } 3877 3878 #if KMP_OS_WINDOWS 3879 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3880 KA_TRACE( 3881 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3882 "\n", 3883 (LPVOID) & (root->r.r_uber_thread->th), 3884 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3885 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3886 #endif /* KMP_OS_WINDOWS */ 3887 3888 #if OMPT_SUPPORT 3889 ompt_data_t *task_data; 3890 ompt_data_t *parallel_data; 3891 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, &parallel_data, NULL); 3892 if (ompt_enabled.ompt_callback_implicit_task) { 3893 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 3894 ompt_scope_end, parallel_data, task_data, 0, 1, ompt_task_initial); 3895 } 3896 if (ompt_enabled.ompt_callback_thread_end) { 3897 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3898 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3899 } 3900 #endif 3901 3902 TCW_4(__kmp_nth, 3903 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth.
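// Drop the uber thread's reference on its contention-group root; if this was the last
// reference, the kmp_cg_root_t node is freed below before the uber thread is reaped.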
3904 i = root->r.r_uber_thread->th.th_cg_roots->cg_nthreads--; 3905 KA_TRACE(100, ("__kmp_reset_root: Thread %p decrement cg_nthreads on node %p" 3906 " to %d\n", 3907 root->r.r_uber_thread, root->r.r_uber_thread->th.th_cg_roots, 3908 root->r.r_uber_thread->th.th_cg_roots->cg_nthreads)); 3909 if (i == 1) { 3910 // need to free contention group structure 3911 KMP_DEBUG_ASSERT(root->r.r_uber_thread == 3912 root->r.r_uber_thread->th.th_cg_roots->cg_root); 3913 KMP_DEBUG_ASSERT(root->r.r_uber_thread->th.th_cg_roots->up == NULL); 3914 __kmp_free(root->r.r_uber_thread->th.th_cg_roots); 3915 root->r.r_uber_thread->th.th_cg_roots = NULL; 3916 } 3917 __kmp_reap_thread(root->r.r_uber_thread, 1); 3918 3919 // We cannot put the root thread into __kmp_thread_pool, so we have to reap it 3920 // instead of freeing it. 3921 root->r.r_uber_thread = NULL; 3922 /* mark root as no longer in use */ 3923 root->r.r_begin = FALSE; 3924 3925 return n; 3926 } 3927 3928 void __kmp_unregister_root_current_thread(int gtid) { 3929 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3930 /* this lock should be ok, since unregister_root_current_thread is never 3931 called during an abort, only during a normal close. furthermore, if you 3932 have the forkjoin lock, you should never try to get the initz lock */ 3933 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3934 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3935 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3936 "exiting T#%d\n", 3937 gtid)); 3938 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3939 return; 3940 } 3941 kmp_root_t *root = __kmp_root[gtid]; 3942 3943 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3944 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3945 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3946 KMP_ASSERT(root->r.r_active == FALSE); 3947 3948 KMP_MB(); 3949 3950 kmp_info_t *thread = __kmp_threads[gtid]; 3951 kmp_team_t *team = thread->th.th_team; 3952 kmp_task_team_t *task_team = thread->th.th_task_team; 3953 3954 // we need to wait for the proxy tasks before finishing the thread 3955 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3956 #if OMPT_SUPPORT 3957 // the runtime is shutting down so we won't report any events 3958 thread->th.ompt_thread_info.state = ompt_state_undefined; 3959 #endif 3960 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3961 } 3962 3963 __kmp_reset_root(gtid, root); 3964 3965 /* free up this thread slot */ 3966 __kmp_gtid_set_specific(KMP_GTID_DNE); 3967 #ifdef KMP_TDATA_GTID 3968 __kmp_gtid = KMP_GTID_DNE; 3969 #endif 3970 3971 KMP_MB(); 3972 KC_TRACE(10, 3973 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3974 3975 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3976 } 3977 3978 #if KMP_OS_WINDOWS 3979 /* __kmp_forkjoin_lock must already be held. 3980 Unregisters a root thread that is not the current thread. Returns the number 3981 of __kmp_threads entries freed as a result.
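   On Windows static builds it is called from __kmp_reclaim_dead_roots() to recycle the
   slots of root threads whose OS thread has already terminated.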
*/ 3982 static int __kmp_unregister_root_other_thread(int gtid) { 3983 kmp_root_t *root = __kmp_root[gtid]; 3984 int r; 3985 3986 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 3987 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3988 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3989 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3990 KMP_ASSERT(root->r.r_active == FALSE); 3991 3992 r = __kmp_reset_root(gtid, root); 3993 KC_TRACE(10, 3994 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 3995 return r; 3996 } 3997 #endif 3998 3999 #if KMP_DEBUG 4000 void __kmp_task_info() { 4001 4002 kmp_int32 gtid = __kmp_entry_gtid(); 4003 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4004 kmp_info_t *this_thr = __kmp_threads[gtid]; 4005 kmp_team_t *steam = this_thr->th.th_serial_team; 4006 kmp_team_t *team = this_thr->th.th_team; 4007 4008 __kmp_printf( 4009 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4010 "ptask=%p\n", 4011 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4012 team->t.t_implicit_task_taskdata[tid].td_parent); 4013 } 4014 #endif // KMP_DEBUG 4015 4016 /* TODO optimize with one big memclr, take out what isn't needed, split 4017 responsibility to workers as much as possible, and delay initialization of 4018 features as much as possible */ 4019 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4020 int tid, int gtid) { 4021 /* this_thr->th.th_info.ds.ds_gtid is setup in 4022 kmp_allocate_thread/create_worker. 4023 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4024 kmp_info_t *master = team->t.t_threads[0]; 4025 KMP_DEBUG_ASSERT(this_thr != NULL); 4026 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4027 KMP_DEBUG_ASSERT(team); 4028 KMP_DEBUG_ASSERT(team->t.t_threads); 4029 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4030 KMP_DEBUG_ASSERT(master); 4031 KMP_DEBUG_ASSERT(master->th.th_root); 4032 4033 KMP_MB(); 4034 4035 TCW_SYNC_PTR(this_thr->th.th_team, team); 4036 4037 this_thr->th.th_info.ds.ds_tid = tid; 4038 this_thr->th.th_set_nproc = 0; 4039 if (__kmp_tasking_mode != tskm_immediate_exec) 4040 // When tasking is possible, threads are not safe to reap until they are 4041 // done tasking; this will be set when tasking code is exited in wait 4042 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4043 else // no tasking --> always safe to reap 4044 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4045 this_thr->th.th_set_proc_bind = proc_bind_default; 4046 #if KMP_AFFINITY_SUPPORTED 4047 this_thr->th.th_new_place = this_thr->th.th_current_place; 4048 #endif 4049 this_thr->th.th_root = master->th.th_root; 4050 4051 /* setup the thread's cache of the team structure */ 4052 this_thr->th.th_team_nproc = team->t.t_nproc; 4053 this_thr->th.th_team_master = master; 4054 this_thr->th.th_team_serialized = team->t.t_serialized; 4055 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4056 4057 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4058 4059 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4060 tid, gtid, this_thr, this_thr->th.th_current_task)); 4061 4062 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4063 team, tid, TRUE); 4064 4065 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4066 tid, gtid, this_thr, this_thr->th.th_current_task)); 4067 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4068 // __kmp_initialize_team()? 
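// The remainder of this routine wires up the per-thread state that hangs off the team:
// the dispatch slot, the threadprivate common table, the contention-group root (shared
// with the master), the per-thread dispatch buffers, and the task-state memo stack.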
4069 4070 /* TODO no worksharing in speculative threads */ 4071 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4072 4073 this_thr->th.th_local.this_construct = 0; 4074 4075 if (!this_thr->th.th_pri_common) { 4076 this_thr->th.th_pri_common = 4077 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4078 if (__kmp_storage_map) { 4079 __kmp_print_storage_map_gtid( 4080 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4081 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4082 } 4083 this_thr->th.th_pri_head = NULL; 4084 } 4085 4086 if (this_thr != master && // Master's CG root is initialized elsewhere 4087 this_thr->th.th_cg_roots != master->th.th_cg_roots) { // CG root not set 4088 // Make new thread's CG root same as master's 4089 KMP_DEBUG_ASSERT(master->th.th_cg_roots); 4090 kmp_cg_root_t *tmp = this_thr->th.th_cg_roots; 4091 if (tmp) { 4092 // worker changes CG, need to check if old CG should be freed 4093 int i = tmp->cg_nthreads--; 4094 KA_TRACE(100, ("__kmp_initialize_info: Thread %p decrement cg_nthreads" 4095 " on node %p of thread %p to %d\n", 4096 this_thr, tmp, tmp->cg_root, tmp->cg_nthreads)); 4097 if (i == 1) { 4098 __kmp_free(tmp); // last thread left CG --> free it 4099 } 4100 } 4101 this_thr->th.th_cg_roots = master->th.th_cg_roots; 4102 // Increment new thread's CG root's counter to add the new thread 4103 this_thr->th.th_cg_roots->cg_nthreads++; 4104 KA_TRACE(100, ("__kmp_initialize_info: Thread %p increment cg_nthreads on" 4105 " node %p of thread %p to %d\n", 4106 this_thr, this_thr->th.th_cg_roots, 4107 this_thr->th.th_cg_roots->cg_root, 4108 this_thr->th.th_cg_roots->cg_nthreads)); 4109 this_thr->th.th_current_task->td_icvs.thread_limit = 4110 this_thr->th.th_cg_roots->cg_thread_limit; 4111 } 4112 4113 /* Initialize dynamic dispatch */ 4114 { 4115 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4116 // Use team max_nproc since this will never change for the team. 4117 size_t disp_size = 4118 sizeof(dispatch_private_info_t) * 4119 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4120 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4121 team->t.t_max_nproc)); 4122 KMP_ASSERT(dispatch); 4123 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4124 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4125 4126 dispatch->th_disp_index = 0; 4127 dispatch->th_doacross_buf_idx = 0; 4128 if (!dispatch->th_disp_buffer) { 4129 dispatch->th_disp_buffer = 4130 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4131 4132 if (__kmp_storage_map) { 4133 __kmp_print_storage_map_gtid( 4134 gtid, &dispatch->th_disp_buffer[0], 4135 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4136 ? 
1 4137 : __kmp_dispatch_num_buffers], 4138 disp_size, "th_%d.th_dispatch.th_disp_buffer " 4139 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4140 gtid, team->t.t_id, gtid); 4141 } 4142 } else { 4143 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4144 } 4145 4146 dispatch->th_dispatch_pr_current = 0; 4147 dispatch->th_dispatch_sh_current = 0; 4148 4149 dispatch->th_deo_fcn = 0; /* ORDERED */ 4150 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4151 } 4152 4153 this_thr->th.th_next_pool = NULL; 4154 4155 if (!this_thr->th.th_task_state_memo_stack) { 4156 size_t i; 4157 this_thr->th.th_task_state_memo_stack = 4158 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4159 this_thr->th.th_task_state_top = 0; 4160 this_thr->th.th_task_state_stack_sz = 4; 4161 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4162 ++i) // zero init the stack 4163 this_thr->th.th_task_state_memo_stack[i] = 0; 4164 } 4165 4166 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4167 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4168 4169 KMP_MB(); 4170 } 4171 4172 /* allocate a new thread for the requesting team. this is only called from 4173 within a forkjoin critical section. we will first try to get an available 4174 thread from the thread pool. if none is available, we will fork a new one 4175 assuming we are able to create a new one. this should be assured, as the 4176 caller should check on this first. */ 4177 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4178 int new_tid) { 4179 kmp_team_t *serial_team; 4180 kmp_info_t *new_thr; 4181 int new_gtid; 4182 4183 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4184 KMP_DEBUG_ASSERT(root && team); 4185 #if !KMP_NESTED_HOT_TEAMS 4186 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4187 #endif 4188 KMP_MB(); 4189 4190 /* first, try to get one from the thread pool */ 4191 if (__kmp_thread_pool) { 4192 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4193 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4194 if (new_thr == __kmp_thread_pool_insert_pt) { 4195 __kmp_thread_pool_insert_pt = NULL; 4196 } 4197 TCW_4(new_thr->th.th_in_pool, FALSE); 4198 __kmp_suspend_initialize_thread(new_thr); 4199 __kmp_lock_suspend_mx(new_thr); 4200 if (new_thr->th.th_active_in_pool == TRUE) { 4201 KMP_DEBUG_ASSERT(new_thr->th.th_active == TRUE); 4202 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 4203 new_thr->th.th_active_in_pool = FALSE; 4204 } 4205 __kmp_unlock_suspend_mx(new_thr); 4206 4207 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4208 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4209 KMP_ASSERT(!new_thr->th.th_team); 4210 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4211 4212 /* setup the thread structure */ 4213 __kmp_initialize_info(new_thr, team, new_tid, 4214 new_thr->th.th_info.ds.ds_gtid); 4215 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4216 4217 TCW_4(__kmp_nth, __kmp_nth + 1); 4218 4219 new_thr->th.th_task_state = 0; 4220 new_thr->th.th_task_state_top = 0; 4221 new_thr->th.th_task_state_stack_sz = 4; 4222 4223 #ifdef KMP_ADJUST_BLOCKTIME 4224 /* Adjust blocktime back to zero if necessary */ 4225 /* Middle initialization might not have occurred yet */ 4226 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4227 if (__kmp_nth > __kmp_avail_proc) { 4228 __kmp_zero_bt = TRUE; 4229 } 4230 } 4231 #endif /* KMP_ADJUST_BLOCKTIME */ 4232 4233 #if KMP_DEBUG 4234 // If thread entered pool via __kmp_free_thread, wait_flag should != 4235 // KMP_BARRIER_PARENT_FLAG. 
4236 int b; 4237 kmp_balign_t *balign = new_thr->th.th_bar; 4238 for (b = 0; b < bs_last_barrier; ++b) 4239 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4240 #endif 4241 4242 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4243 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4244 4245 KMP_MB(); 4246 return new_thr; 4247 } 4248 4249 /* no, well fork a new one */ 4250 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4251 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4252 4253 #if KMP_USE_MONITOR 4254 // If this is the first worker thread the RTL is creating, then also 4255 // launch the monitor thread. We try to do this as early as possible. 4256 if (!TCR_4(__kmp_init_monitor)) { 4257 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4258 if (!TCR_4(__kmp_init_monitor)) { 4259 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4260 TCW_4(__kmp_init_monitor, 1); 4261 __kmp_create_monitor(&__kmp_monitor); 4262 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4263 #if KMP_OS_WINDOWS 4264 // AC: wait until monitor has started. This is a fix for CQ232808. 4265 // The reason is that if the library is loaded/unloaded in a loop with 4266 // small (parallel) work in between, then there is high probability that 4267 // monitor thread started after the library shutdown. At shutdown it is 4268 // too late to cope with the problem, because when the master is in 4269 // DllMain (process detach) the monitor has no chances to start (it is 4270 // blocked), and master has no means to inform the monitor that the 4271 // library has gone, because all the memory which the monitor can access 4272 // is going to be released/reset. 4273 while (TCR_4(__kmp_init_monitor) < 2) { 4274 KMP_YIELD(TRUE); 4275 } 4276 KF_TRACE(10, ("after monitor thread has started\n")); 4277 #endif 4278 } 4279 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4280 } 4281 #endif 4282 4283 KMP_MB(); 4284 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) { 4285 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4286 } 4287 4288 /* allocate space for it. 
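   The freshly allocated kmp_info_t is published into the __kmp_threads slot chosen
   above; its fields are initialized below.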
*/ 4289 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4290 4291 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4292 4293 #if USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG 4294 // suppress race conditions detection on synchronization flags in debug mode 4295 // this helps to analyze library internals eliminating false positives 4296 __itt_suppress_mark_range( 4297 __itt_suppress_range, __itt_suppress_threading_errors, 4298 &new_thr->th.th_sleep_loc, sizeof(new_thr->th.th_sleep_loc)); 4299 __itt_suppress_mark_range( 4300 __itt_suppress_range, __itt_suppress_threading_errors, 4301 &new_thr->th.th_reap_state, sizeof(new_thr->th.th_reap_state)); 4302 #if KMP_OS_WINDOWS 4303 __itt_suppress_mark_range( 4304 __itt_suppress_range, __itt_suppress_threading_errors, 4305 &new_thr->th.th_suspend_init, sizeof(new_thr->th.th_suspend_init)); 4306 #else 4307 __itt_suppress_mark_range(__itt_suppress_range, 4308 __itt_suppress_threading_errors, 4309 &new_thr->th.th_suspend_init_count, 4310 sizeof(new_thr->th.th_suspend_init_count)); 4311 #endif 4312 // TODO: check if we need to also suppress b_arrived flags 4313 __itt_suppress_mark_range(__itt_suppress_range, 4314 __itt_suppress_threading_errors, 4315 CCAST(kmp_uint64 *, &new_thr->th.th_bar[0].bb.b_go), 4316 sizeof(new_thr->th.th_bar[0].bb.b_go)); 4317 __itt_suppress_mark_range(__itt_suppress_range, 4318 __itt_suppress_threading_errors, 4319 CCAST(kmp_uint64 *, &new_thr->th.th_bar[1].bb.b_go), 4320 sizeof(new_thr->th.th_bar[1].bb.b_go)); 4321 __itt_suppress_mark_range(__itt_suppress_range, 4322 __itt_suppress_threading_errors, 4323 CCAST(kmp_uint64 *, &new_thr->th.th_bar[2].bb.b_go), 4324 sizeof(new_thr->th.th_bar[2].bb.b_go)); 4325 #endif /* USE_ITT_BUILD && USE_ITT_NOTIFY && KMP_DEBUG */ 4326 if (__kmp_storage_map) { 4327 __kmp_print_thread_storage_map(new_thr, new_gtid); 4328 } 4329 4330 // add the reserve serialized team, initialized from the team's master thread 4331 { 4332 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4333 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4334 new_thr->th.th_serial_team = serial_team = 4335 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4336 #if OMPT_SUPPORT 4337 ompt_data_none, // root parallel id 4338 #endif 4339 proc_bind_default, &r_icvs, 4340 0 USE_NESTED_HOT_ARG(NULL)); 4341 } 4342 KMP_ASSERT(serial_team); 4343 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4344 // execution (it is unused for now). 
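// From here the new thread is set up much as a pooled thread would be: team info caches,
// fast memory and bget allocators, the random-number generator, barrier flags, affinity
// placeholders, and finally the OS worker thread is created.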
4345 serial_team->t.t_threads[0] = new_thr; 4346 KF_TRACE(10, 4347 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4348 new_thr)); 4349 4350 /* setup the thread structures */ 4351 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4352 4353 #if USE_FAST_MEMORY 4354 __kmp_initialize_fast_memory(new_thr); 4355 #endif /* USE_FAST_MEMORY */ 4356 4357 #if KMP_USE_BGET 4358 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4359 __kmp_initialize_bget(new_thr); 4360 #endif 4361 4362 __kmp_init_random(new_thr); // Initialize random number generator 4363 4364 /* Initialize these only once when thread is grabbed for a team allocation */ 4365 KA_TRACE(20, 4366 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4367 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4368 4369 int b; 4370 kmp_balign_t *balign = new_thr->th.th_bar; 4371 for (b = 0; b < bs_last_barrier; ++b) { 4372 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4373 balign[b].bb.team = NULL; 4374 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4375 balign[b].bb.use_oncore_barrier = 0; 4376 } 4377 4378 new_thr->th.th_spin_here = FALSE; 4379 new_thr->th.th_next_waiting = 0; 4380 #if KMP_OS_UNIX 4381 new_thr->th.th_blocking = false; 4382 #endif 4383 4384 #if KMP_AFFINITY_SUPPORTED 4385 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4386 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4387 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4388 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4389 #endif 4390 new_thr->th.th_def_allocator = __kmp_def_allocator; 4391 new_thr->th.th_prev_level = 0; 4392 new_thr->th.th_prev_num_threads = 1; 4393 4394 TCW_4(new_thr->th.th_in_pool, FALSE); 4395 new_thr->th.th_active_in_pool = FALSE; 4396 TCW_4(new_thr->th.th_active, TRUE); 4397 4398 /* adjust the global counters */ 4399 __kmp_all_nth++; 4400 __kmp_nth++; 4401 4402 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4403 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4404 if (__kmp_adjust_gtid_mode) { 4405 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4406 if (TCR_4(__kmp_gtid_mode) != 2) { 4407 TCW_4(__kmp_gtid_mode, 2); 4408 } 4409 } else { 4410 if (TCR_4(__kmp_gtid_mode) != 1) { 4411 TCW_4(__kmp_gtid_mode, 1); 4412 } 4413 } 4414 } 4415 4416 #ifdef KMP_ADJUST_BLOCKTIME 4417 /* Adjust blocktime back to zero if necessary */ 4418 /* Middle initialization might not have occurred yet */ 4419 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4420 if (__kmp_nth > __kmp_avail_proc) { 4421 __kmp_zero_bt = TRUE; 4422 } 4423 } 4424 #endif /* KMP_ADJUST_BLOCKTIME */ 4425 4426 /* actually fork it and create the new worker thread */ 4427 KF_TRACE( 4428 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4429 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4430 KF_TRACE(10, 4431 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4432 4433 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4434 new_gtid)); 4435 KMP_MB(); 4436 return new_thr; 4437 } 4438 4439 /* Reinitialize team for reuse. 4440 The hot team code calls this case at every fork barrier, so EPCC barrier 4441 test are extremely sensitive to changes in it, esp. writes to the team 4442 struct, which cause a cache invalidation in all threads. 4443 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ 4444 static void __kmp_reinitialize_team(kmp_team_t *team, 4445 kmp_internal_control_t *new_icvs, 4446 ident_t *loc) { 4447 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4448 team->t.t_threads[0], team)); 4449 KMP_DEBUG_ASSERT(team && new_icvs); 4450 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4451 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4452 4453 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4454 // Copy ICVs to the master thread's implicit taskdata 4455 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4456 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4457 4458 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4459 team->t.t_threads[0], team)); 4460 } 4461 4462 /* Initialize the team data structure. 4463 This assumes the t_threads and t_max_nproc are already set. 4464 Also, we don't touch the arguments */ 4465 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4466 kmp_internal_control_t *new_icvs, 4467 ident_t *loc) { 4468 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4469 4470 /* verify */ 4471 KMP_DEBUG_ASSERT(team); 4472 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4473 KMP_DEBUG_ASSERT(team->t.t_threads); 4474 KMP_MB(); 4475 4476 team->t.t_master_tid = 0; /* not needed */ 4477 /* team->t.t_master_bar; not needed */ 4478 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4479 team->t.t_nproc = new_nproc; 4480 4481 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4482 team->t.t_next_pool = NULL; 4483 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4484 * up hot team */ 4485 4486 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4487 team->t.t_invoke = NULL; /* not needed */ 4488 4489 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4490 team->t.t_sched.sched = new_icvs->sched.sched; 4491 4492 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4493 team->t.t_fp_control_saved = FALSE; /* not needed */ 4494 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4495 team->t.t_mxcsr = 0; /* not needed */ 4496 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4497 4498 team->t.t_construct = 0; 4499 4500 team->t.t_ordered.dt.t_value = 0; 4501 team->t.t_master_active = FALSE; 4502 4503 #ifdef KMP_DEBUG 4504 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4505 #endif 4506 #if KMP_OS_WINDOWS 4507 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4508 #endif 4509 4510 team->t.t_control_stack_top = NULL; 4511 4512 __kmp_reinitialize_team(team, new_icvs, loc); 4513 4514 KMP_MB(); 4515 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4516 } 4517 4518 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 4519 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4520 static void 4521 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4522 if (KMP_AFFINITY_CAPABLE()) { 4523 int status; 4524 if (old_mask != NULL) { 4525 status = __kmp_get_system_affinity(old_mask, TRUE); 4526 int error = errno; 4527 if (status != 0) { 4528 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4529 __kmp_msg_null); 4530 } 4531 } 4532 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4533 } 4534 } 4535 #endif 4536 4537 #if KMP_AFFINITY_SUPPORTED 4538 4539 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
4540 // It calculates the worker + master thread's partition based upon the parent 4541 // thread's partition, and binds each worker to a thread in their partition. 4542 // The master thread's partition should already include its current binding. 4543 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4544 // Copy the master thread's place partition to the team struct 4545 kmp_info_t *master_th = team->t.t_threads[0]; 4546 KMP_DEBUG_ASSERT(master_th != NULL); 4547 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4548 int first_place = master_th->th.th_first_place; 4549 int last_place = master_th->th.th_last_place; 4550 int masters_place = master_th->th.th_current_place; 4551 team->t.t_first_place = first_place; 4552 team->t.t_last_place = last_place; 4553 4554 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4555 "bound to place %d partition = [%d,%d]\n", 4556 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4557 team->t.t_id, masters_place, first_place, last_place)); 4558 4559 switch (proc_bind) { 4560 4561 case proc_bind_default: 4562 // serial teams might have the proc_bind policy set to proc_bind_default. It 4563 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4564 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4565 break; 4566 4567 case proc_bind_master: { 4568 int f; 4569 int n_th = team->t.t_nproc; 4570 for (f = 1; f < n_th; f++) { 4571 kmp_info_t *th = team->t.t_threads[f]; 4572 KMP_DEBUG_ASSERT(th != NULL); 4573 th->th.th_first_place = first_place; 4574 th->th.th_last_place = last_place; 4575 th->th.th_new_place = masters_place; 4576 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4577 team->t.t_display_affinity != 1) { 4578 team->t.t_display_affinity = 1; 4579 } 4580 4581 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4582 "partition = [%d,%d]\n", 4583 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4584 f, masters_place, first_place, last_place)); 4585 } 4586 } break; 4587 4588 case proc_bind_close: { 4589 int f; 4590 int n_th = team->t.t_nproc; 4591 int n_places; 4592 if (first_place <= last_place) { 4593 n_places = last_place - first_place + 1; 4594 } else { 4595 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4596 } 4597 if (n_th <= n_places) { 4598 int place = masters_place; 4599 for (f = 1; f < n_th; f++) { 4600 kmp_info_t *th = team->t.t_threads[f]; 4601 KMP_DEBUG_ASSERT(th != NULL); 4602 4603 if (place == last_place) { 4604 place = first_place; 4605 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4606 place = 0; 4607 } else { 4608 place++; 4609 } 4610 th->th.th_first_place = first_place; 4611 th->th.th_last_place = last_place; 4612 th->th.th_new_place = place; 4613 if (__kmp_display_affinity && place != th->th.th_current_place && 4614 team->t.t_display_affinity != 1) { 4615 team->t.t_display_affinity = 1; 4616 } 4617 4618 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4619 "partition = [%d,%d]\n", 4620 __kmp_gtid_from_thread(team->t.t_threads[f]), 4621 team->t.t_id, f, place, first_place, last_place)); 4622 } 4623 } else { 4624 int S, rem, gap, s_count; 4625 S = n_th / n_places; 4626 s_count = 0; 4627 rem = n_th - (S * n_places); 4628 gap = rem > 0 ? 
n_places / rem : n_places; 4629 int place = masters_place; 4630 int gap_ct = gap; 4631 for (f = 0; f < n_th; f++) { 4632 kmp_info_t *th = team->t.t_threads[f]; 4633 KMP_DEBUG_ASSERT(th != NULL); 4634 4635 th->th.th_first_place = first_place; 4636 th->th.th_last_place = last_place; 4637 th->th.th_new_place = place; 4638 if (__kmp_display_affinity && place != th->th.th_current_place && 4639 team->t.t_display_affinity != 1) { 4640 team->t.t_display_affinity = 1; 4641 } 4642 s_count++; 4643 4644 if ((s_count == S) && rem && (gap_ct == gap)) { 4645 // do nothing, add an extra thread to place on next iteration 4646 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4647 // we added an extra thread to this place; move to next place 4648 if (place == last_place) { 4649 place = first_place; 4650 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4651 place = 0; 4652 } else { 4653 place++; 4654 } 4655 s_count = 0; 4656 gap_ct = 1; 4657 rem--; 4658 } else if (s_count == S) { // place full; don't add extra 4659 if (place == last_place) { 4660 place = first_place; 4661 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4662 place = 0; 4663 } else { 4664 place++; 4665 } 4666 gap_ct++; 4667 s_count = 0; 4668 } 4669 4670 KA_TRACE(100, 4671 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4672 "partition = [%d,%d]\n", 4673 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4674 th->th.th_new_place, first_place, last_place)); 4675 } 4676 KMP_DEBUG_ASSERT(place == masters_place); 4677 } 4678 } break; 4679 4680 case proc_bind_spread: { 4681 int f; 4682 int n_th = team->t.t_nproc; 4683 int n_places; 4684 int thidx; 4685 if (first_place <= last_place) { 4686 n_places = last_place - first_place + 1; 4687 } else { 4688 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4689 } 4690 if (n_th <= n_places) { 4691 int place = -1; 4692 4693 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4694 int S = n_places / n_th; 4695 int s_count, rem, gap, gap_ct; 4696 4697 place = masters_place; 4698 rem = n_places - n_th * S; 4699 gap = rem ? 
n_th / rem : 1; 4700 gap_ct = gap; 4701 thidx = n_th; 4702 if (update_master_only == 1) 4703 thidx = 1; 4704 for (f = 0; f < thidx; f++) { 4705 kmp_info_t *th = team->t.t_threads[f]; 4706 KMP_DEBUG_ASSERT(th != NULL); 4707 4708 th->th.th_first_place = place; 4709 th->th.th_new_place = place; 4710 if (__kmp_display_affinity && place != th->th.th_current_place && 4711 team->t.t_display_affinity != 1) { 4712 team->t.t_display_affinity = 1; 4713 } 4714 s_count = 1; 4715 while (s_count < S) { 4716 if (place == last_place) { 4717 place = first_place; 4718 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4719 place = 0; 4720 } else { 4721 place++; 4722 } 4723 s_count++; 4724 } 4725 if (rem && (gap_ct == gap)) { 4726 if (place == last_place) { 4727 place = first_place; 4728 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4729 place = 0; 4730 } else { 4731 place++; 4732 } 4733 rem--; 4734 gap_ct = 0; 4735 } 4736 th->th.th_last_place = place; 4737 gap_ct++; 4738 4739 if (place == last_place) { 4740 place = first_place; 4741 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4742 place = 0; 4743 } else { 4744 place++; 4745 } 4746 4747 KA_TRACE(100, 4748 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4749 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4750 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4751 f, th->th.th_new_place, th->th.th_first_place, 4752 th->th.th_last_place, __kmp_affinity_num_masks)); 4753 } 4754 } else { 4755 /* Having uniform space of available computation places I can create 4756 T partitions of round(P/T) size and put threads into the first 4757 place of each partition. */ 4758 double current = static_cast<double>(masters_place); 4759 double spacing = 4760 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4761 int first, last; 4762 kmp_info_t *th; 4763 4764 thidx = n_th + 1; 4765 if (update_master_only == 1) 4766 thidx = 1; 4767 for (f = 0; f < thidx; f++) { 4768 first = static_cast<int>(current); 4769 last = static_cast<int>(current + spacing) - 1; 4770 KMP_DEBUG_ASSERT(last >= first); 4771 if (first >= n_places) { 4772 if (masters_place) { 4773 first -= n_places; 4774 last -= n_places; 4775 if (first == (masters_place + 1)) { 4776 KMP_DEBUG_ASSERT(f == n_th); 4777 first--; 4778 } 4779 if (last == masters_place) { 4780 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4781 last--; 4782 } 4783 } else { 4784 KMP_DEBUG_ASSERT(f == n_th); 4785 first = 0; 4786 last = 0; 4787 } 4788 } 4789 if (last >= n_places) { 4790 last = (n_places - 1); 4791 } 4792 place = first; 4793 current += spacing; 4794 if (f < n_th) { 4795 KMP_DEBUG_ASSERT(0 <= first); 4796 KMP_DEBUG_ASSERT(n_places > first); 4797 KMP_DEBUG_ASSERT(0 <= last); 4798 KMP_DEBUG_ASSERT(n_places > last); 4799 KMP_DEBUG_ASSERT(last_place >= first_place); 4800 th = team->t.t_threads[f]; 4801 KMP_DEBUG_ASSERT(th); 4802 th->th.th_first_place = first; 4803 th->th.th_new_place = place; 4804 th->th.th_last_place = last; 4805 if (__kmp_display_affinity && place != th->th.th_current_place && 4806 team->t.t_display_affinity != 1) { 4807 team->t.t_display_affinity = 1; 4808 } 4809 KA_TRACE(100, 4810 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4811 "partition = [%d,%d], spacing = %.4f\n", 4812 __kmp_gtid_from_thread(team->t.t_threads[f]), 4813 team->t.t_id, f, th->th.th_new_place, 4814 th->th.th_first_place, th->th.th_last_place, spacing)); 4815 } 4816 } 4817 } 4818 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4819 } else { 4820 int S, rem, gap, 
s_count; 4821 S = n_th / n_places; 4822 s_count = 0; 4823 rem = n_th - (S * n_places); 4824 gap = rem > 0 ? n_places / rem : n_places; 4825 int place = masters_place; 4826 int gap_ct = gap; 4827 thidx = n_th; 4828 if (update_master_only == 1) 4829 thidx = 1; 4830 for (f = 0; f < thidx; f++) { 4831 kmp_info_t *th = team->t.t_threads[f]; 4832 KMP_DEBUG_ASSERT(th != NULL); 4833 4834 th->th.th_first_place = place; 4835 th->th.th_last_place = place; 4836 th->th.th_new_place = place; 4837 if (__kmp_display_affinity && place != th->th.th_current_place && 4838 team->t.t_display_affinity != 1) { 4839 team->t.t_display_affinity = 1; 4840 } 4841 s_count++; 4842 4843 if ((s_count == S) && rem && (gap_ct == gap)) { 4844 // do nothing, add an extra thread to place on next iteration 4845 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4846 // we added an extra thread to this place; move on to next place 4847 if (place == last_place) { 4848 place = first_place; 4849 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4850 place = 0; 4851 } else { 4852 place++; 4853 } 4854 s_count = 0; 4855 gap_ct = 1; 4856 rem--; 4857 } else if (s_count == S) { // place is full; don't add extra thread 4858 if (place == last_place) { 4859 place = first_place; 4860 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4861 place = 0; 4862 } else { 4863 place++; 4864 } 4865 gap_ct++; 4866 s_count = 0; 4867 } 4868 4869 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4870 "partition = [%d,%d]\n", 4871 __kmp_gtid_from_thread(team->t.t_threads[f]), 4872 team->t.t_id, f, th->th.th_new_place, 4873 th->th.th_first_place, th->th.th_last_place)); 4874 } 4875 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4876 } 4877 } break; 4878 4879 default: 4880 break; 4881 } 4882 4883 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4884 } 4885 4886 #endif // KMP_AFFINITY_SUPPORTED 4887 4888 /* allocate a new team data structure to use. take one off of the free pool if 4889 available */ 4890 kmp_team_t * 4891 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4892 #if OMPT_SUPPORT 4893 ompt_data_t ompt_parallel_data, 4894 #endif 4895 kmp_proc_bind_t new_proc_bind, 4896 kmp_internal_control_t *new_icvs, 4897 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4898 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4899 int f; 4900 kmp_team_t *team; 4901 int use_hot_team = !root->r.r_active; 4902 int level = 0; 4903 4904 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4905 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4906 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4907 KMP_MB(); 4908 4909 #if KMP_NESTED_HOT_TEAMS 4910 kmp_hot_team_ptr_t *hot_teams; 4911 if (master) { 4912 team = master->th.th_team; 4913 level = team->t.t_active_level; 4914 if (master->th.th_teams_microtask) { // in teams construct? 
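// NOTE (inferred from the checks below): "level" indexes the master's
// th_hot_teams array, so it has to reflect the nesting depth the new team
// will actually run at. Inside a teams construct it is bumped only when
// there is more than one team and this is either the inner fork of the
// teams construct or a parallel region nested inside it.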
4915 if (master->th.th_teams_size.nteams > 1 && 4916 ( // #teams > 1 4917 team->t.t_pkfn == 4918 (microtask_t)__kmp_teams_master || // inner fork of the teams 4919 master->th.th_teams_level < 4920 team->t.t_level)) { // or nested parallel inside the teams 4921 ++level; // not increment if #teams==1, or for outer fork of the teams; 4922 // increment otherwise 4923 } 4924 } 4925 hot_teams = master->th.th_hot_teams; 4926 if (level < __kmp_hot_teams_max_level && hot_teams && 4927 hot_teams[level].hot_team) { 4928 // hot team has already been allocated for given level 4929 use_hot_team = 1; 4930 } else { 4931 use_hot_team = 0; 4932 } 4933 } else { 4934 // check we won't access uninitialized hot_teams, just in case 4935 KMP_DEBUG_ASSERT(new_nproc == 1); 4936 } 4937 #endif 4938 // Optimization to use a "hot" team 4939 if (use_hot_team && new_nproc > 1) { 4940 KMP_DEBUG_ASSERT(new_nproc <= max_nproc); 4941 #if KMP_NESTED_HOT_TEAMS 4942 team = hot_teams[level].hot_team; 4943 #else 4944 team = root->r.r_hot_team; 4945 #endif 4946 #if KMP_DEBUG 4947 if (__kmp_tasking_mode != tskm_immediate_exec) { 4948 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4949 "task_team[1] = %p before reinit\n", 4950 team->t.t_task_team[0], team->t.t_task_team[1])); 4951 } 4952 #endif 4953 4954 // Has the number of threads changed? 4955 /* Let's assume the most common case is that the number of threads is 4956 unchanged, and put that case first. */ 4957 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4958 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4959 // This case can mean that omp_set_num_threads() was called and the hot 4960 // team size was already reduced, so we check the special flag 4961 if (team->t.t_size_changed == -1) { 4962 team->t.t_size_changed = 1; 4963 } else { 4964 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4965 } 4966 4967 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4968 kmp_r_sched_t new_sched = new_icvs->sched; 4969 // set master's schedule as new run-time schedule 4970 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 4971 4972 __kmp_reinitialize_team(team, new_icvs, 4973 root->r.r_uber_thread->th.th_ident); 4974 4975 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 4976 team->t.t_threads[0], team)); 4977 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4978 4979 #if KMP_AFFINITY_SUPPORTED 4980 if ((team->t.t_size_changed == 0) && 4981 (team->t.t_proc_bind == new_proc_bind)) { 4982 if (new_proc_bind == proc_bind_spread) { 4983 __kmp_partition_places( 4984 team, 1); // add flag to update only master for spread 4985 } 4986 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 4987 "proc_bind = %d, partition = [%d,%d]\n", 4988 team->t.t_id, new_proc_bind, team->t.t_first_place, 4989 team->t.t_last_place)); 4990 } else { 4991 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4992 __kmp_partition_places(team); 4993 } 4994 #else 4995 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4996 #endif /* KMP_AFFINITY_SUPPORTED */ 4997 } else if (team->t.t_nproc > new_nproc) { 4998 KA_TRACE(20, 4999 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5000 new_nproc)); 5001 5002 team->t.t_size_changed = 1; 5003 #if KMP_NESTED_HOT_TEAMS 5004 if (__kmp_hot_teams_mode == 0) { 5005 // AC: saved number of threads should correspond to team's value in this 5006 // mode, can be bigger in mode 1, when hot team has threads in reserve 5007 
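// NOTE (summarizing the two branches below): in hot-teams mode 0 the surplus
// threads are actually returned to the thread pool, so hot_team_nth tracks
// t_nproc; in mode 1 the surplus threads stay attached to the team in reserve
// and are only switched to wait on their own b_go flag.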
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5008 hot_teams[level].hot_team_nth = new_nproc; 5009 #endif // KMP_NESTED_HOT_TEAMS 5010 /* release the extra threads we don't need any more */ 5011 for (f = new_nproc; f < team->t.t_nproc; f++) { 5012 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5013 if (__kmp_tasking_mode != tskm_immediate_exec) { 5014 // When decreasing team size, threads no longer in the team should 5015 // unref task team. 5016 team->t.t_threads[f]->th.th_task_team = NULL; 5017 } 5018 __kmp_free_thread(team->t.t_threads[f]); 5019 team->t.t_threads[f] = NULL; 5020 } 5021 #if KMP_NESTED_HOT_TEAMS 5022 } // (__kmp_hot_teams_mode == 0) 5023 else { 5024 // When keeping extra threads in team, switch threads to wait on own 5025 // b_go flag 5026 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5027 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5028 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5029 for (int b = 0; b < bs_last_barrier; ++b) { 5030 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5031 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5032 } 5033 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5034 } 5035 } 5036 } 5037 #endif // KMP_NESTED_HOT_TEAMS 5038 team->t.t_nproc = new_nproc; 5039 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5040 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5041 __kmp_reinitialize_team(team, new_icvs, 5042 root->r.r_uber_thread->th.th_ident); 5043 5044 // Update remaining threads 5045 for (f = 0; f < new_nproc; ++f) { 5046 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5047 } 5048 5049 // restore the current task state of the master thread: should be the 5050 // implicit task 5051 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5052 team->t.t_threads[0], team)); 5053 5054 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5055 5056 #ifdef KMP_DEBUG 5057 for (f = 0; f < team->t.t_nproc; f++) { 5058 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5059 team->t.t_threads[f]->th.th_team_nproc == 5060 team->t.t_nproc); 5061 } 5062 #endif 5063 5064 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5065 #if KMP_AFFINITY_SUPPORTED 5066 __kmp_partition_places(team); 5067 #endif 5068 } else { // team->t.t_nproc < new_nproc 5069 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5070 kmp_affin_mask_t *old_mask; 5071 if (KMP_AFFINITY_CAPABLE()) { 5072 KMP_CPU_ALLOC(old_mask); 5073 } 5074 #endif 5075 5076 KA_TRACE(20, 5077 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5078 new_nproc)); 5079 5080 team->t.t_size_changed = 1; 5081 5082 #if KMP_NESTED_HOT_TEAMS 5083 int avail_threads = hot_teams[level].hot_team_nth; 5084 if (new_nproc < avail_threads) 5085 avail_threads = new_nproc; 5086 kmp_info_t **other_threads = team->t.t_threads; 5087 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5088 // Adjust barrier data of reserved threads (if any) of the team 5089 // Other data will be set in __kmp_initialize_info() below. 
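// NOTE (inferred): threads kept in reserve sat out one or more barriers, so
// their per-barrier b_arrived counters may be stale; copying the team's
// current b_arrived values keeps them in step with the barrier epoch they are
// about to rejoin.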
5090 int b; 5091 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5092 for (b = 0; b < bs_last_barrier; ++b) { 5093 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5094 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5095 #if USE_DEBUGGER 5096 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5097 #endif 5098 } 5099 } 5100 if (hot_teams[level].hot_team_nth >= new_nproc) { 5101 // we have all needed threads in reserve, no need to allocate any 5102 // this only possible in mode 1, cannot have reserved threads in mode 0 5103 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5104 team->t.t_nproc = new_nproc; // just get reserved threads involved 5105 } else { 5106 // we may have some threads in reserve, but not enough 5107 team->t.t_nproc = 5108 hot_teams[level] 5109 .hot_team_nth; // get reserved threads involved if any 5110 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5111 #endif // KMP_NESTED_HOT_TEAMS 5112 if (team->t.t_max_nproc < new_nproc) { 5113 /* reallocate larger arrays */ 5114 __kmp_reallocate_team_arrays(team, new_nproc); 5115 __kmp_reinitialize_team(team, new_icvs, NULL); 5116 } 5117 5118 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5119 /* Temporarily set full mask for master thread before creation of 5120 workers. The reason is that workers inherit the affinity from master, 5121 so if a lot of workers are created on the single core quickly, they 5122 don't get a chance to set their own affinity for a long time. */ 5123 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5124 #endif 5125 5126 /* allocate new threads for the hot team */ 5127 for (f = team->t.t_nproc; f < new_nproc; f++) { 5128 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5129 KMP_DEBUG_ASSERT(new_worker); 5130 team->t.t_threads[f] = new_worker; 5131 5132 KA_TRACE(20, 5133 ("__kmp_allocate_team: team %d init T#%d arrived: " 5134 "join=%llu, plain=%llu\n", 5135 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5136 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5137 team->t.t_bar[bs_plain_barrier].b_arrived)); 5138 5139 { // Initialize barrier data for new threads. 5140 int b; 5141 kmp_balign_t *balign = new_worker->th.th_bar; 5142 for (b = 0; b < bs_last_barrier; ++b) { 5143 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5144 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5145 KMP_BARRIER_PARENT_FLAG); 5146 #if USE_DEBUGGER 5147 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5148 #endif 5149 } 5150 } 5151 } 5152 5153 #if (KMP_OS_LINUX || KMP_OS_FREEBSD) && KMP_AFFINITY_SUPPORTED 5154 if (KMP_AFFINITY_CAPABLE()) { 5155 /* Restore initial master thread's affinity mask */ 5156 __kmp_set_system_affinity(old_mask, TRUE); 5157 KMP_CPU_FREE(old_mask); 5158 } 5159 #endif 5160 #if KMP_NESTED_HOT_TEAMS 5161 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5162 #endif // KMP_NESTED_HOT_TEAMS 5163 /* make sure everyone is syncronized */ 5164 int old_nproc = team->t.t_nproc; // save old value and use to update only 5165 // new threads below 5166 __kmp_initialize_team(team, new_nproc, new_icvs, 5167 root->r.r_uber_thread->th.th_ident); 5168 5169 /* reinitialize the threads */ 5170 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5171 for (f = 0; f < team->t.t_nproc; ++f) 5172 __kmp_initialize_info(team->t.t_threads[f], team, f, 5173 __kmp_gtid_from_tid(f, team)); 5174 5175 if (level) { // set th_task_state for new threads in nested hot team 5176 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5177 // only need to set the th_task_state for the new threads. th_task_state 5178 // for master thread will not be accurate until after this in 5179 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5180 // correct value. 5181 for (f = old_nproc; f < team->t.t_nproc; ++f) 5182 team->t.t_threads[f]->th.th_task_state = 5183 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5184 } else { // set th_task_state for new threads in non-nested hot team 5185 int old_state = 5186 team->t.t_threads[0]->th.th_task_state; // copy master's state 5187 for (f = old_nproc; f < team->t.t_nproc; ++f) 5188 team->t.t_threads[f]->th.th_task_state = old_state; 5189 } 5190 5191 #ifdef KMP_DEBUG 5192 for (f = 0; f < team->t.t_nproc; ++f) { 5193 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5194 team->t.t_threads[f]->th.th_team_nproc == 5195 team->t.t_nproc); 5196 } 5197 #endif 5198 5199 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5200 #if KMP_AFFINITY_SUPPORTED 5201 __kmp_partition_places(team); 5202 #endif 5203 } // Check changes in number of threads 5204 5205 kmp_info_t *master = team->t.t_threads[0]; 5206 if (master->th.th_teams_microtask) { 5207 for (f = 1; f < new_nproc; ++f) { 5208 // propagate teams construct specific info to workers 5209 kmp_info_t *thr = team->t.t_threads[f]; 5210 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5211 thr->th.th_teams_level = master->th.th_teams_level; 5212 thr->th.th_teams_size = master->th.th_teams_size; 5213 } 5214 } 5215 #if KMP_NESTED_HOT_TEAMS 5216 if (level) { 5217 // Sync barrier state for nested hot teams, not needed for outermost hot 5218 // team. 5219 for (f = 1; f < new_nproc; ++f) { 5220 kmp_info_t *thr = team->t.t_threads[f]; 5221 int b; 5222 kmp_balign_t *balign = thr->th.th_bar; 5223 for (b = 0; b < bs_last_barrier; ++b) { 5224 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5225 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5226 #if USE_DEBUGGER 5227 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5228 #endif 5229 } 5230 } 5231 } 5232 #endif // KMP_NESTED_HOT_TEAMS 5233 5234 /* reallocate space for arguments if necessary */ 5235 __kmp_alloc_argv_entries(argc, team, TRUE); 5236 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5237 // The hot team re-uses the previous task team, 5238 // if untouched during the previous release->gather phase. 
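// At this point the hot team has been resized as needed, its threads
// reinitialized, and (where affinity is supported) its places re-partitioned;
// the code below only traces the result, tags the team for OMPT, and returns
// it.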
5239 5240 KF_TRACE(10, (" hot_team = %p\n", team)); 5241 5242 #if KMP_DEBUG 5243 if (__kmp_tasking_mode != tskm_immediate_exec) { 5244 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5245 "task_team[1] = %p after reinit\n", 5246 team->t.t_task_team[0], team->t.t_task_team[1])); 5247 } 5248 #endif 5249 5250 #if OMPT_SUPPORT 5251 __ompt_team_assign_id(team, ompt_parallel_data); 5252 #endif 5253 5254 KMP_MB(); 5255 5256 return team; 5257 } 5258 5259 /* next, let's try to take one from the team pool */ 5260 KMP_MB(); 5261 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5262 /* TODO: consider resizing undersized teams instead of reaping them, now 5263 that we have a resizing mechanism */ 5264 if (team->t.t_max_nproc >= max_nproc) { 5265 /* take this team from the team pool */ 5266 __kmp_team_pool = team->t.t_next_pool; 5267 5268 /* setup the team for fresh use */ 5269 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5270 5271 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5272 "task_team[1] %p to NULL\n", 5273 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5274 team->t.t_task_team[0] = NULL; 5275 team->t.t_task_team[1] = NULL; 5276 5277 /* reallocate space for arguments if necessary */ 5278 __kmp_alloc_argv_entries(argc, team, TRUE); 5279 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5280 5281 KA_TRACE( 5282 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5283 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5284 { // Initialize barrier data. 5285 int b; 5286 for (b = 0; b < bs_last_barrier; ++b) { 5287 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5288 #if USE_DEBUGGER 5289 team->t.t_bar[b].b_master_arrived = 0; 5290 team->t.t_bar[b].b_team_arrived = 0; 5291 #endif 5292 } 5293 } 5294 5295 team->t.t_proc_bind = new_proc_bind; 5296 5297 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5298 team->t.t_id)); 5299 5300 #if OMPT_SUPPORT 5301 __ompt_team_assign_id(team, ompt_parallel_data); 5302 #endif 5303 5304 KMP_MB(); 5305 5306 return team; 5307 } 5308 5309 /* reap team if it is too small, then loop back and check the next one */ 5310 // not sure if this is wise, but, will be redone during the hot-teams 5311 // rewrite. 5312 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5313 team = __kmp_reap_team(team); 5314 __kmp_team_pool = team; 5315 } 5316 5317 /* nothing available in the pool, no matter, make a new team! 
*/ 5318 KMP_MB(); 5319 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5320 5321 /* and set it up */ 5322 team->t.t_max_nproc = max_nproc; 5323 /* NOTE well, for some reason allocating one big buffer and dividing it up 5324 seems to really hurt performance a lot on the P4, so, let's not use this */ 5325 __kmp_allocate_team_arrays(team, max_nproc); 5326 5327 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5328 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5329 5330 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5331 "%p to NULL\n", 5332 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5333 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5334 // memory, no need to duplicate 5335 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5336 // memory, no need to duplicate 5337 5338 if (__kmp_storage_map) { 5339 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5340 } 5341 5342 /* allocate space for arguments */ 5343 __kmp_alloc_argv_entries(argc, team, FALSE); 5344 team->t.t_argc = argc; 5345 5346 KA_TRACE(20, 5347 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5348 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5349 { // Initialize barrier data. 5350 int b; 5351 for (b = 0; b < bs_last_barrier; ++b) { 5352 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5353 #if USE_DEBUGGER 5354 team->t.t_bar[b].b_master_arrived = 0; 5355 team->t.t_bar[b].b_team_arrived = 0; 5356 #endif 5357 } 5358 } 5359 5360 team->t.t_proc_bind = new_proc_bind; 5361 5362 #if OMPT_SUPPORT 5363 __ompt_team_assign_id(team, ompt_parallel_data); 5364 team->t.ompt_serialized_team_info = NULL; 5365 #endif 5366 5367 KMP_MB(); 5368 5369 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5370 team->t.t_id)); 5371 5372 return team; 5373 } 5374 5375 /* TODO implement hot-teams at all levels */ 5376 /* TODO implement lazy thread release on demand (disband request) */ 5377 5378 /* free the team. return it to the team pool. release all the threads 5379 * associated with it */ 5380 void __kmp_free_team(kmp_root_t *root, 5381 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5382 int f; 5383 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5384 team->t.t_id)); 5385 5386 /* verify state */ 5387 KMP_DEBUG_ASSERT(root); 5388 KMP_DEBUG_ASSERT(team); 5389 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5390 KMP_DEBUG_ASSERT(team->t.t_threads); 5391 5392 int use_hot_team = team == root->r.r_hot_team; 5393 #if KMP_NESTED_HOT_TEAMS 5394 int level; 5395 kmp_hot_team_ptr_t *hot_teams; 5396 if (master) { 5397 level = team->t.t_active_level - 1; 5398 if (master->th.th_teams_microtask) { // in teams construct? 
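// NOTE (inferred): the adjustments below mirror the level computation in
// __kmp_allocate_team, so that the lookup into the master's th_hot_teams
// array finds the same slot this team was allocated from.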
5399 if (master->th.th_teams_size.nteams > 1) { 5400 ++level; // level was not increased in teams construct for 5401 // team_of_masters 5402 } 5403 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5404 master->th.th_teams_level == team->t.t_level) { 5405 ++level; // level was not increased in teams construct for 5406 // team_of_workers before the parallel 5407 } // team->t.t_level will be increased inside parallel 5408 } 5409 hot_teams = master->th.th_hot_teams; 5410 if (level < __kmp_hot_teams_max_level) { 5411 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5412 use_hot_team = 1; 5413 } 5414 } 5415 #endif // KMP_NESTED_HOT_TEAMS 5416 5417 /* team is done working */ 5418 TCW_SYNC_PTR(team->t.t_pkfn, 5419 NULL); // Important for Debugging Support Library. 5420 #if KMP_OS_WINDOWS 5421 team->t.t_copyin_counter = 0; // init counter for possible reuse 5422 #endif 5423 // Do not reset pointer to parent team to NULL for hot teams. 5424 5425 /* if we are non-hot team, release our threads */ 5426 if (!use_hot_team) { 5427 if (__kmp_tasking_mode != tskm_immediate_exec) { 5428 // Wait for threads to reach reapable state 5429 for (f = 1; f < team->t.t_nproc; ++f) { 5430 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5431 kmp_info_t *th = team->t.t_threads[f]; 5432 volatile kmp_uint32 *state = &th->th.th_reap_state; 5433 while (*state != KMP_SAFE_TO_REAP) { 5434 #if KMP_OS_WINDOWS 5435 // On Windows a thread can be killed at any time, check this 5436 DWORD ecode; 5437 if (!__kmp_is_thread_alive(th, &ecode)) { 5438 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5439 break; 5440 } 5441 #endif 5442 // first check if thread is sleeping 5443 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5444 if (fl.is_sleeping()) 5445 fl.resume(__kmp_gtid_from_thread(th)); 5446 KMP_CPU_PAUSE(); 5447 } 5448 } 5449 5450 // Delete task teams 5451 int tt_idx; 5452 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5453 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5454 if (task_team != NULL) { 5455 for (f = 0; f < team->t.t_nproc; ++f) { // threads unref task teams 5456 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5457 team->t.t_threads[f]->th.th_task_team = NULL; 5458 } 5459 KA_TRACE( 5460 20, 5461 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5462 __kmp_get_gtid(), task_team, team->t.t_id)); 5463 #if KMP_NESTED_HOT_TEAMS 5464 __kmp_free_task_team(master, task_team); 5465 #endif 5466 team->t.t_task_team[tt_idx] = NULL; 5467 } 5468 } 5469 } 5470 5471 // Reset pointer to parent team only for non-hot teams. 
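// NOTE (inferred): a pooled team may be picked up by a different parallel
// region next time, so stale hierarchy links and nesting levels must not
// survive; the hot team keeps them because it is reused in place.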
5472 team->t.t_parent = NULL; 5473 team->t.t_level = 0; 5474 team->t.t_active_level = 0; 5475 5476 /* free the worker threads */ 5477 for (f = 1; f < team->t.t_nproc; ++f) { 5478 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5479 __kmp_free_thread(team->t.t_threads[f]); 5480 team->t.t_threads[f] = NULL; 5481 } 5482 5483 /* put the team back in the team pool */ 5484 /* TODO limit size of team pool, call reap_team if pool too large */ 5485 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5486 __kmp_team_pool = (volatile kmp_team_t *)team; 5487 } else { // Check if team was created for the masters in a teams construct 5488 // See if first worker is a CG root 5489 KMP_DEBUG_ASSERT(team->t.t_threads[1] && 5490 team->t.t_threads[1]->th.th_cg_roots); 5491 if (team->t.t_threads[1]->th.th_cg_roots->cg_root == team->t.t_threads[1]) { 5492 // Clean up the CG root nodes on workers so that this team can be re-used 5493 for (f = 1; f < team->t.t_nproc; ++f) { 5494 kmp_info_t *thr = team->t.t_threads[f]; 5495 KMP_DEBUG_ASSERT(thr && thr->th.th_cg_roots && 5496 thr->th.th_cg_roots->cg_root == thr); 5497 // Pop current CG root off list 5498 kmp_cg_root_t *tmp = thr->th.th_cg_roots; 5499 thr->th.th_cg_roots = tmp->up; 5500 KA_TRACE(100, ("__kmp_free_team: Thread %p popping node %p and moving" 5501 " up to node %p. cg_nthreads was %d\n", 5502 thr, tmp, thr->th.th_cg_roots, tmp->cg_nthreads)); 5503 int i = tmp->cg_nthreads--; 5504 if (i == 1) { 5505 __kmp_free(tmp); // free CG if we are the last thread in it 5506 } 5507 // Restore current task's thread_limit from CG root 5508 if (thr->th.th_cg_roots) 5509 thr->th.th_current_task->td_icvs.thread_limit = 5510 thr->th.th_cg_roots->cg_thread_limit; 5511 } 5512 } 5513 } 5514 5515 KMP_MB(); 5516 } 5517 5518 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5519 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5520 kmp_team_t *next_pool = team->t.t_next_pool; 5521 5522 KMP_DEBUG_ASSERT(team); 5523 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5524 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5525 KMP_DEBUG_ASSERT(team->t.t_threads); 5526 KMP_DEBUG_ASSERT(team->t.t_argv); 5527 5528 /* TODO clean the threads that are a part of this? */ 5529 5530 /* free stuff */ 5531 __kmp_free_team_arrays(team); 5532 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5533 __kmp_free((void *)team->t.t_argv); 5534 __kmp_free(team); 5535 5536 KMP_MB(); 5537 return next_pool; 5538 } 5539 5540 // Free the thread. Don't reap it, just place it on the pool of available 5541 // threads. 5542 // 5543 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5544 // binding for the affinity mechanism to be useful. 5545 // 5546 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5547 // However, we want to avoid a potential performance problem by always 5548 // scanning through the list to find the correct point at which to insert 5549 // the thread (potential N**2 behavior). To do this we keep track of the 5550 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5551 // With single-level parallelism, threads will always be added to the tail 5552 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5553 // parallelism, all bets are off and we may need to scan through the entire 5554 // free list. 5555 // 5556 // This change also has a potentially large performance benefit, for some 5557 // applications. 
Previously, as threads were freed from the hot team, they 5558 // would be placed back on the free list in inverse order. If the hot team 5559 // grew back to it's original size, then the freed thread would be placed 5560 // back on the hot team in reverse order. This could cause bad cache 5561 // locality problems on programs where the size of the hot team regularly 5562 // grew and shrunk. 5563 // 5564 // Now, for single-level parallelism, the OMP tid is always == gtid. 5565 void __kmp_free_thread(kmp_info_t *this_th) { 5566 int gtid; 5567 kmp_info_t **scan; 5568 5569 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5570 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5571 5572 KMP_DEBUG_ASSERT(this_th); 5573 5574 // When moving thread to pool, switch thread to wait on own b_go flag, and 5575 // uninitialized (NULL team). 5576 int b; 5577 kmp_balign_t *balign = this_th->th.th_bar; 5578 for (b = 0; b < bs_last_barrier; ++b) { 5579 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5580 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5581 balign[b].bb.team = NULL; 5582 balign[b].bb.leaf_kids = 0; 5583 } 5584 this_th->th.th_task_state = 0; 5585 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5586 5587 /* put thread back on the free pool */ 5588 TCW_PTR(this_th->th.th_team, NULL); 5589 TCW_PTR(this_th->th.th_root, NULL); 5590 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5591 5592 while (this_th->th.th_cg_roots) { 5593 this_th->th.th_cg_roots->cg_nthreads--; 5594 KA_TRACE(100, ("__kmp_free_thread: Thread %p decrement cg_nthreads on node" 5595 " %p of thread %p to %d\n", 5596 this_th, this_th->th.th_cg_roots, 5597 this_th->th.th_cg_roots->cg_root, 5598 this_th->th.th_cg_roots->cg_nthreads)); 5599 kmp_cg_root_t *tmp = this_th->th.th_cg_roots; 5600 if (tmp->cg_root == this_th) { // Thread is a cg_root 5601 KMP_DEBUG_ASSERT(tmp->cg_nthreads == 0); 5602 KA_TRACE( 5603 5, ("__kmp_free_thread: Thread %p freeing node %p\n", this_th, tmp)); 5604 this_th->th.th_cg_roots = tmp->up; 5605 __kmp_free(tmp); 5606 } else { // Worker thread 5607 if (tmp->cg_nthreads == 0) { // last thread leaves contention group 5608 __kmp_free(tmp); 5609 } 5610 this_th->th.th_cg_roots = NULL; 5611 break; 5612 } 5613 } 5614 5615 /* If the implicit task assigned to this thread can be used by other threads 5616 * -> multiple threads can share the data and try to free the task at 5617 * __kmp_reap_thread at exit. This duplicate use of the task data can happen 5618 * with higher probability when hot team is disabled but can occurs even when 5619 * the hot team is enabled */ 5620 __kmp_free_implicit_task(this_th); 5621 this_th->th.th_current_task = NULL; 5622 5623 // If the __kmp_thread_pool_insert_pt is already past the new insert 5624 // point, then we need to re-scan the entire list. 5625 gtid = this_th->th.th_info.ds.ds_gtid; 5626 if (__kmp_thread_pool_insert_pt != NULL) { 5627 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5628 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5629 __kmp_thread_pool_insert_pt = NULL; 5630 } 5631 } 5632 5633 // Scan down the list to find the place to insert the thread. 5634 // scan is the address of a link in the list, possibly the address of 5635 // __kmp_thread_pool itself. 5636 // 5637 // In the absence of nested parallelism, the for loop will have 0 iterations. 
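// Illustrative sketch (not part of the runtime) of the cached-insert-point
// technique used below, written against a plain sorted singly linked list;
// the names Node/head/insert_pt/sorted_insert are invented for this sketch:
//
//   struct Node { int key; Node *next; };
//   Node *head = nullptr;      // list kept sorted by key (here: gtid)
//   Node *insert_pt = nullptr; // last insertion point; may lag behind
//
//   void sorted_insert(Node *n) {
//     Node **scan = (insert_pt != nullptr && insert_pt->key <= n->key)
//                       ? &insert_pt->next // resume from the cached point
//                       : &head;           // cached point is past n: rescan
//     while (*scan != nullptr && (*scan)->key < n->key)
//       scan = &(*scan)->next;
//     n->next = *scan; // splice n in front of the first larger key
//     *scan = n;
//     insert_pt = n;   // remember where we inserted
//   }
//
// E.g. with pooled gtids {2, 3, 5, 9} and insert_pt at 5, freeing gtid 7 scans
// only the 5 -> 9 link; freeing gtid 4 resets insert_pt and rescans from head.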
5638 if (__kmp_thread_pool_insert_pt != NULL) { 5639 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5640 } else { 5641 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5642 } 5643 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5644 scan = &((*scan)->th.th_next_pool)) 5645 ; 5646 5647 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5648 // to its address. 5649 TCW_PTR(this_th->th.th_next_pool, *scan); 5650 __kmp_thread_pool_insert_pt = *scan = this_th; 5651 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5652 (this_th->th.th_info.ds.ds_gtid < 5653 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5654 TCW_4(this_th->th.th_in_pool, TRUE); 5655 __kmp_suspend_initialize_thread(this_th); 5656 __kmp_lock_suspend_mx(this_th); 5657 if (this_th->th.th_active == TRUE) { 5658 KMP_ATOMIC_INC(&__kmp_thread_pool_active_nth); 5659 this_th->th.th_active_in_pool = TRUE; 5660 } 5661 #if KMP_DEBUG 5662 else { 5663 KMP_DEBUG_ASSERT(this_th->th.th_active_in_pool == FALSE); 5664 } 5665 #endif 5666 __kmp_unlock_suspend_mx(this_th); 5667 5668 TCW_4(__kmp_nth, __kmp_nth - 1); 5669 5670 #ifdef KMP_ADJUST_BLOCKTIME 5671 /* Adjust blocktime back to user setting or default if necessary */ 5672 /* Middle initialization might never have occurred */ 5673 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5674 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5675 if (__kmp_nth <= __kmp_avail_proc) { 5676 __kmp_zero_bt = FALSE; 5677 } 5678 } 5679 #endif /* KMP_ADJUST_BLOCKTIME */ 5680 5681 KMP_MB(); 5682 } 5683 5684 /* ------------------------------------------------------------------------ */ 5685 5686 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5687 int gtid = this_thr->th.th_info.ds.ds_gtid; 5688 /* void *stack_data;*/ 5689 kmp_team_t **volatile pteam; 5690 5691 KMP_MB(); 5692 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5693 5694 if (__kmp_env_consistency_check) { 5695 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 5696 } 5697 5698 #if OMPT_SUPPORT 5699 ompt_data_t *thread_data; 5700 if (ompt_enabled.enabled) { 5701 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5702 *thread_data = ompt_data_none; 5703 5704 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5705 this_thr->th.ompt_thread_info.wait_id = 0; 5706 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5707 this_thr->th.ompt_thread_info.parallel_flags = 0; 5708 if (ompt_enabled.ompt_callback_thread_begin) { 5709 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5710 ompt_thread_worker, thread_data); 5711 } 5712 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5713 } 5714 #endif 5715 5716 /* This is the place where threads wait for work */ 5717 while (!TCR_4(__kmp_global.g.g_done)) { 5718 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5719 KMP_MB(); 5720 5721 /* wait for work to do */ 5722 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5723 5724 /* No tid yet since not part of a team */ 5725 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5726 5727 #if OMPT_SUPPORT 5728 if (ompt_enabled.enabled) { 5729 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5730 } 5731 #endif 5732 5733 pteam = &this_thr->th.th_team; 5734 5735 /* have we been allocated? 
*/ 5736 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5737 /* we were just woken up, so run our new task */ 5738 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5739 int rc; 5740 KA_TRACE(20, 5741 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5742 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5743 (*pteam)->t.t_pkfn)); 5744 5745 updateHWFPControl(*pteam); 5746 5747 #if OMPT_SUPPORT 5748 if (ompt_enabled.enabled) { 5749 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5750 } 5751 #endif 5752 5753 rc = (*pteam)->t.t_invoke(gtid); 5754 KMP_ASSERT(rc); 5755 5756 KMP_MB(); 5757 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5758 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5759 (*pteam)->t.t_pkfn)); 5760 } 5761 #if OMPT_SUPPORT 5762 if (ompt_enabled.enabled) { 5763 /* no frame set while outside task */ 5764 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5765 5766 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5767 } 5768 #endif 5769 /* join barrier after parallel region */ 5770 __kmp_join_barrier(gtid); 5771 } 5772 } 5773 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5774 5775 #if OMPT_SUPPORT 5776 if (ompt_enabled.ompt_callback_thread_end) { 5777 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5778 } 5779 #endif 5780 5781 this_thr->th.th_task_team = NULL; 5782 /* run the destructors for the threadprivate data for this thread */ 5783 __kmp_common_destroy_gtid(gtid); 5784 5785 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5786 KMP_MB(); 5787 return this_thr; 5788 } 5789 5790 /* ------------------------------------------------------------------------ */ 5791 5792 void __kmp_internal_end_dest(void *specific_gtid) { 5793 #if KMP_COMPILER_ICC 5794 #pragma warning(push) 5795 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose 5796 // significant bits 5797 #endif 5798 // Make sure no significant bits are lost 5799 int gtid = (kmp_intptr_t)specific_gtid - 1; 5800 #if KMP_COMPILER_ICC 5801 #pragma warning(pop) 5802 #endif 5803 5804 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5805 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5806 * this is because 0 is reserved for the nothing-stored case */ 5807 5808 /* josh: One reason for setting the gtid specific data even when it is being 5809 destroyed by pthread is to allow gtid lookup through thread specific data 5810 (__kmp_gtid_get_specific). Some of the code, especially stat code, 5811 that gets executed in the call to __kmp_internal_end_thread, actually 5812 gets the gtid through the thread specific data. Setting it here seems 5813 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread 5814 to run smoothly. 
5815 todo: get rid of this after we remove the dependence on 5816 __kmp_gtid_get_specific */ 5817 if (gtid >= 0 && KMP_UBER_GTID(gtid)) 5818 __kmp_gtid_set_specific(gtid); 5819 #ifdef KMP_TDATA_GTID 5820 __kmp_gtid = gtid; 5821 #endif 5822 __kmp_internal_end_thread(gtid); 5823 } 5824 5825 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5826 5827 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5828 __kmp_internal_end_atexit(); 5829 } 5830 5831 #endif 5832 5833 /* [Windows] josh: when the atexit handler is called, there may still be more 5834 than one thread alive */ 5835 void __kmp_internal_end_atexit(void) { 5836 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5837 /* [Windows] 5838 josh: ideally, we want to completely shutdown the library in this atexit 5839 handler, but stat code that depends on thread specific data for gtid fails 5840 because that data becomes unavailable at some point during the shutdown, so 5841 we call __kmp_internal_end_thread instead. We should eventually remove the 5842 dependency on __kmp_get_specific_gtid in the stat code and use 5843 __kmp_internal_end_library to cleanly shutdown the library. 5844 5845 // TODO: Can some of this comment about GVS be removed? 5846 I suspect that the offending stat code is executed when the calling thread 5847 tries to clean up a dead root thread's data structures, resulting in GVS 5848 code trying to close the GVS structures for that thread, but since the stat 5849 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5850 the calling thread is cleaning up itself instead of another thread, it get 5851 confused. This happens because allowing a thread to unregister and cleanup 5852 another thread is a recent modification for addressing an issue. 5853 Based on the current design (20050722), a thread may end up 5854 trying to unregister another thread only if thread death does not trigger 5855 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5856 thread specific data destructor function to detect thread death. For 5857 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5858 is nothing. Thus, the workaround is applicable only for Windows static 5859 stat library. */ 5860 __kmp_internal_end_library(-1); 5861 #if KMP_OS_WINDOWS 5862 __kmp_close_console(); 5863 #endif 5864 } 5865 5866 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5867 // It is assumed __kmp_forkjoin_lock is acquired. 5868 5869 int gtid; 5870 5871 KMP_DEBUG_ASSERT(thread != NULL); 5872 5873 gtid = thread->th.th_info.ds.ds_gtid; 5874 5875 if (!is_root) { 5876 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5877 /* Assume the threads are at the fork barrier here */ 5878 KA_TRACE( 5879 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5880 gtid)); 5881 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5882 * (GEH) */ 5883 ANNOTATE_HAPPENS_BEFORE(thread); 5884 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); 5885 __kmp_release_64(&flag); 5886 } 5887 5888 // Terminate OS thread. 5889 __kmp_reap_worker(thread); 5890 5891 // The thread was killed asynchronously. If it was actively 5892 // spinning in the thread pool, decrement the global count. 
5893 // 5894 // There is a small timing hole here - if the worker thread was just waking 5895 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5896 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5897 // the global counter might not get updated. 5898 // 5899 // Currently, this can only happen as the library is unloaded, 5900 // so there are no harmful side effects. 5901 if (thread->th.th_active_in_pool) { 5902 thread->th.th_active_in_pool = FALSE; 5903 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5904 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5905 } 5906 } 5907 5908 __kmp_free_implicit_task(thread); 5909 5910 // Free the fast memory for tasking 5911 #if USE_FAST_MEMORY 5912 __kmp_free_fast_memory(thread); 5913 #endif /* USE_FAST_MEMORY */ 5914 5915 __kmp_suspend_uninitialize_thread(thread); 5916 5917 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5918 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5919 5920 --__kmp_all_nth; 5921 // __kmp_nth was decremented when thread is added to the pool. 5922 5923 #ifdef KMP_ADJUST_BLOCKTIME 5924 /* Adjust blocktime back to user setting or default if necessary */ 5925 /* Middle initialization might never have occurred */ 5926 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5927 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5928 if (__kmp_nth <= __kmp_avail_proc) { 5929 __kmp_zero_bt = FALSE; 5930 } 5931 } 5932 #endif /* KMP_ADJUST_BLOCKTIME */ 5933 5934 /* free the memory being used */ 5935 if (__kmp_env_consistency_check) { 5936 if (thread->th.th_cons) { 5937 __kmp_free_cons_stack(thread->th.th_cons); 5938 thread->th.th_cons = NULL; 5939 } 5940 } 5941 5942 if (thread->th.th_pri_common != NULL) { 5943 __kmp_free(thread->th.th_pri_common); 5944 thread->th.th_pri_common = NULL; 5945 } 5946 5947 if (thread->th.th_task_state_memo_stack != NULL) { 5948 __kmp_free(thread->th.th_task_state_memo_stack); 5949 thread->th.th_task_state_memo_stack = NULL; 5950 } 5951 5952 #if KMP_USE_BGET 5953 if (thread->th.th_local.bget_data != NULL) { 5954 __kmp_finalize_bget(thread); 5955 } 5956 #endif 5957 5958 #if KMP_AFFINITY_SUPPORTED 5959 if (thread->th.th_affin_mask != NULL) { 5960 KMP_CPU_FREE(thread->th.th_affin_mask); 5961 thread->th.th_affin_mask = NULL; 5962 } 5963 #endif /* KMP_AFFINITY_SUPPORTED */ 5964 5965 #if KMP_USE_HIER_SCHED 5966 if (thread->th.th_hier_bar_data != NULL) { 5967 __kmp_free(thread->th.th_hier_bar_data); 5968 thread->th.th_hier_bar_data = NULL; 5969 } 5970 #endif 5971 5972 __kmp_reap_team(thread->th.th_serial_team); 5973 thread->th.th_serial_team = NULL; 5974 __kmp_free(thread); 5975 5976 KMP_MB(); 5977 5978 } // __kmp_reap_thread 5979 5980 static void __kmp_internal_end(void) { 5981 int i; 5982 5983 /* First, unregister the library */ 5984 __kmp_unregister_library(); 5985 5986 #if KMP_OS_WINDOWS 5987 /* In Win static library, we can't tell when a root actually dies, so we 5988 reclaim the data structures for any root threads that have died but not 5989 unregistered themselves, in order to shut down cleanly. 5990 In Win dynamic library we also can't tell when a thread dies. */ 5991 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 5992 // dead roots 5993 #endif 5994 5995 for (i = 0; i < __kmp_threads_capacity; i++) 5996 if (__kmp_root[i]) 5997 if (__kmp_root[i]->r.r_active) 5998 break; 5999 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6000 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6001 6002 if (i < __kmp_threads_capacity) { 6003 #if KMP_USE_MONITOR 6004 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 6005 KMP_MB(); /* Flush all pending memory write invalidates. */ 6006 6007 // Need to check that monitor was initialized before reaping it. If we are 6008 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6009 // __kmp_monitor will appear to contain valid data, but it is only valid in 6010 // the parent process, not the child. 6011 // New behavior (201008): instead of keying off of the flag 6012 // __kmp_init_parallel, the monitor thread creation is keyed off 6013 // of the new flag __kmp_init_monitor. 6014 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6015 if (TCR_4(__kmp_init_monitor)) { 6016 __kmp_reap_monitor(&__kmp_monitor); 6017 TCW_4(__kmp_init_monitor, 0); 6018 } 6019 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6020 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6021 #endif // KMP_USE_MONITOR 6022 } else { 6023 /* TODO move this to cleanup code */ 6024 #ifdef KMP_DEBUG 6025 /* make sure that everything has properly ended */ 6026 for (i = 0; i < __kmp_threads_capacity; i++) { 6027 if (__kmp_root[i]) { 6028 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6029 // there can be uber threads alive here 6030 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6031 } 6032 } 6033 #endif 6034 6035 KMP_MB(); 6036 6037 // Reap the worker threads. 6038 // This is valid for now, but be careful if threads are reaped sooner. 6039 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6040 // Get the next thread from the pool. 6041 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6042 __kmp_thread_pool = thread->th.th_next_pool; 6043 // Reap it. 6044 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6045 thread->th.th_next_pool = NULL; 6046 thread->th.th_in_pool = FALSE; 6047 __kmp_reap_thread(thread, 0); 6048 } 6049 __kmp_thread_pool_insert_pt = NULL; 6050 6051 // Reap teams. 6052 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6053 // Get the next team from the pool. 6054 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6055 __kmp_team_pool = team->t.t_next_pool; 6056 // Reap it. 6057 team->t.t_next_pool = NULL; 6058 __kmp_reap_team(team); 6059 } 6060 6061 __kmp_reap_task_teams(); 6062 6063 #if KMP_OS_UNIX 6064 // Threads that are not reaped should not access any resources since they 6065 // are going to be deallocated soon, so the shutdown sequence should wait 6066 // until all threads either exit the final spin-waiting loop or begin 6067 // sleeping after the given blocktime. 6068 for (i = 0; i < __kmp_threads_capacity; i++) { 6069 kmp_info_t *thr = __kmp_threads[i]; 6070 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6071 KMP_CPU_PAUSE(); 6072 } 6073 #endif 6074 6075 for (i = 0; i < __kmp_threads_capacity; ++i) { 6076 // TBD: Add some checking... 6077 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6078 } 6079 6080 /* Make sure all threadprivate destructors get run by joining with all 6081 worker threads before resetting this flag */ 6082 TCW_SYNC_4(__kmp_init_common, FALSE); 6083 6084 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6085 KMP_MB(); 6086 6087 #if KMP_USE_MONITOR 6088 // See note above: One of the possible fixes for CQ138434 / CQ140126 6089 // 6090 // FIXME: push both code fragments down and CSE them? 
6091 // push them into __kmp_cleanup() ? 6092 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6093 if (TCR_4(__kmp_init_monitor)) { 6094 __kmp_reap_monitor(&__kmp_monitor); 6095 TCW_4(__kmp_init_monitor, 0); 6096 } 6097 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6098 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6099 #endif 6100 } /* else !__kmp_global.t_active */ 6101 TCW_4(__kmp_init_gtid, FALSE); 6102 KMP_MB(); /* Flush all pending memory write invalidates. */ 6103 6104 __kmp_cleanup(); 6105 #if OMPT_SUPPORT 6106 ompt_fini(); 6107 #endif 6108 } 6109 6110 void __kmp_internal_end_library(int gtid_req) { 6111 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6112 /* this shouldn't be a race condition because __kmp_internal_end() is the 6113 only place to clear __kmp_serial_init */ 6114 /* we'll check this later too, after we get the lock */ 6115 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6116 // redundant, because the next check will work in any case. 6117 if (__kmp_global.g.g_abort) { 6118 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6119 /* TODO abort? */ 6120 return; 6121 } 6122 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6123 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6124 return; 6125 } 6126 6127 KMP_MB(); /* Flush all pending memory write invalidates. */ 6128 6129 /* find out who we are and what we should do */ 6130 { 6131 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6132 KA_TRACE( 6133 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6134 if (gtid == KMP_GTID_SHUTDOWN) { 6135 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6136 "already shutdown\n")); 6137 return; 6138 } else if (gtid == KMP_GTID_MONITOR) { 6139 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6140 "registered, or system shutdown\n")); 6141 return; 6142 } else if (gtid == KMP_GTID_DNE) { 6143 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6144 "shutdown\n")); 6145 /* we don't know who we are, but we may still shutdown the library */ 6146 } else if (KMP_UBER_GTID(gtid)) { 6147 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6148 if (__kmp_root[gtid]->r.r_active) { 6149 __kmp_global.g.g_abort = -1; 6150 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6151 KA_TRACE(10, 6152 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6153 gtid)); 6154 return; 6155 } else { 6156 KA_TRACE( 6157 10, 6158 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6159 __kmp_unregister_root_current_thread(gtid); 6160 } 6161 } else { 6162 /* worker threads may call this function through the atexit handler, if they 6163 * call exit() */ 6164 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6165 TODO: do a thorough shutdown instead */ 6166 #ifdef DUMP_DEBUG_ON_EXIT 6167 if (__kmp_debug_buf) 6168 __kmp_dump_debug_buffer(); 6169 #endif 6170 return; 6171 } 6172 } 6173 /* synchronize the termination process */ 6174 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6175 6176 /* have we already finished */ 6177 if (__kmp_global.g.g_abort) { 6178 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6179 /* TODO abort? 
*/ 6180 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6181 return; 6182 } 6183 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6184 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6185 return; 6186 } 6187 6188 /* We need this lock to enforce mutex between this reading of 6189 __kmp_threads_capacity and the writing by __kmp_register_root. 6190 Alternatively, we can use a counter of roots that is atomically updated by 6191 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6192 __kmp_internal_end_*. */ 6193 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6194 6195 /* now we can safely conduct the actual termination */ 6196 __kmp_internal_end(); 6197 6198 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6199 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6200 6201 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6202 6203 #ifdef DUMP_DEBUG_ON_EXIT 6204 if (__kmp_debug_buf) 6205 __kmp_dump_debug_buffer(); 6206 #endif 6207 6208 #if KMP_OS_WINDOWS 6209 __kmp_close_console(); 6210 #endif 6211 6212 __kmp_fini_allocator(); 6213 6214 } // __kmp_internal_end_library 6215 6216 void __kmp_internal_end_thread(int gtid_req) { 6217 int i; 6218 6219 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6220 /* this shouldn't be a race condition because __kmp_internal_end() is the 6221 * only place to clear __kmp_serial_init */ 6222 /* we'll check this later too, after we get the lock */ 6223 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6224 // redundant, because the next check will work in any case. 6225 if (__kmp_global.g.g_abort) { 6226 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6227 /* TODO abort? */ 6228 return; 6229 } 6230 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6231 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6232 return; 6233 } 6234 6235 KMP_MB(); /* Flush all pending memory write invalidates. */ 6236 6237 /* find out who we are and what we should do */ 6238 { 6239 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6240 KA_TRACE(10, 6241 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6242 if (gtid == KMP_GTID_SHUTDOWN) { 6243 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6244 "already shutdown\n")); 6245 return; 6246 } else if (gtid == KMP_GTID_MONITOR) { 6247 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6248 "registered, or system shutdown\n")); 6249 return; 6250 } else if (gtid == KMP_GTID_DNE) { 6251 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6252 "shutdown\n")); 6253 return; 6254 /* we don't know who we are */ 6255 } else if (KMP_UBER_GTID(gtid)) { 6256 /* unregister ourselves as an uber thread. 
gtid is no longer valid */ 6257 if (__kmp_root[gtid]->r.r_active) { 6258 __kmp_global.g.g_abort = -1; 6259 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6260 KA_TRACE(10, 6261 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6262 gtid)); 6263 return; 6264 } else { 6265 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6266 gtid)); 6267 __kmp_unregister_root_current_thread(gtid); 6268 } 6269 } else { 6270 /* just a worker thread, let's leave */ 6271 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6272 6273 if (gtid >= 0) { 6274 __kmp_threads[gtid]->th.th_task_team = NULL; 6275 } 6276 6277 KA_TRACE(10, 6278 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6279 gtid)); 6280 return; 6281 } 6282 } 6283 #if KMP_DYNAMIC_LIB 6284 if (__kmp_pause_status != kmp_hard_paused) 6285 // AC: lets not shutdown the dynamic library at the exit of uber thread, 6286 // because we will better shutdown later in the library destructor. 6287 { 6288 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6289 return; 6290 } 6291 #endif 6292 /* synchronize the termination process */ 6293 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6294 6295 /* have we already finished */ 6296 if (__kmp_global.g.g_abort) { 6297 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6298 /* TODO abort? */ 6299 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6300 return; 6301 } 6302 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6303 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6304 return; 6305 } 6306 6307 /* We need this lock to enforce mutex between this reading of 6308 __kmp_threads_capacity and the writing by __kmp_register_root. 6309 Alternatively, we can use a counter of roots that is atomically updated by 6310 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6311 __kmp_internal_end_*. */ 6312 6313 /* should we finish the run-time? are all siblings done? */ 6314 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6315 6316 for (i = 0; i < __kmp_threads_capacity; ++i) { 6317 if (KMP_UBER_GTID(i)) { 6318 KA_TRACE( 6319 10, 6320 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6321 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6322 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6323 return; 6324 } 6325 } 6326 6327 /* now we can safely conduct the actual termination */ 6328 6329 __kmp_internal_end(); 6330 6331 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6332 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6333 6334 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6335 6336 #ifdef DUMP_DEBUG_ON_EXIT 6337 if (__kmp_debug_buf) 6338 __kmp_dump_debug_buffer(); 6339 #endif 6340 } // __kmp_internal_end_thread 6341 6342 // ----------------------------------------------------------------------------- 6343 // Library registration stuff. 6344 6345 static long __kmp_registration_flag = 0; 6346 // Random value used to indicate library initialization. 6347 static char *__kmp_registration_str = NULL; 6348 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6349 6350 static inline char *__kmp_reg_status_name() { 6351 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6352 each thread. If registration and unregistration go in different threads 6353 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6354 env var can not be found, because the name will contain different pid. 
*/
  return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid());
} // __kmp_reg_status_name

void __kmp_register_library_startup(void) {

  char *name = __kmp_reg_status_name(); // Name of the environment variable.
  int done = 0;
  union {
    double dtime;
    long ltime;
  } time;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  __kmp_initialize_system_tick();
#endif
  __kmp_read_system_time(&time.dtime);
  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
  __kmp_registration_str =
      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
                       __kmp_registration_flag, KMP_LIBRARY_FILE);

  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
                __kmp_registration_str));

  while (!done) {

    char *value = NULL; // Actual value of the environment variable.

    // Set the environment variable, but do not overwrite it if it already
    // exists.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
    value = __kmp_env_get(name);
    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {

      done = 1; // Ok, environment variable set successfully, exit the loop.

    } else {

      // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
      // Check whether it is alive or dead.
      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
      char *tail = value;
      char *flag_addr_str = NULL;
      char *flag_val_str = NULL;
      char const *file_name = NULL;
      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
      __kmp_str_split(tail, '-', &flag_val_str, &tail);
      file_name = tail;
      if (tail != NULL) {
        long *flag_addr = 0;
        long flag_val = 0;
        KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
          // First, check whether the environment-encoded address is mapped
          // into the address space.
          // If so, dereference it to see if it still has the right value.
          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
            neighbor = 1;
          } else {
            // If not, then we know the other copy of the library is no longer
            // running.
            neighbor = 2;
          }
        }
      }
      switch (neighbor) {
      case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is the incompatible format of a future version of the
        // library. Assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
        KMP_FALLTHROUGH();
      // Attention! Falling to the next case. That's intentional.
      case 1: { // Neighbor is alive.
        // Check whether duplicates are allowed.
        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
        if (!__kmp_str_match_true(duplicate_ok)) {
          // That's not allowed. Issue a fatal error.
          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
        }
        KMP_INTERNAL_FREE(duplicate_ok);
        __kmp_duplicate_library_ok = 1;
        done = 1; // Exit the loop.
      } break;
      case 2: { // Neighbor is dead.
        // Clear the variable and try to register the library again.
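        // Illustrative (made-up) example of a stale value that lands us in
        // this case:
        //   __KMP_REGISTERED_LIB_4242=0x7f3a5c0021d0-cafe1234-libomp.so
        // i.e. "<&__kmp_registration_flag>-<flag value in hex>-<KMP_LIBRARY_FILE>".
        // If the process that wrote it has exited, the encoded address is no
        // longer mapped (or no longer holds the encoded value), so the
        // neighbor was classified as dead above; we drop the stale variable
        // and retry the registration loop.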
6442 __kmp_env_unset(name); 6443 } break; 6444 default: { KMP_DEBUG_ASSERT(0); } break; 6445 } 6446 } 6447 KMP_INTERNAL_FREE((void *)value); 6448 } 6449 KMP_INTERNAL_FREE((void *)name); 6450 6451 } // func __kmp_register_library_startup 6452 6453 void __kmp_unregister_library(void) { 6454 6455 char *name = __kmp_reg_status_name(); 6456 char *value = __kmp_env_get(name); 6457 6458 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6459 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6460 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6461 // Ok, this is our variable. Delete it. 6462 __kmp_env_unset(name); 6463 } 6464 6465 KMP_INTERNAL_FREE(__kmp_registration_str); 6466 KMP_INTERNAL_FREE(value); 6467 KMP_INTERNAL_FREE(name); 6468 6469 __kmp_registration_flag = 0; 6470 __kmp_registration_str = NULL; 6471 6472 } // __kmp_unregister_library 6473 6474 // End of Library registration stuff. 6475 // ----------------------------------------------------------------------------- 6476 6477 #if KMP_MIC_SUPPORTED 6478 6479 static void __kmp_check_mic_type() { 6480 kmp_cpuid_t cpuid_state = {0}; 6481 kmp_cpuid_t *cs_p = &cpuid_state; 6482 __kmp_x86_cpuid(1, 0, cs_p); 6483 // We don't support mic1 at the moment 6484 if ((cs_p->eax & 0xff0) == 0xB10) { 6485 __kmp_mic_type = mic2; 6486 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6487 __kmp_mic_type = mic3; 6488 } else { 6489 __kmp_mic_type = non_mic; 6490 } 6491 } 6492 6493 #endif /* KMP_MIC_SUPPORTED */ 6494 6495 static void __kmp_do_serial_initialize(void) { 6496 int i, gtid; 6497 int size; 6498 6499 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6500 6501 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6502 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6503 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6504 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6505 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6506 6507 #if OMPT_SUPPORT 6508 ompt_pre_init(); 6509 #endif 6510 6511 __kmp_validate_locks(); 6512 6513 /* Initialize internal memory allocator */ 6514 __kmp_init_allocator(); 6515 6516 /* Register the library startup via an environment variable and check to see 6517 whether another copy of the library is already registered. 
*/ 6518 6519 __kmp_register_library_startup(); 6520 6521 /* TODO reinitialization of library */ 6522 if (TCR_4(__kmp_global.g.g_done)) { 6523 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6524 } 6525 6526 __kmp_global.g.g_abort = 0; 6527 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6528 6529 /* initialize the locks */ 6530 #if KMP_USE_ADAPTIVE_LOCKS 6531 #if KMP_DEBUG_ADAPTIVE_LOCKS 6532 __kmp_init_speculative_stats(); 6533 #endif 6534 #endif 6535 #if KMP_STATS_ENABLED 6536 __kmp_stats_init(); 6537 #endif 6538 __kmp_init_lock(&__kmp_global_lock); 6539 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6540 __kmp_init_lock(&__kmp_debug_lock); 6541 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6542 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6543 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6544 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6545 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6546 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6547 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6548 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6549 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6550 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6551 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6552 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6553 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6554 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6555 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6556 #if KMP_USE_MONITOR 6557 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6558 #endif 6559 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6560 6561 /* conduct initialization and initial setup of configuration */ 6562 6563 __kmp_runtime_initialize(); 6564 6565 #if KMP_MIC_SUPPORTED 6566 __kmp_check_mic_type(); 6567 #endif 6568 6569 // Some global variable initialization moved here from kmp_env_initialize() 6570 #ifdef KMP_DEBUG 6571 kmp_diag = 0; 6572 #endif 6573 __kmp_abort_delay = 0; 6574 6575 // From __kmp_init_dflt_team_nth() 6576 /* assume the entire machine will be used */ 6577 __kmp_dflt_team_nth_ub = __kmp_xproc; 6578 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6579 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6580 } 6581 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6582 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6583 } 6584 __kmp_max_nth = __kmp_sys_max_nth; 6585 __kmp_cg_max_nth = __kmp_sys_max_nth; 6586 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6587 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6588 __kmp_teams_max_nth = __kmp_sys_max_nth; 6589 } 6590 6591 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6592 // part 6593 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6594 #if KMP_USE_MONITOR 6595 __kmp_monitor_wakeups = 6596 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6597 __kmp_bt_intervals = 6598 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6599 #endif 6600 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6601 __kmp_library = library_throughput; 6602 // From KMP_SCHEDULE initialization 6603 __kmp_static = kmp_sch_static_balanced; 6604 // AC: do not use analytical here, because it is non-monotonous 6605 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6606 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6607 // need to repeat assignment 6608 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6609 // bit control and barrier method control parts 6610 #if KMP_FAST_REDUCTION_BARRIER 6611 #define kmp_reduction_barrier_gather_bb ((int)1) 6612 #define kmp_reduction_barrier_release_bb ((int)1) 6613 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6614 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6615 #endif // KMP_FAST_REDUCTION_BARRIER 6616 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6617 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6618 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6619 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6620 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6621 #if KMP_FAST_REDUCTION_BARRIER 6622 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6623 // lin_64 ): hyper,1 6624 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6625 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6626 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6627 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6628 } 6629 #endif // KMP_FAST_REDUCTION_BARRIER 6630 } 6631 #if KMP_FAST_REDUCTION_BARRIER 6632 #undef kmp_reduction_barrier_release_pat 6633 #undef kmp_reduction_barrier_gather_pat 6634 #undef kmp_reduction_barrier_release_bb 6635 #undef kmp_reduction_barrier_gather_bb 6636 #endif // KMP_FAST_REDUCTION_BARRIER 6637 #if KMP_MIC_SUPPORTED 6638 if (__kmp_mic_type == mic2) { // KNC 6639 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6640 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6641 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6642 1; // forkjoin release 6643 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6644 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6645 } 6646 #if KMP_FAST_REDUCTION_BARRIER 6647 if (__kmp_mic_type == mic2) { // KNC 6648 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6649 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6650 } 6651 #endif // KMP_FAST_REDUCTION_BARRIER 6652 #endif // KMP_MIC_SUPPORTED 6653 6654 // From KMP_CHECKS initialization 6655 #ifdef KMP_DEBUG 6656 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6657 #else 6658 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6659 #endif 6660 6661 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6662 __kmp_foreign_tp = TRUE; 6663 6664 __kmp_global.g.g_dynamic = FALSE; 6665 __kmp_global.g.g_dynamic_mode = dynamic_default; 6666 6667 __kmp_env_initialize(NULL); 6668 6669 // Print all messages in message catalog for testing purposes. 
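/* Illustrative usage (debug builds only, per the #ifdef below): running the
   application with KMP_DUMP_CATALOG set to a "true" value, e.g.
       KMP_DUMP_CATALOG=1 ./app
   makes this block collect the whole i18n message catalog into a string
   buffer and print it via __kmp_printf(). */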
6670 #ifdef KMP_DEBUG 6671 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6672 if (__kmp_str_match_true(val)) { 6673 kmp_str_buf_t buffer; 6674 __kmp_str_buf_init(&buffer); 6675 __kmp_i18n_dump_catalog(&buffer); 6676 __kmp_printf("%s", buffer.str); 6677 __kmp_str_buf_free(&buffer); 6678 } 6679 __kmp_env_free(&val); 6680 #endif 6681 6682 __kmp_threads_capacity = 6683 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6684 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6685 __kmp_tp_capacity = __kmp_default_tp_capacity( 6686 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6687 6688 // If the library is shut down properly, both pools must be NULL. Just in 6689 // case, set them to NULL -- some memory may leak, but subsequent code will 6690 // work even if pools are not freed. 6691 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6692 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6693 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6694 __kmp_thread_pool = NULL; 6695 __kmp_thread_pool_insert_pt = NULL; 6696 __kmp_team_pool = NULL; 6697 6698 /* Allocate all of the variable sized records */ 6699 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6700 * expandable */ 6701 /* Since allocation is cache-aligned, just add extra padding at the end */ 6702 size = 6703 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6704 CACHE_LINE; 6705 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6706 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6707 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6708 6709 /* init thread counts */ 6710 KMP_DEBUG_ASSERT(__kmp_all_nth == 6711 0); // Asserts fail if the library is reinitializing and 6712 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6713 __kmp_all_nth = 0; 6714 __kmp_nth = 0; 6715 6716 /* setup the uber master thread and hierarchy */ 6717 gtid = __kmp_register_root(TRUE); 6718 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6719 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6720 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6721 6722 KMP_MB(); /* Flush all pending memory write invalidates. */ 6723 6724 __kmp_common_initialize(); 6725 6726 #if KMP_OS_UNIX 6727 /* invoke the child fork handler */ 6728 __kmp_register_atfork(); 6729 #endif 6730 6731 #if !KMP_DYNAMIC_LIB 6732 { 6733 /* Invoke the exit handler when the program finishes, only for static 6734 library. For dynamic library, we already have _fini and DllMain. */ 6735 int rc = atexit(__kmp_internal_end_atexit); 6736 if (rc != 0) { 6737 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6738 __kmp_msg_null); 6739 } 6740 } 6741 #endif 6742 6743 #if KMP_HANDLE_SIGNALS 6744 #if KMP_OS_UNIX 6745 /* NOTE: make sure that this is called before the user installs their own 6746 signal handlers so that the user handlers are called first. this way they 6747 can return false, not call our handler, avoid terminating the library, and 6748 continue execution where they left off. 
*/ 6749 __kmp_install_signals(FALSE); 6750 #endif /* KMP_OS_UNIX */ 6751 #if KMP_OS_WINDOWS 6752 __kmp_install_signals(TRUE); 6753 #endif /* KMP_OS_WINDOWS */ 6754 #endif 6755 6756 /* we have finished the serial initialization */ 6757 __kmp_init_counter++; 6758 6759 __kmp_init_serial = TRUE; 6760 6761 if (__kmp_settings) { 6762 __kmp_env_print(); 6763 } 6764 6765 if (__kmp_display_env || __kmp_display_env_verbose) { 6766 __kmp_env_print_2(); 6767 } 6768 6769 #if OMPT_SUPPORT 6770 ompt_post_init(); 6771 #endif 6772 6773 KMP_MB(); 6774 6775 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6776 } 6777 6778 void __kmp_serial_initialize(void) { 6779 if (__kmp_init_serial) { 6780 return; 6781 } 6782 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6783 if (__kmp_init_serial) { 6784 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6785 return; 6786 } 6787 __kmp_do_serial_initialize(); 6788 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6789 } 6790 6791 static void __kmp_do_middle_initialize(void) { 6792 int i, j; 6793 int prev_dflt_team_nth; 6794 6795 if (!__kmp_init_serial) { 6796 __kmp_do_serial_initialize(); 6797 } 6798 6799 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6800 6801 // Save the previous value for the __kmp_dflt_team_nth so that 6802 // we can avoid some reinitialization if it hasn't changed. 6803 prev_dflt_team_nth = __kmp_dflt_team_nth; 6804 6805 #if KMP_AFFINITY_SUPPORTED 6806 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6807 // number of cores on the machine. 6808 __kmp_affinity_initialize(); 6809 6810 // Run through the __kmp_threads array and set the affinity mask 6811 // for each root thread that is currently registered with the RTL. 6812 for (i = 0; i < __kmp_threads_capacity; i++) { 6813 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6814 __kmp_affinity_set_init_mask(i, TRUE); 6815 } 6816 } 6817 #endif /* KMP_AFFINITY_SUPPORTED */ 6818 6819 KMP_ASSERT(__kmp_xproc > 0); 6820 if (__kmp_avail_proc == 0) { 6821 __kmp_avail_proc = __kmp_xproc; 6822 } 6823 6824 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6825 // correct them now 6826 j = 0; 6827 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6828 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6829 __kmp_avail_proc; 6830 j++; 6831 } 6832 6833 if (__kmp_dflt_team_nth == 0) { 6834 #ifdef KMP_DFLT_NTH_CORES 6835 // Default #threads = #cores 6836 __kmp_dflt_team_nth = __kmp_ncores; 6837 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6838 "__kmp_ncores (%d)\n", 6839 __kmp_dflt_team_nth)); 6840 #else 6841 // Default #threads = #available OS procs 6842 __kmp_dflt_team_nth = __kmp_avail_proc; 6843 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6844 "__kmp_avail_proc(%d)\n", 6845 __kmp_dflt_team_nth)); 6846 #endif /* KMP_DFLT_NTH_CORES */ 6847 } 6848 6849 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 6850 __kmp_dflt_team_nth = KMP_MIN_NTH; 6851 } 6852 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 6853 __kmp_dflt_team_nth = __kmp_sys_max_nth; 6854 } 6855 6856 // There's no harm in continuing if the following check fails, 6857 // but it indicates an error in the previous logic. 
6858 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 6859 6860 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 6861 // Run through the __kmp_threads array and set the num threads icv for each 6862 // root thread that is currently registered with the RTL (which has not 6863 // already explicitly set its nthreads-var with a call to 6864 // omp_set_num_threads()). 6865 for (i = 0; i < __kmp_threads_capacity; i++) { 6866 kmp_info_t *thread = __kmp_threads[i]; 6867 if (thread == NULL) 6868 continue; 6869 if (thread->th.th_current_task->td_icvs.nproc != 0) 6870 continue; 6871 6872 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 6873 } 6874 } 6875 KA_TRACE( 6876 20, 6877 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 6878 __kmp_dflt_team_nth)); 6879 6880 #ifdef KMP_ADJUST_BLOCKTIME 6881 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 6882 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6883 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6884 if (__kmp_nth > __kmp_avail_proc) { 6885 __kmp_zero_bt = TRUE; 6886 } 6887 } 6888 #endif /* KMP_ADJUST_BLOCKTIME */ 6889 6890 /* we have finished middle initialization */ 6891 TCW_SYNC_4(__kmp_init_middle, TRUE); 6892 6893 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 6894 } 6895 6896 void __kmp_middle_initialize(void) { 6897 if (__kmp_init_middle) { 6898 return; 6899 } 6900 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6901 if (__kmp_init_middle) { 6902 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6903 return; 6904 } 6905 __kmp_do_middle_initialize(); 6906 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6907 } 6908 6909 void __kmp_parallel_initialize(void) { 6910 int gtid = __kmp_entry_gtid(); // this might be a new root 6911 6912 /* synchronize parallel initialization (for sibling) */ 6913 if (TCR_4(__kmp_init_parallel)) 6914 return; 6915 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6916 if (TCR_4(__kmp_init_parallel)) { 6917 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6918 return; 6919 } 6920 6921 /* TODO reinitialization after we have already shut down */ 6922 if (TCR_4(__kmp_global.g.g_done)) { 6923 KA_TRACE( 6924 10, 6925 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 6926 __kmp_infinite_loop(); 6927 } 6928 6929 /* jc: The lock __kmp_initz_lock is already held, so calling 6930 __kmp_serial_initialize would cause a deadlock. So we call 6931 __kmp_do_serial_initialize directly. */ 6932 if (!__kmp_init_middle) { 6933 __kmp_do_middle_initialize(); 6934 } 6935 __kmp_resume_if_hard_paused(); 6936 6937 /* begin initialization */ 6938 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 6939 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6940 6941 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6942 // Save the FP control regs. 6943 // Worker threads will set theirs to these values at thread startup. 
6944 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 6945 __kmp_store_mxcsr(&__kmp_init_mxcsr); 6946 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 6947 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 6948 6949 #if KMP_OS_UNIX 6950 #if KMP_HANDLE_SIGNALS 6951 /* must be after __kmp_serial_initialize */ 6952 __kmp_install_signals(TRUE); 6953 #endif 6954 #endif 6955 6956 __kmp_suspend_initialize(); 6957 6958 #if defined(USE_LOAD_BALANCE) 6959 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6960 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 6961 } 6962 #else 6963 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6964 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 6965 } 6966 #endif 6967 6968 if (__kmp_version) { 6969 __kmp_print_version_2(); 6970 } 6971 6972 /* we have finished parallel initialization */ 6973 TCW_SYNC_4(__kmp_init_parallel, TRUE); 6974 6975 KMP_MB(); 6976 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 6977 6978 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6979 } 6980 6981 /* ------------------------------------------------------------------------ */ 6982 6983 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6984 kmp_team_t *team) { 6985 kmp_disp_t *dispatch; 6986 6987 KMP_MB(); 6988 6989 /* none of the threads have encountered any constructs, yet. */ 6990 this_thr->th.th_local.this_construct = 0; 6991 #if KMP_CACHE_MANAGE 6992 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 6993 #endif /* KMP_CACHE_MANAGE */ 6994 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 6995 KMP_DEBUG_ASSERT(dispatch); 6996 KMP_DEBUG_ASSERT(team->t.t_dispatch); 6997 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 6998 // this_thr->th.th_info.ds.ds_tid ] ); 6999 7000 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7001 dispatch->th_doacross_buf_idx = 0; // reset doacross dispatch buffer counter 7002 if (__kmp_env_consistency_check) 7003 __kmp_push_parallel(gtid, team->t.t_ident); 7004 7005 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7006 } 7007 7008 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7009 kmp_team_t *team) { 7010 if (__kmp_env_consistency_check) 7011 __kmp_pop_parallel(gtid, team->t.t_ident); 7012 7013 __kmp_finish_implicit_task(this_thr); 7014 } 7015 7016 int __kmp_invoke_task_func(int gtid) { 7017 int rc; 7018 int tid = __kmp_tid_from_gtid(gtid); 7019 kmp_info_t *this_thr = __kmp_threads[gtid]; 7020 kmp_team_t *team = this_thr->th.th_team; 7021 7022 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7023 #if USE_ITT_BUILD 7024 if (__itt_stack_caller_create_ptr) { 7025 __kmp_itt_stack_callee_enter( 7026 (__itt_caller) 7027 team->t.t_stack_id); // inform ittnotify about entering user's code 7028 } 7029 #endif /* USE_ITT_BUILD */ 7030 #if INCLUDE_SSC_MARKS 7031 SSC_MARK_INVOKING(); 7032 #endif 7033 7034 #if OMPT_SUPPORT 7035 void *dummy; 7036 void **exit_frame_p; 7037 ompt_data_t *my_task_data; 7038 ompt_data_t *my_parallel_data; 7039 int ompt_team_size; 7040 7041 if (ompt_enabled.enabled) { 7042 exit_frame_p = &( 7043 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr); 7044 } else { 7045 exit_frame_p = &dummy; 7046 } 7047 7048 my_task_data = 7049 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7050 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7051 if (ompt_enabled.ompt_callback_implicit_task) { 7052 ompt_team_size = team->t.t_nproc; 7053 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7054 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7055 __kmp_tid_from_gtid(gtid), ompt_task_implicit); 7056 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7057 } 7058 #endif 7059 7060 #if KMP_STATS_ENABLED 7061 stats_state_e previous_state = KMP_GET_THREAD_STATE(); 7062 if (previous_state == stats_state_e::TEAMS_REGION) { 7063 KMP_PUSH_PARTITIONED_TIMER(OMP_teams); 7064 } else { 7065 KMP_PUSH_PARTITIONED_TIMER(OMP_parallel); 7066 } 7067 KMP_SET_THREAD_STATE(IMPLICIT_TASK); 7068 #endif 7069 7070 rc = __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7071 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7072 #if OMPT_SUPPORT 7073 , 7074 exit_frame_p 7075 #endif 7076 ); 7077 #if OMPT_SUPPORT 7078 *exit_frame_p = NULL; 7079 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_team; 7080 #endif 7081 7082 #if KMP_STATS_ENABLED 7083 if (previous_state == stats_state_e::TEAMS_REGION) { 7084 KMP_SET_THREAD_STATE(previous_state); 7085 } 7086 KMP_POP_PARTITIONED_TIMER(); 7087 #endif 7088 7089 #if USE_ITT_BUILD 7090 if (__itt_stack_caller_create_ptr) { 7091 __kmp_itt_stack_callee_leave( 7092 (__itt_caller) 7093 team->t.t_stack_id); // inform ittnotify about leaving user's code 7094 } 7095 #endif /* USE_ITT_BUILD */ 7096 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7097 7098 return rc; 7099 } 7100 7101 void __kmp_teams_master(int gtid) { 7102 // This routine is called by all master threads in teams construct 7103 kmp_info_t *thr = __kmp_threads[gtid]; 7104 kmp_team_t *team = thr->th.th_team; 7105 ident_t *loc = team->t.t_ident; 7106 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7107 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7108 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7109 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7110 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7111 7112 // This thread is a new CG root. Set up the proper variables. 
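/* Sketch of the resulting list (illustrative): after the push below, the teams
   master's contention-group chain looks like
       thr->th.th_cg_roots -> { cg_root = thr,
                                cg_thread_limit = thread_limit ICV at fork,
                                cg_nthreads = 1,
                                up -> previous CG root entry }
   so the previous thread limit stays reachable through the up link and can be
   restored when this entry is popped later. */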
7113 kmp_cg_root_t *tmp = (kmp_cg_root_t *)__kmp_allocate(sizeof(kmp_cg_root_t)); 7114 tmp->cg_root = thr; // Make thr the CG root 7115 // Init to thread limit that was stored when league masters were forked 7116 tmp->cg_thread_limit = thr->th.th_current_task->td_icvs.thread_limit; 7117 tmp->cg_nthreads = 1; // Init counter to one active thread, this one 7118 KA_TRACE(100, ("__kmp_teams_master: Thread %p created node %p and init" 7119 " cg_nthreads to 1\n", 7120 thr, tmp)); 7121 tmp->up = thr->th.th_cg_roots; 7122 thr->th.th_cg_roots = tmp; 7123 7124 // Launch league of teams now, but not let workers execute 7125 // (they hang on fork barrier until next parallel) 7126 #if INCLUDE_SSC_MARKS 7127 SSC_MARK_FORKING(); 7128 #endif 7129 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7130 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7131 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7132 #if INCLUDE_SSC_MARKS 7133 SSC_MARK_JOINING(); 7134 #endif 7135 // If the team size was reduced from the limit, set it to the new size 7136 if (thr->th.th_team_nproc < thr->th.th_teams_size.nth) 7137 thr->th.th_teams_size.nth = thr->th.th_team_nproc; 7138 // AC: last parameter "1" eliminates join barrier which won't work because 7139 // worker threads are in a fork barrier waiting for more parallel regions 7140 __kmp_join_call(loc, gtid 7141 #if OMPT_SUPPORT 7142 , 7143 fork_context_intel 7144 #endif 7145 , 7146 1); 7147 } 7148 7149 int __kmp_invoke_teams_master(int gtid) { 7150 kmp_info_t *this_thr = __kmp_threads[gtid]; 7151 kmp_team_t *team = this_thr->th.th_team; 7152 #if KMP_DEBUG 7153 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7154 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7155 (void *)__kmp_teams_master); 7156 #endif 7157 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7158 #if OMPT_SUPPORT 7159 int tid = __kmp_tid_from_gtid(gtid); 7160 ompt_data_t *task_data = 7161 &team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data; 7162 ompt_data_t *parallel_data = &team->t.ompt_team_info.parallel_data; 7163 if (ompt_enabled.ompt_callback_implicit_task) { 7164 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7165 ompt_scope_begin, parallel_data, task_data, team->t.t_nproc, tid, 7166 ompt_task_initial); 7167 OMPT_CUR_TASK_INFO(this_thr)->thread_num = tid; 7168 } 7169 #endif 7170 __kmp_teams_master(gtid); 7171 #if OMPT_SUPPORT 7172 this_thr->th.ompt_thread_info.parallel_flags |= ompt_parallel_league; 7173 #endif 7174 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7175 return 1; 7176 } 7177 7178 /* this sets the requested number of threads for the next parallel region 7179 encountered by this team. since this should be enclosed in the forkjoin 7180 critical section it should avoid race conditions with asymmetrical nested 7181 parallelism */ 7182 7183 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7184 kmp_info_t *thr = __kmp_threads[gtid]; 7185 7186 if (num_threads > 0) 7187 thr->th.th_set_nproc = num_threads; 7188 } 7189 7190 /* this sets the requested number of teams for the teams region and/or 7191 the number of threads for the next parallel region encountered */ 7192 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7193 int num_threads) { 7194 kmp_info_t *thr = __kmp_threads[gtid]; 7195 KMP_DEBUG_ASSERT(num_teams >= 0); 7196 KMP_DEBUG_ASSERT(num_threads >= 0); 7197 7198 if (num_teams == 0) 7199 num_teams = 1; // default number of teams is 1. 
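/* Worked example with made-up numbers for the clamping below: suppose
   __kmp_avail_proc = 32, __kmp_dflt_team_nth = 16, __kmp_teams_max_nth = 256,
   thread-limit-var = 8, and the user requested num_teams = 4 with no
   thread_limit clause (num_threads == 0). Then
       num_threads = 32 / 4 = 8;   // available procs split across teams
   8 does not exceed the nthreads-var (16) or thread-limit-var (8) caps, and
   4 * 8 = 32 stays under __kmp_teams_max_nth, so th_teams_size ends up as
   { nteams = 4, nth = 8 }. */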
7200 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7201 if (!__kmp_reserve_warn) { 7202 __kmp_reserve_warn = 1; 7203 __kmp_msg(kmp_ms_warning, 7204 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7205 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7206 } 7207 num_teams = __kmp_teams_max_nth; 7208 } 7209 // Set number of teams (number of threads in the outer "parallel" of the 7210 // teams) 7211 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7212 7213 // Remember the number of threads for inner parallel regions 7214 if (!TCR_4(__kmp_init_middle)) 7215 __kmp_middle_initialize(); // get internal globals calculated 7216 KMP_DEBUG_ASSERT(__kmp_avail_proc); 7217 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth); 7218 if (num_threads == 0) { 7219 num_threads = __kmp_avail_proc / num_teams; 7220 // adjust num_threads w/o warning as it is not user setting 7221 // num_threads = min(num_threads, nthreads-var, thread-limit-var) 7222 // no thread_limit clause specified - do not change thread-limit-var ICV 7223 if (num_threads > __kmp_dflt_team_nth) { 7224 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7225 } 7226 if (num_threads > thr->th.th_current_task->td_icvs.thread_limit) { 7227 num_threads = thr->th.th_current_task->td_icvs.thread_limit; 7228 } // prevent team size to exceed thread-limit-var 7229 if (num_teams * num_threads > __kmp_teams_max_nth) { 7230 num_threads = __kmp_teams_max_nth / num_teams; 7231 } 7232 } else { 7233 // This thread will be the master of the league masters 7234 // Store new thread limit; old limit is saved in th_cg_roots list 7235 thr->th.th_current_task->td_icvs.thread_limit = num_threads; 7236 // num_threads = min(num_threads, nthreads-var) 7237 if (num_threads > __kmp_dflt_team_nth) { 7238 num_threads = __kmp_dflt_team_nth; // honor nthreads-var ICV 7239 } 7240 if (num_teams * num_threads > __kmp_teams_max_nth) { 7241 int new_threads = __kmp_teams_max_nth / num_teams; 7242 if (!__kmp_reserve_warn) { // user asked for too many threads 7243 __kmp_reserve_warn = 1; // conflicts with KMP_TEAMS_THREAD_LIMIT 7244 __kmp_msg(kmp_ms_warning, 7245 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7246 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7247 } 7248 num_threads = new_threads; 7249 } 7250 } 7251 thr->th.th_teams_size.nth = num_threads; 7252 } 7253 7254 // Set the proc_bind var to use in the following parallel region. 7255 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7256 kmp_info_t *thr = __kmp_threads[gtid]; 7257 thr->th.th_set_proc_bind = proc_bind; 7258 } 7259 7260 /* Launch the worker threads into the microtask. */ 7261 7262 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7263 kmp_info_t *this_thr = __kmp_threads[gtid]; 7264 7265 #ifdef KMP_DEBUG 7266 int f; 7267 #endif /* KMP_DEBUG */ 7268 7269 KMP_DEBUG_ASSERT(team); 7270 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7271 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7272 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7273 7274 team->t.t_construct = 0; /* no single directives seen yet */ 7275 team->t.t_ordered.dt.t_value = 7276 0; /* thread 0 enters the ordered section first */ 7277 7278 /* Reset the identifiers on the dispatch buffer */ 7279 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7280 if (team->t.t_max_nproc > 1) { 7281 int i; 7282 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7283 team->t.t_disp_buffer[i].buffer_index = i; 7284 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7285 } 7286 } else { 7287 team->t.t_disp_buffer[0].buffer_index = 0; 7288 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7289 } 7290 7291 KMP_MB(); /* Flush all pending memory write invalidates. */ 7292 KMP_ASSERT(this_thr->th.th_team == team); 7293 7294 #ifdef KMP_DEBUG 7295 for (f = 0; f < team->t.t_nproc; f++) { 7296 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7297 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7298 } 7299 #endif /* KMP_DEBUG */ 7300 7301 /* release the worker threads so they may begin working */ 7302 __kmp_fork_barrier(gtid, 0); 7303 } 7304 7305 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7306 kmp_info_t *this_thr = __kmp_threads[gtid]; 7307 7308 KMP_DEBUG_ASSERT(team); 7309 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7310 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7311 KMP_MB(); /* Flush all pending memory write invalidates. */ 7312 7313 /* Join barrier after fork */ 7314 7315 #ifdef KMP_DEBUG 7316 if (__kmp_threads[gtid] && 7317 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7318 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7319 __kmp_threads[gtid]); 7320 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7321 "team->t.t_nproc=%d\n", 7322 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7323 team->t.t_nproc); 7324 __kmp_print_structure(); 7325 } 7326 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7327 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7328 #endif /* KMP_DEBUG */ 7329 7330 __kmp_join_barrier(gtid); /* wait for everyone */ 7331 #if OMPT_SUPPORT 7332 if (ompt_enabled.enabled && 7333 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7334 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7335 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7336 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7337 #if OMPT_OPTIONAL 7338 void *codeptr = NULL; 7339 if (KMP_MASTER_TID(ds_tid) && 7340 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7341 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7342 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7343 7344 if (ompt_enabled.ompt_callback_sync_region_wait) { 7345 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7346 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7347 codeptr); 7348 } 7349 if (ompt_enabled.ompt_callback_sync_region) { 7350 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7351 ompt_sync_region_barrier_implicit, ompt_scope_end, NULL, task_data, 7352 codeptr); 7353 } 7354 #endif 7355 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7356 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7357 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7358 } 7359 } 7360 #endif 7361 7362 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7363 KMP_ASSERT(this_thr->th.th_team == team); 7364 } 7365 7366 /* ------------------------------------------------------------------------ */ 7367 7368 #ifdef USE_LOAD_BALANCE 7369 7370 // Return the worker threads actively spinning in the hot team, if we 7371 // are at the outermost level of parallelism. Otherwise, return 0. 7372 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7373 int i; 7374 int retval; 7375 kmp_team_t *hot_team; 7376 7377 if (root->r.r_active) { 7378 return 0; 7379 } 7380 hot_team = root->r.r_hot_team; 7381 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7382 return hot_team->t.t_nproc - 1; // Don't count master thread 7383 } 7384 7385 // Skip the master thread - it is accounted for elsewhere. 7386 retval = 0; 7387 for (i = 1; i < hot_team->t.t_nproc; i++) { 7388 if (hot_team->t.t_threads[i]->th.th_active) { 7389 retval++; 7390 } 7391 } 7392 return retval; 7393 } 7394 7395 // Perform an automatic adjustment to the number of 7396 // threads used by the next parallel region. 7397 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7398 int retval; 7399 int pool_active; 7400 int hot_team_active; 7401 int team_curr_active; 7402 int system_active; 7403 7404 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7405 set_nproc)); 7406 KMP_DEBUG_ASSERT(root); 7407 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7408 ->th.th_current_task->td_icvs.dynamic == TRUE); 7409 KMP_DEBUG_ASSERT(set_nproc > 1); 7410 7411 if (set_nproc == 1) { 7412 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7413 return 1; 7414 } 7415 7416 // Threads that are active in the thread pool, active in the hot team for this 7417 // particular root (if we are at the outer par level), and the currently 7418 // executing thread (to become the master) are available to add to the new 7419 // team, but are currently contributing to the system load, and must be 7420 // accounted for. 7421 pool_active = __kmp_thread_pool_active_nth; 7422 hot_team_active = __kmp_active_hot_team_nproc(root); 7423 team_curr_active = pool_active + hot_team_active + 1; 7424 7425 // Check the system load. 7426 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7427 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7428 "hot team active = %d\n", 7429 system_active, pool_active, hot_team_active)); 7430 7431 if (system_active < 0) { 7432 // There was an error reading the necessary info from /proc, so use the 7433 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7434 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7435 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7436 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7437 7438 // Make this call behave like the thread limit algorithm. 7439 retval = __kmp_avail_proc - __kmp_nth + 7440 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7441 if (retval > set_nproc) { 7442 retval = set_nproc; 7443 } 7444 if (retval < KMP_MIN_NTH) { 7445 retval = KMP_MIN_NTH; 7446 } 7447 7448 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7449 retval)); 7450 return retval; 7451 } 7452 7453 // There is a slight delay in the load balance algorithm in detecting new 7454 // running procs. The real system load at this instant should be at least as 7455 // large as the #active omp thread that are available to add to the team. 
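/* Worked example with made-up numbers for the formula below: with
   __kmp_avail_proc = 8, team_curr_active = 3 (pool + hot team + this thread)
   and a measured system_active = 5, the estimate is
       retval = 8 - 5 + 3 = 6,
   which is then clamped into [KMP_MIN_NTH, set_nproc]. If the load reading
   came back lower than team_curr_active, it is first raised to 3, giving the
   full 8 before clamping. */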
7456 if (system_active < team_curr_active) { 7457 system_active = team_curr_active; 7458 } 7459 retval = __kmp_avail_proc - system_active + team_curr_active; 7460 if (retval > set_nproc) { 7461 retval = set_nproc; 7462 } 7463 if (retval < KMP_MIN_NTH) { 7464 retval = KMP_MIN_NTH; 7465 } 7466 7467 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. retval:%d\n", retval)); 7468 return retval; 7469 } // __kmp_load_balance_nproc() 7470 7471 #endif /* USE_LOAD_BALANCE */ 7472 7473 /* ------------------------------------------------------------------------ */ 7474 7475 /* NOTE: this is called with the __kmp_init_lock held */ 7476 void __kmp_cleanup(void) { 7477 int f; 7478 7479 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7480 7481 if (TCR_4(__kmp_init_parallel)) { 7482 #if KMP_HANDLE_SIGNALS 7483 __kmp_remove_signals(); 7484 #endif 7485 TCW_4(__kmp_init_parallel, FALSE); 7486 } 7487 7488 if (TCR_4(__kmp_init_middle)) { 7489 #if KMP_AFFINITY_SUPPORTED 7490 __kmp_affinity_uninitialize(); 7491 #endif /* KMP_AFFINITY_SUPPORTED */ 7492 __kmp_cleanup_hierarchy(); 7493 TCW_4(__kmp_init_middle, FALSE); 7494 } 7495 7496 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7497 7498 if (__kmp_init_serial) { 7499 __kmp_runtime_destroy(); 7500 __kmp_init_serial = FALSE; 7501 } 7502 7503 __kmp_cleanup_threadprivate_caches(); 7504 7505 for (f = 0; f < __kmp_threads_capacity; f++) { 7506 if (__kmp_root[f] != NULL) { 7507 __kmp_free(__kmp_root[f]); 7508 __kmp_root[f] = NULL; 7509 } 7510 } 7511 __kmp_free(__kmp_threads); 7512 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7513 // there is no need in freeing __kmp_root. 7514 __kmp_threads = NULL; 7515 __kmp_root = NULL; 7516 __kmp_threads_capacity = 0; 7517 7518 #if KMP_USE_DYNAMIC_LOCK 7519 __kmp_cleanup_indirect_user_locks(); 7520 #else 7521 __kmp_cleanup_user_locks(); 7522 #endif 7523 7524 #if KMP_AFFINITY_SUPPORTED 7525 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7526 __kmp_cpuinfo_file = NULL; 7527 #endif /* KMP_AFFINITY_SUPPORTED */ 7528 7529 #if KMP_USE_ADAPTIVE_LOCKS 7530 #if KMP_DEBUG_ADAPTIVE_LOCKS 7531 __kmp_print_speculative_stats(); 7532 #endif 7533 #endif 7534 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7535 __kmp_nested_nth.nth = NULL; 7536 __kmp_nested_nth.size = 0; 7537 __kmp_nested_nth.used = 0; 7538 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7539 __kmp_nested_proc_bind.bind_types = NULL; 7540 __kmp_nested_proc_bind.size = 0; 7541 __kmp_nested_proc_bind.used = 0; 7542 if (__kmp_affinity_format) { 7543 KMP_INTERNAL_FREE(__kmp_affinity_format); 7544 __kmp_affinity_format = NULL; 7545 } 7546 7547 __kmp_i18n_catclose(); 7548 7549 #if KMP_USE_HIER_SCHED 7550 __kmp_hier_scheds.deallocate(); 7551 #endif 7552 7553 #if KMP_STATS_ENABLED 7554 __kmp_stats_fini(); 7555 #endif 7556 7557 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7558 } 7559 7560 /* ------------------------------------------------------------------------ */ 7561 7562 int __kmp_ignore_mppbeg(void) { 7563 char *env; 7564 7565 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7566 if (__kmp_str_match_false(env)) 7567 return FALSE; 7568 } 7569 // By default __kmpc_begin() is no-op. 7570 return TRUE; 7571 } 7572 7573 int __kmp_ignore_mppend(void) { 7574 char *env; 7575 7576 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7577 if (__kmp_str_match_false(env)) 7578 return FALSE; 7579 } 7580 // By default __kmpc_end() is no-op. 
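// (Illustrative) a user can flip that default by exporting a "false" value,
// e.g. KMP_IGNORE_MPPEND=0; the early return above then yields FALSE and lets
// __kmpc_end() actually run its termination logic instead of being ignored.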
7581 return TRUE; 7582 } 7583 7584 void __kmp_internal_begin(void) { 7585 int gtid; 7586 kmp_root_t *root; 7587 7588 /* this is a very important step as it will register new sibling threads 7589 and assign these new uber threads a new gtid */ 7590 gtid = __kmp_entry_gtid(); 7591 root = __kmp_threads[gtid]->th.th_root; 7592 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7593 7594 if (root->r.r_begin) 7595 return; 7596 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7597 if (root->r.r_begin) { 7598 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7599 return; 7600 } 7601 7602 root->r.r_begin = TRUE; 7603 7604 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7605 } 7606 7607 /* ------------------------------------------------------------------------ */ 7608 7609 void __kmp_user_set_library(enum library_type arg) { 7610 int gtid; 7611 kmp_root_t *root; 7612 kmp_info_t *thread; 7613 7614 /* first, make sure we are initialized so we can get our gtid */ 7615 7616 gtid = __kmp_entry_gtid(); 7617 thread = __kmp_threads[gtid]; 7618 7619 root = thread->th.th_root; 7620 7621 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7622 library_serial)); 7623 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7624 thread */ 7625 KMP_WARNING(SetLibraryIncorrectCall); 7626 return; 7627 } 7628 7629 switch (arg) { 7630 case library_serial: 7631 thread->th.th_set_nproc = 0; 7632 set__nproc(thread, 1); 7633 break; 7634 case library_turnaround: 7635 thread->th.th_set_nproc = 0; 7636 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7637 : __kmp_dflt_team_nth_ub); 7638 break; 7639 case library_throughput: 7640 thread->th.th_set_nproc = 0; 7641 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7642 : __kmp_dflt_team_nth_ub); 7643 break; 7644 default: 7645 KMP_FATAL(UnknownLibraryType, arg); 7646 } 7647 7648 __kmp_aux_set_library(arg); 7649 } 7650 7651 void __kmp_aux_set_stacksize(size_t arg) { 7652 if (!__kmp_init_serial) 7653 __kmp_serial_initialize(); 7654 7655 #if KMP_OS_DARWIN 7656 if (arg & (0x1000 - 1)) { 7657 arg &= ~(0x1000 - 1); 7658 if (arg + 0x1000) /* check for overflow if we round up */ 7659 arg += 0x1000; 7660 } 7661 #endif 7662 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7663 7664 /* only change the default stacksize before the first parallel region */ 7665 if (!TCR_4(__kmp_init_parallel)) { 7666 size_t value = arg; /* argument is in bytes */ 7667 7668 if (value < __kmp_sys_min_stksize) 7669 value = __kmp_sys_min_stksize; 7670 else if (value > KMP_MAX_STKSIZE) 7671 value = KMP_MAX_STKSIZE; 7672 7673 __kmp_stksize = value; 7674 7675 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7676 } 7677 7678 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7679 } 7680 7681 /* set the behaviour of the runtime library */ 7682 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7683 void __kmp_aux_set_library(enum library_type arg) { 7684 __kmp_library = arg; 7685 7686 switch (__kmp_library) { 7687 case library_serial: { 7688 KMP_INFORM(LibraryIsSerial); 7689 } break; 7690 case library_turnaround: 7691 if (__kmp_use_yield == 1 && !__kmp_use_yield_exp_set) 7692 __kmp_use_yield = 2; // only yield when oversubscribed 7693 break; 7694 case library_throughput: 7695 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) 7696 __kmp_dflt_blocktime = 200; 7697 break; 7698 default: 7699 KMP_FATAL(UnknownLibraryType, arg); 7700 } 7701 } 7702 7703 /* Getting team information common for all team API */ 7704 // Returns NULL if not in teams construct 7705 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 7706 kmp_info_t *thr = __kmp_entry_thread(); 7707 teams_serialized = 0; 7708 if (thr->th.th_teams_microtask) { 7709 kmp_team_t *team = thr->th.th_team; 7710 int tlevel = thr->th.th_teams_level; // the level of the teams construct 7711 int ii = team->t.t_level; 7712 teams_serialized = team->t.t_serialized; 7713 int level = tlevel + 1; 7714 KMP_DEBUG_ASSERT(ii >= tlevel); 7715 while (ii > level) { 7716 for (teams_serialized = team->t.t_serialized; 7717 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 7718 } 7719 if (team->t.t_serialized && (!teams_serialized)) { 7720 team = team->t.t_parent; 7721 continue; 7722 } 7723 if (ii > level) { 7724 team = team->t.t_parent; 7725 ii--; 7726 } 7727 } 7728 return team; 7729 } 7730 return NULL; 7731 } 7732 7733 int __kmp_aux_get_team_num() { 7734 int serialized; 7735 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7736 if (team) { 7737 if (serialized > 1) { 7738 return 0; // teams region is serialized ( 1 team of 1 thread ). 7739 } else { 7740 return team->t.t_master_tid; 7741 } 7742 } 7743 return 0; 7744 } 7745 7746 int __kmp_aux_get_num_teams() { 7747 int serialized; 7748 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7749 if (team) { 7750 if (serialized > 1) { 7751 return 1; 7752 } else { 7753 return team->t.t_parent->t.t_nproc; 7754 } 7755 } 7756 return 1; 7757 } 7758 7759 /* ------------------------------------------------------------------------ */ 7760 7761 /* 7762 * Affinity Format Parser 7763 * 7764 * Field is in form of: %[[[0].]size]type 7765 * % and type are required (%% means print a literal '%') 7766 * type is either single char or long name surrounded by {}, 7767 * e.g., N or {num_threads} 7768 * 0 => leading zeros 7769 * . => right justified when size is specified 7770 * by default output is left justified 7771 * size is the *minimum* field length 7772 * All other characters are printed as is 7773 * 7774 * Available field types: 7775 * L {thread_level} - omp_get_level() 7776 * n {thread_num} - omp_get_thread_num() 7777 * h {host} - name of host machine 7778 * P {process_id} - process id (integer) 7779 * T {thread_identifier} - native thread identifier (integer) 7780 * N {num_threads} - omp_get_num_threads() 7781 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 7782 * a {thread_affinity} - comma separated list of integers or integer ranges 7783 * (values of affinity mask) 7784 * 7785 * Implementation-specific field types can be added 7786 * If a type is unknown, print "undefined" 7787 */ 7788 7789 // Structure holding the short name, long name, and corresponding data type 7790 // for snprintf. A table of these will represent the entire valid keyword 7791 // field types. 
7792 typedef struct kmp_affinity_format_field_t { 7793 char short_name; // from spec e.g., L -> thread level 7794 const char *long_name; // from spec thread_level -> thread level 7795 char field_format; // data type for snprintf (typically 'd' or 's' 7796 // for integer or string) 7797 } kmp_affinity_format_field_t; 7798 7799 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 7800 #if KMP_AFFINITY_SUPPORTED 7801 {'A', "thread_affinity", 's'}, 7802 #endif 7803 {'t', "team_num", 'd'}, 7804 {'T', "num_teams", 'd'}, 7805 {'L', "nesting_level", 'd'}, 7806 {'n', "thread_num", 'd'}, 7807 {'N', "num_threads", 'd'}, 7808 {'a', "ancestor_tnum", 'd'}, 7809 {'H', "host", 's'}, 7810 {'P', "process_id", 'd'}, 7811 {'i', "native_thread_id", 'd'}}; 7812 7813 // Return the number of characters it takes to hold field 7814 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 7815 const char **ptr, 7816 kmp_str_buf_t *field_buffer) { 7817 int rc, format_index, field_value; 7818 const char *width_left, *width_right; 7819 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 7820 static const int FORMAT_SIZE = 20; 7821 char format[FORMAT_SIZE] = {0}; 7822 char absolute_short_name = 0; 7823 7824 KMP_DEBUG_ASSERT(gtid >= 0); 7825 KMP_DEBUG_ASSERT(th); 7826 KMP_DEBUG_ASSERT(**ptr == '%'); 7827 KMP_DEBUG_ASSERT(field_buffer); 7828 7829 __kmp_str_buf_clear(field_buffer); 7830 7831 // Skip the initial % 7832 (*ptr)++; 7833 7834 // Check for %% first 7835 if (**ptr == '%') { 7836 __kmp_str_buf_cat(field_buffer, "%", 1); 7837 (*ptr)++; // skip over the second % 7838 return 1; 7839 } 7840 7841 // Parse field modifiers if they are present 7842 pad_zeros = false; 7843 if (**ptr == '0') { 7844 pad_zeros = true; 7845 (*ptr)++; // skip over 0 7846 } 7847 right_justify = false; 7848 if (**ptr == '.') { 7849 right_justify = true; 7850 (*ptr)++; // skip over . 7851 } 7852 // Parse width of field: [width_left, width_right) 7853 width_left = width_right = NULL; 7854 if (**ptr >= '0' && **ptr <= '9') { 7855 width_left = *ptr; 7856 SKIP_DIGITS(*ptr); 7857 width_right = *ptr; 7858 } 7859 7860 // Create the format for KMP_SNPRINTF based on flags parsed above 7861 format_index = 0; 7862 format[format_index++] = '%'; 7863 if (!right_justify) 7864 format[format_index++] = '-'; 7865 if (pad_zeros) 7866 format[format_index++] = '0'; 7867 if (width_left && width_right) { 7868 int i = 0; 7869 // Only allow 8 digit number widths. 
7870 // This also prevents overflowing format variable 7871 while (i < 8 && width_left < width_right) { 7872 format[format_index++] = *width_left; 7873 width_left++; 7874 i++; 7875 } 7876 } 7877 7878 // Parse a name (long or short) 7879 // Canonicalize the name into absolute_short_name 7880 found_valid_name = false; 7881 parse_long_name = (**ptr == '{'); 7882 if (parse_long_name) 7883 (*ptr)++; // skip initial left brace 7884 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 7885 sizeof(__kmp_affinity_format_table[0]); 7886 ++i) { 7887 char short_name = __kmp_affinity_format_table[i].short_name; 7888 const char *long_name = __kmp_affinity_format_table[i].long_name; 7889 char field_format = __kmp_affinity_format_table[i].field_format; 7890 if (parse_long_name) { 7891 int length = KMP_STRLEN(long_name); 7892 if (strncmp(*ptr, long_name, length) == 0) { 7893 found_valid_name = true; 7894 (*ptr) += length; // skip the long name 7895 } 7896 } else if (**ptr == short_name) { 7897 found_valid_name = true; 7898 (*ptr)++; // skip the short name 7899 } 7900 if (found_valid_name) { 7901 format[format_index++] = field_format; 7902 format[format_index++] = '\0'; 7903 absolute_short_name = short_name; 7904 break; 7905 } 7906 } 7907 if (parse_long_name) { 7908 if (**ptr != '}') { 7909 absolute_short_name = 0; 7910 } else { 7911 (*ptr)++; // skip over the right brace 7912 } 7913 } 7914 7915 // Attempt to fill the buffer with the requested 7916 // value using snprintf within __kmp_str_buf_print() 7917 switch (absolute_short_name) { 7918 case 't': 7919 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 7920 break; 7921 case 'T': 7922 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 7923 break; 7924 case 'L': 7925 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 7926 break; 7927 case 'n': 7928 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 7929 break; 7930 case 'H': { 7931 static const int BUFFER_SIZE = 256; 7932 char buf[BUFFER_SIZE]; 7933 __kmp_expand_host_name(buf, BUFFER_SIZE); 7934 rc = __kmp_str_buf_print(field_buffer, format, buf); 7935 } break; 7936 case 'P': 7937 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 7938 break; 7939 case 'i': 7940 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 7941 break; 7942 case 'N': 7943 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 7944 break; 7945 case 'a': 7946 field_value = 7947 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 7948 rc = __kmp_str_buf_print(field_buffer, format, field_value); 7949 break; 7950 #if KMP_AFFINITY_SUPPORTED 7951 case 'A': { 7952 kmp_str_buf_t buf; 7953 __kmp_str_buf_init(&buf); 7954 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 7955 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 7956 __kmp_str_buf_free(&buf); 7957 } break; 7958 #endif 7959 default: 7960 // According to spec, If an implementation does not have info for field 7961 // type, then "undefined" is printed 7962 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 7963 // Skip the field 7964 if (parse_long_name) { 7965 SKIP_TOKEN(*ptr); 7966 if (**ptr == '}') 7967 (*ptr)++; 7968 } else { 7969 (*ptr)++; 7970 } 7971 } 7972 7973 KMP_ASSERT(format_index <= FORMAT_SIZE); 7974 return rc; 7975 } 7976 7977 /* 7978 * Return number of characters needed to hold the affinity string 7979 * (not including null byte character) 7980 * The resultant string is printed to buffer, which 
the caller can then 7981 * handle afterwards 7982 */ 7983 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 7984 kmp_str_buf_t *buffer) { 7985 const char *parse_ptr; 7986 size_t retval; 7987 const kmp_info_t *th; 7988 kmp_str_buf_t field; 7989 7990 KMP_DEBUG_ASSERT(buffer); 7991 KMP_DEBUG_ASSERT(gtid >= 0); 7992 7993 __kmp_str_buf_init(&field); 7994 __kmp_str_buf_clear(buffer); 7995 7996 th = __kmp_threads[gtid]; 7997 retval = 0; 7998 7999 // If format is NULL or zero-length string, then we use 8000 // affinity-format-var ICV 8001 parse_ptr = format; 8002 if (parse_ptr == NULL || *parse_ptr == '\0') { 8003 parse_ptr = __kmp_affinity_format; 8004 } 8005 KMP_DEBUG_ASSERT(parse_ptr); 8006 8007 while (*parse_ptr != '\0') { 8008 // Parse a field 8009 if (*parse_ptr == '%') { 8010 // Put field in the buffer 8011 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 8012 __kmp_str_buf_catbuf(buffer, &field); 8013 retval += rc; 8014 } else { 8015 // Put literal character in buffer 8016 __kmp_str_buf_cat(buffer, parse_ptr, 1); 8017 retval++; 8018 parse_ptr++; 8019 } 8020 } 8021 __kmp_str_buf_free(&field); 8022 return retval; 8023 } 8024 8025 // Displays the affinity string to stdout 8026 void __kmp_aux_display_affinity(int gtid, const char *format) { 8027 kmp_str_buf_t buf; 8028 __kmp_str_buf_init(&buf); 8029 __kmp_aux_capture_affinity(gtid, format, &buf); 8030 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 8031 __kmp_str_buf_free(&buf); 8032 } 8033 8034 /* ------------------------------------------------------------------------ */ 8035 8036 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 8037 int blocktime = arg; /* argument is in milliseconds */ 8038 #if KMP_USE_MONITOR 8039 int bt_intervals; 8040 #endif 8041 int bt_set; 8042 8043 __kmp_save_internal_controls(thread); 8044 8045 /* Normalize and set blocktime for the teams */ 8046 if (blocktime < KMP_MIN_BLOCKTIME) 8047 blocktime = KMP_MIN_BLOCKTIME; 8048 else if (blocktime > KMP_MAX_BLOCKTIME) 8049 blocktime = KMP_MAX_BLOCKTIME; 8050 8051 set__blocktime_team(thread->th.th_team, tid, blocktime); 8052 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8053 8054 #if KMP_USE_MONITOR 8055 /* Calculate and set blocktime intervals for the teams */ 8056 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8057 8058 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8059 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8060 #endif 8061 8062 /* Set whether blocktime has been set to "TRUE" */ 8063 bt_set = TRUE; 8064 8065 set__bt_set_team(thread->th.th_team, tid, bt_set); 8066 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8067 #if KMP_USE_MONITOR 8068 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8069 "bt_intervals=%d, monitor_updates=%d\n", 8070 __kmp_gtid_from_tid(tid, thread->th.th_team), 8071 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8072 __kmp_monitor_wakeups)); 8073 #else 8074 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8075 __kmp_gtid_from_tid(tid, thread->th.th_team), 8076 thread->th.th_team->t.t_id, tid, blocktime)); 8077 #endif 8078 } 8079 8080 void __kmp_aux_set_defaults(char const *str, int len) { 8081 if (!__kmp_init_serial) { 8082 __kmp_serial_initialize(); 8083 } 8084 __kmp_env_initialize(str); 8085 8086 if (__kmp_settings || __kmp_display_env || __kmp_display_env_verbose) { 8087 __kmp_env_print(); 8088 } 8089 } // __kmp_aux_set_defaults 8090 8091 /* 
------------------------------------------------------------------------ */ 8092 /* internal fast reduction routines */ 8093 8094 PACKED_REDUCTION_METHOD_T 8095 __kmp_determine_reduction_method( 8096 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8097 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8098 kmp_critical_name *lck) { 8099 8100 // Default reduction method: critical construct ( lck != NULL, like in current 8101 // PAROPT ) 8102 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8103 // can be selected by RTL 8104 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8105 // can be selected by RTL 8106 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8107 // among generated by PAROPT. 8108 8109 PACKED_REDUCTION_METHOD_T retval; 8110 8111 int team_size; 8112 8113 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8114 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8115 8116 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8117 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8118 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8119 8120 retval = critical_reduce_block; 8121 8122 // another choice of getting a team size (with 1 dynamic deference) is slower 8123 team_size = __kmp_get_team_num_threads(global_tid); 8124 if (team_size == 1) { 8125 8126 retval = empty_reduce_block; 8127 8128 } else { 8129 8130 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8131 8132 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || \ 8133 KMP_ARCH_MIPS64 || KMP_ARCH_RISCV64 8134 8135 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8136 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8137 8138 int teamsize_cutoff = 4; 8139 8140 #if KMP_MIC_SUPPORTED 8141 if (__kmp_mic_type != non_mic) { 8142 teamsize_cutoff = 8; 8143 } 8144 #endif 8145 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8146 if (tree_available) { 8147 if (team_size <= teamsize_cutoff) { 8148 if (atomic_available) { 8149 retval = atomic_reduce_block; 8150 } 8151 } else { 8152 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8153 } 8154 } else if (atomic_available) { 8155 retval = atomic_reduce_block; 8156 } 8157 #else 8158 #error "Unknown or unsupported OS" 8159 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8160 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8161 8162 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8163 8164 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_WINDOWS || KMP_OS_HURD 8165 8166 // basic tuning 8167 8168 if (atomic_available) { 8169 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
        retval = atomic_reduce_block;
      }
    } // otherwise: use critical section

#elif KMP_OS_DARWIN

    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (atomic_available && (num_vars <= 3)) {
      retval = atomic_reduce_block;
    } else if (tree_available) {
      if ((reduce_size > (9 * sizeof(kmp_real64))) &&
          (reduce_size < (2000 * sizeof(kmp_real64)))) {
        retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER;
      }
    } // otherwise: use critical section

#else
#error "Unknown or unsupported OS"
#endif

#else
#error "Unknown or unsupported architecture"
#endif
  }

  // KMP_FORCE_REDUCTION

  // If the team is serialized (team_size == 1), ignore the forced reduction
  // method and stay with the unsynchronized method (empty_reduce_block)
  if (__kmp_force_reduction_method != reduction_method_not_defined &&
      team_size != 1) {

    PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block;

    int atomic_available, tree_available;

    switch ((forced_retval = __kmp_force_reduction_method)) {
    case critical_reduce_block:
      KMP_ASSERT(lck); // lck should be != 0
      break;

    case atomic_reduce_block:
      atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;
      if (!atomic_available) {
        KMP_WARNING(RedMethodNotSupported, "atomic");
        forced_retval = critical_reduce_block;
      }
      break;

    case tree_reduce_block:
      tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
      if (!tree_available) {
        KMP_WARNING(RedMethodNotSupported, "tree");
        forced_retval = critical_reduce_block;
      } else {
#if KMP_FAST_REDUCTION_BARRIER
        forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
#endif
      }
      break;

    default:
      KMP_ASSERT(0); // "unsupported method specified"
    }

    retval = forced_retval;
  }

  KA_TRACE(10, ("reduction method selected=%08x\n", retval));

#undef FAST_REDUCTION_TREE_METHOD_GENERATED
#undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED

  return (retval);
}
// this function is for testing set/get/determine reduce method
kmp_int32 __kmp_get_reduce_method(void) {
  return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8);
}
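/* Illustrative sketch (assumption about the call chain, not a definitive
   description): compiler codegen for a reduction clause such as

     double sum = 0.0;
     #pragma omp parallel for reduction(+ : sum)
     for (int i = 0; i < n; ++i)
       sum += a[i];

   typically emits a call into __kmpc_reduce_nowait()/__kmpc_reduce(), passing
   a reduce_data block, a reduce_func callback, and an ident_t whose flags may
   carry KMP_IDENT_ATOMIC_REDUCE. Those three inputs are what the
   FAST_REDUCTION_*_METHOD_GENERATED macros above inspect when choosing between
   critical_reduce_block, atomic_reduce_block, empty_reduce_block and the tree
   variants; __kmp_get_reduce_method() merely exposes the packed result (note
   the >> 8) for testing. */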
// Soft pause sets up threads to ignore blocktime and just go to sleep.
// Spin-wait code checks __kmp_pause_status and reacts accordingly.
void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; }

// Hard pause shuts down the runtime completely. Resume happens naturally when
// OpenMP is used subsequently.
void __kmp_hard_pause() {
  __kmp_pause_status = kmp_hard_paused;
  __kmp_internal_end_thread(-1);
}

// Soft resume sets __kmp_pause_status, and wakes up all threads.
void __kmp_resume_if_soft_paused() {
  if (__kmp_pause_status == kmp_soft_paused) {
    __kmp_pause_status = kmp_not_paused;

    for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) {
      kmp_info_t *thread = __kmp_threads[gtid];
      if (thread) { // Wake it if sleeping
        kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread);
        if (fl.is_sleeping())
          fl.resume(gtid);
        else if (__kmp_try_suspend_mx(thread)) { // got suspend lock
          __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep
        } else { // thread holds the lock and may sleep soon
          do { // until either the thread sleeps, or we can get the lock
            if (fl.is_sleeping()) {
              fl.resume(gtid);
              break;
            } else if (__kmp_try_suspend_mx(thread)) {
              __kmp_unlock_suspend_mx(thread);
              break;
            }
          } while (1);
        }
      }
    }
  }
}

// This function is called via __kmpc_pause_resource. Returns 0 if successful.
// TODO: add warning messages
int __kmp_pause_resource(kmp_pause_status_t level) {
  if (level == kmp_not_paused) { // requesting resume
    if (__kmp_pause_status == kmp_not_paused) {
      // error message about runtime not being paused, so can't resume
      return 1;
    } else {
      KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused ||
                       __kmp_pause_status == kmp_hard_paused);
      __kmp_pause_status = kmp_not_paused;
      return 0;
    }
  } else if (level == kmp_soft_paused) { // requesting soft pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_soft_pause();
      return 0;
    }
  } else if (level == kmp_hard_paused) { // requesting hard pause
    if (__kmp_pause_status != kmp_not_paused) {
      // error message about already being paused
      return 1;
    } else {
      __kmp_hard_pause();
      return 0;
    }
  } else {
    // error message about invalid level
    return 1;
  }
}

void __kmp_omp_display_env(int verbose) {
  __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
  if (__kmp_init_serial == 0)
    __kmp_do_serial_initialize();
  __kmp_display_env_impl(!verbose, verbose);
  __kmp_release_bootstrap_lock(&__kmp_initz_lock);
}
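/* Illustrative sketch (assumptions noted): __kmp_pause_resource() is reached
   via __kmpc_pause_resource(), and in user code the natural driver is the
   OpenMP 5.0 pause API. The mapping of omp_pause_soft/omp_pause_hard onto
   kmp_soft_paused/kmp_hard_paused, and the exact resume path, are assumptions
   about the wrapper layer rather than guarantees.

     #include <omp.h>

     void example_pause_cycle(void) {
       #pragma omp parallel
       { } // warm up the thread pool

       // Release CPU resources between phases; a zero return means success.
       if (omp_pause_resource_all(omp_pause_soft) != 0) {
         // already paused, or pause is not supported
       }

       // A subsequent parallel region is expected to resume the soft-paused
       // runtime, with sleeping workers woken via __kmp_resume_if_soft_paused().
       #pragma omp parallel
       { }
     }
*/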