/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
#if OMP_50_ENABLED
                                                        "5.0 (201611)";
#elif OMP_45_ENABLED
                                                        "4.5 (201511)";
#elif OMP_40_ENABLED
                                                        "4.0 (201307)";
#else
                                                        "3.1 (201107)";
#endif

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

kmp_info_t __kmp_monitor;

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nWish, int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_unregister_library(void); // called by __kmp_internal_end()
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
static kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
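/* Lookup strategy, fastest first: mode 3 reads the __kmp_gtid TLS variable
   directly, mode 2 queries keyed TLS via __kmp_gtid_get_specific(), and the
   fallback searches __kmp_threads[] for the entry whose registered stack
   range contains the address of a local variable. */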
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }

  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
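  // Fall back to the value registered in thread-local storage; a negative
  // result means this thread has not been assigned a gtid yet.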
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */

  /* if we haven't been assigned a gtid, then return code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
"initial" : "actual"); 278 } 279 } 280 281 /* No point in checking ubermaster threads since they use refinement and 282 * cannot overlap */ 283 gtid = __kmp_gtid_from_thread(th); 284 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 285 KA_TRACE(10, 286 ("__kmp_check_stack_overlap: performing extensive checking\n")); 287 if (stack_beg == NULL) { 288 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 289 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 290 } 291 292 for (f = 0; f < __kmp_threads_capacity; f++) { 293 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 294 295 if (f_th && f_th != th) { 296 char *other_stack_end = 297 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 298 char *other_stack_beg = 299 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 300 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 301 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 302 303 /* Print the other stack values before the abort */ 304 if (__kmp_storage_map) 305 __kmp_print_storage_map_gtid( 306 -1, other_stack_beg, other_stack_end, 307 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 308 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 309 310 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 311 __kmp_msg_null); 312 } 313 } 314 } 315 } 316 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 317 } 318 319 /* ------------------------------------------------------------------------ */ 320 321 void __kmp_infinite_loop(void) { 322 static int done = FALSE; 323 324 while (!done) { 325 KMP_YIELD(1); 326 } 327 } 328 329 #define MAX_MESSAGE 512 330 331 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 332 char const *format, ...) { 333 char buffer[MAX_MESSAGE]; 334 va_list ap; 335 336 va_start(ap, format); 337 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 338 p2, (unsigned long)size, format); 339 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 340 __kmp_vprintf(kmp_err, buffer, ap); 341 #if KMP_PRINT_DATA_PLACEMENT 342 int node; 343 if (gtid >= 0) { 344 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 345 if (__kmp_storage_map_verbose) { 346 node = __kmp_get_host_node(p1); 347 if (node < 0) /* doesn't work, so don't try this next time */ 348 __kmp_storage_map_verbose = FALSE; 349 else { 350 char *last; 351 int lastNode; 352 int localProc = __kmp_get_cpu_from_gtid(gtid); 353 354 const int page_size = KMP_GET_PAGE_SIZE(); 355 356 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 357 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 358 if (localProc >= 0) 359 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 360 localProc >> 1); 361 else 362 __kmp_printf_no_lock(" GTID %d\n", gtid); 363 #if KMP_USE_PRCTL 364 /* The more elaborate format is disabled for now because of the prctl 365 * hanging bug. */ 366 do { 367 last = p1; 368 lastNode = node; 369 /* This loop collates adjacent pages with the same host node. 
            do {
              (char *)p1 += page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS by default abort() causes pop-up error box, which stalls
       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for DLL, but it is a problem for
       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
       help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread

/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */
static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
                               sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
                               team_id);
}

static void __kmp_init_allocator() {}
static void __kmp_fini_allocator() {}

/* ------------------------------------------------------------------------ */

#ifdef KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS

static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
  // TODO: Change to __kmp_break_bootstrap_lock().
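  // Re-initializing is a blunt way to force the lock into the released state
  // even if its former owner died while holding it.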
  __kmp_init_bootstrap_lock(lck); // make the lock released
}

static void __kmp_reset_locks_on_process_detach(int gtid_req) {
  int i;
  int thread_count;

  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
  // threads may still be alive here, although they are about to be terminated.
  // The threads in the array with ds_thread==0 are most suspicious. Actually,
  // it may not be safe to access the __kmp_threads[].

  // TODO: does it make sense to check __kmp_roots[] ?

  // Let's check that there are no other alive threads registered with the OMP
  // lib.
  while (1) {
    thread_count = 0;
    for (i = 0; i < __kmp_threads_capacity; ++i) {
      if (!__kmp_threads)
        continue;
      kmp_info_t *th = __kmp_threads[i];
      if (th == NULL)
        continue;
      int gtid = th->th.th_info.ds.ds_gtid;
      if (gtid == gtid_req)
        continue;
      if (gtid < 0)
        continue;
      DWORD exit_val;
      int alive = __kmp_is_thread_alive(th, &exit_val);
      if (alive) {
        ++thread_count;
      }
    }
    if (thread_count == 0)
      break; // success
  }

  // Assume that I'm alone. Now it might be safe to check and reset locks.
  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
  __kmp_reset_lock(&__kmp_forkjoin_lock);
#ifdef KMP_DEBUG
  __kmp_reset_lock(&__kmp_stdio_lock);
#endif // KMP_DEBUG
}

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    if (lpReserved != NULL) {
      // lpReserved is used for telling the difference:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive. So they will
      // release the forkjoin lock by themselves. When the process terminates,
      // worker threads disappear triggering the problem of unreleased forkjoin
      // lock as described below.

      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread becomes dead before it releases the forkjoin lock.
      // The forkjoin lock remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. It looks like it's not a corner
      // case, but common cases:
      //   - the main function was compiled by an alternative compiler;
      //   - the main function was compiled by icl but without /Qopenmp
      //     (application with plugins);
      //   - application terminates by calling C exit(), Fortran CALL EXIT() or
      //     Fortran STOP.
      //   - alive foreign thread prevented __kmpc_end from doing cleanup.
      //
      // This is a hack to work around the problem.
      // TODO: !!! figure out something better.
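      // __kmp_reset_locks_on_process_detach() spins until no other registered
      // thread is still alive, then re-initializes the forkjoin (and, in debug
      // builds, stdio) bootstrap locks so the shutdown path below cannot hang
      // on a lock owned by a dead worker.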
      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
    }

    __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* Change the library type to "status" and return the old type */
/* called from within initialization routines where __kmp_initz_lock is held */
int __kmp_change_library(int status) {
  int old_status;

  old_status = __kmp_yield_init &
               1; // check whether KMP_LIBRARY=throughput (even init count)

  if (status) {
    __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
  } else {
    __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
  }

  return old_status; // return previous setting of whether
  // KMP_LIBRARY=throughput
}

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
                   KMP_EQ, NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = KMP_COMPARE_AND_STORE_ACQ32(&team->t.t_construct, old_this,
                                           th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
        th->th.th_teams_microtask == NULL &&
#endif
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads
#if OMP_40_ENABLED
                                 ,
                                 int enter_teams
#endif /* OMP_40_ENABLED */
                                 ) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
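    // __kmp_nth already counts the threads that will be reused here (just this
    // root's master when the root is active, or its whole hot team otherwise),
    // so the expression above adds them back before comparing against the
    // number of available processors.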
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  if (root->r.r_cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_cg_max_nth) {
    int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired, slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier within the forkjoin critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
#if OMP_40_ENABLED
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
#endif
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team,
    // i.e. in the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */

static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single master thread. */
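/* The thread's cached serial team object is reused for every serialized region
   it encounters: the first level initializes it (t_serialized = 1) and
   allocates a dispatch buffer, while deeper nested levels simply increment
   t_serialized and push another dispatch buffer. */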
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

#if OMP_40_ENABLED
  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;
#endif /* OMP_40_ENABLED */

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data;
  ompt_parallel_data.ptr = NULL;
  ompt_data_t *implicit_task_data;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.reenter_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                                     ompt_parallel_data,
#endif
#if OMP_40_ENABLED
                                     proc_bind,
#endif
                                     &this_thr->th.th_current_task->td_icvs,
                                     0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    serial_team->t.t_sched = this_thr->th.th_team->t.t_sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }

#if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }
#endif /* OMP_40_ENABLED */

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
#if OMP_40_ENABLED
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
#endif

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)
        ->frame.exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped.
    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)
        ->frame.exit_runtime_frame = OMPT_GET_FRAME_ADDRESS(1);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    va_list *ap
#else
                    va_list ap
#endif
                    ) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
#if OMP_40_ENABLED
  int active_level;
  int teams_level;
#endif
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data;
    ompt_parallel_data.ptr = NULL;
    ompt_data_t *parent_task_data;
    ompt_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
#if OMP_40_ENABLED
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#endif
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
            OMPT_INVOKER(call_context), return_address);
      }
      master_th->th.ompt_thread_info.state = omp_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

#if OMP_40_ENABLED
    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is start of parallel that is nested inside teams construct.
      // The team is actual (hot), all workers are ready at the fork barrier.
      // No lock needed to initialize the team a bit, then free workers.
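      // The hot parent team is updated in place: set its ident/args/microtask,
      // release the workers with __kmp_internal_fork(), and let the master
      // invoke the microtask via t_invoke below.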
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
        *argv++ = va_arg(*ap, void *);
#else
        *argv++ = va_arg(ap, void *);
#endif
      // Increment our nested depth levels, but not increase the serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this in order for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;
#if OMPT_SUPPORT
        void *dummy;
        void **exit_runtime_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_runtime_p =
              &(lw_taskteam.ompt_task_info.frame.exit_runtime_frame);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped.

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = omp_state_work_parallel;
        } else {
          exit_runtime_p = &dummy;
        }
#endif

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_runtime_p
#endif
                                 );
        }

#if OMPT_SUPPORT
        *exit_runtime_p = NULL;
        if (ompt_enabled.enabled) {
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_runtime_frame = NULL;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                __kmp_tid_from_gtid(gtid));
          }
          __ompt_lw_taskteam_unlink(master_th);

          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context), return_address);
          }
          master_th->th.ompt_thread_info.state = omp_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: only can reduce number of threads dynamically, can't increase
          kmp_info_t **other_threads = parent_team->t.t_threads;
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
            other_threads[i]->th.th_team_nproc = master_set_numthreads;
1613 } 1614 // Keep extra threads hot in the team for possible next parallels 1615 } 1616 master_th->th.th_set_nproc = 0; 1617 } 1618 1619 #if USE_DEBUGGER 1620 if (__kmp_debugging) { // Let debugger override number of threads. 1621 int nth = __kmp_omp_num_threads(loc); 1622 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1623 master_set_numthreads = nth; 1624 } 1625 } 1626 #endif 1627 1628 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1629 "master_th=%p, gtid=%d\n", 1630 root, parent_team, master_th, gtid)); 1631 __kmp_internal_fork(loc, gtid, parent_team); 1632 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1633 "master_th=%p, gtid=%d\n", 1634 root, parent_team, master_th, gtid)); 1635 1636 /* Invoke microtask for MASTER thread */ 1637 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1638 parent_team->t.t_id, parent_team->t.t_pkfn)); 1639 1640 { 1641 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1642 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1643 if (!parent_team->t.t_invoke(gtid)) { 1644 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 1645 } 1646 } 1647 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1648 parent_team->t.t_id, parent_team->t.t_pkfn)); 1649 KMP_MB(); /* Flush all pending memory write invalidates. */ 1650 1651 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1652 1653 return TRUE; 1654 } // Parallel closely nested in teams construct 1655 #endif /* OMP_40_ENABLED */ 1656 1657 #if KMP_DEBUG 1658 if (__kmp_tasking_mode != tskm_immediate_exec) { 1659 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1660 parent_team->t.t_task_team[master_th->th.th_task_state]); 1661 } 1662 #endif 1663 1664 if (parent_team->t.t_active_level >= 1665 master_th->th.th_current_task->td_icvs.max_active_levels) { 1666 nthreads = 1; 1667 } else { 1668 #if OMP_40_ENABLED 1669 int enter_teams = ((ap == NULL && active_level == 0) || 1670 (ap && teams_level > 0 && teams_level == level)); 1671 #endif 1672 nthreads = 1673 master_set_numthreads 1674 ? master_set_numthreads 1675 : get__nproc_2( 1676 parent_team, 1677 master_tid); // TODO: get nproc directly from current task 1678 1679 // Check if we need to take forkjoin lock? (no need for serialized 1680 // parallel out of teams construct). This code moved here from 1681 // __kmp_reserve_threads() to speedup nested serialized parallels. 1682 if (nthreads > 1) { 1683 if ((!get__nested(master_th) && (root->r.r_in_parallel 1684 #if OMP_40_ENABLED 1685 && !enter_teams 1686 #endif /* OMP_40_ENABLED */ 1687 )) || 1688 (__kmp_library == library_serial)) { 1689 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1690 " threads\n", 1691 gtid, nthreads)); 1692 nthreads = 1; 1693 } 1694 } 1695 if (nthreads > 1) { 1696 /* determine how many new threads we can use */ 1697 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1698 nthreads = __kmp_reserve_threads( 1699 root, parent_team, master_tid, nthreads 1700 #if OMP_40_ENABLED 1701 /* AC: If we execute teams from parallel region (on host), then 1702 teams should be created but each can only have 1 thread if 1703 nesting is disabled. If teams called from serial region, then 1704 teams and their threads should be created regardless of the 1705 nesting setting. 
*/ 1706 , 1707 enter_teams 1708 #endif /* OMP_40_ENABLED */ 1709 ); 1710 if (nthreads == 1) { 1711 // Free lock for single thread execution here; for multi-thread 1712 // execution it will be freed later after team of threads created 1713 // and initialized 1714 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1715 } 1716 } 1717 } 1718 KMP_DEBUG_ASSERT(nthreads > 0); 1719 1720 // If we temporarily changed the set number of threads then restore it now 1721 master_th->th.th_set_nproc = 0; 1722 1723 /* create a serialized parallel region? */ 1724 if (nthreads == 1) { 1725 /* josh todo: hypothetical question: what do we do for OS X*? */ 1726 #if KMP_OS_LINUX && \ 1727 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1728 void *args[argc]; 1729 #else 1730 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1731 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1732 KMP_ARCH_AARCH64) */ 1733 1734 KA_TRACE(20, 1735 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1736 1737 __kmpc_serialized_parallel(loc, gtid); 1738 1739 if (call_context == fork_context_intel) { 1740 /* TODO this sucks, use the compiler itself to pass args! :) */ 1741 master_th->th.th_serial_team->t.t_ident = loc; 1742 #if OMP_40_ENABLED 1743 if (!ap) { 1744 // revert change made in __kmpc_serialized_parallel() 1745 master_th->th.th_serial_team->t.t_level--; 1746 // Get args from parent team for teams construct 1747 1748 #if OMPT_SUPPORT 1749 void *dummy; 1750 void **exit_runtime_p; 1751 ompt_task_info_t *task_info; 1752 1753 ompt_lw_taskteam_t lw_taskteam; 1754 1755 if (ompt_enabled.enabled) { 1756 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1757 &ompt_parallel_data, return_address); 1758 1759 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1760 // don't use lw_taskteam after linking. 
content was swaped 1761 1762 task_info = OMPT_CUR_TASK_INFO(master_th); 1763 exit_runtime_p = &(task_info->frame.exit_runtime_frame); 1764 if (ompt_enabled.ompt_callback_implicit_task) { 1765 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1766 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1767 &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid)); 1768 } 1769 1770 /* OMPT state */ 1771 master_th->th.ompt_thread_info.state = omp_state_work_parallel; 1772 } else { 1773 exit_runtime_p = &dummy; 1774 } 1775 #endif 1776 1777 { 1778 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1779 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1780 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1781 parent_team->t.t_argv 1782 #if OMPT_SUPPORT 1783 , 1784 exit_runtime_p 1785 #endif 1786 ); 1787 } 1788 1789 #if OMPT_SUPPORT 1790 if (ompt_enabled.enabled) { 1791 exit_runtime_p = NULL; 1792 if (ompt_enabled.ompt_callback_implicit_task) { 1793 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1794 ompt_scope_end, NULL, &(task_info->task_data), 1, 1795 __kmp_tid_from_gtid(gtid)); 1796 } 1797 1798 __ompt_lw_taskteam_unlink(master_th); 1799 if (ompt_enabled.ompt_callback_parallel_end) { 1800 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1801 OMPT_CUR_TEAM_DATA(master_th), parent_task_data, 1802 OMPT_INVOKER(call_context), return_address); 1803 } 1804 master_th->th.ompt_thread_info.state = omp_state_overhead; 1805 } 1806 #endif 1807 } else if (microtask == (microtask_t)__kmp_teams_master) { 1808 KMP_DEBUG_ASSERT(master_th->th.th_team == 1809 master_th->th.th_serial_team); 1810 team = master_th->th.th_team; 1811 // team->t.t_pkfn = microtask; 1812 team->t.t_invoke = invoker; 1813 __kmp_alloc_argv_entries(argc, team, TRUE); 1814 team->t.t_argc = argc; 1815 argv = (void **)team->t.t_argv; 1816 if (ap) { 1817 for (i = argc - 1; i >= 0; --i) 1818 // TODO: revert workaround for Intel(R) 64 tracker #96 1819 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1820 *argv++ = va_arg(*ap, void *); 1821 #else 1822 *argv++ = va_arg(ap, void *); 1823 #endif 1824 } else { 1825 for (i = 0; i < argc; ++i) 1826 // Get args from parent team for teams construct 1827 argv[i] = parent_team->t.t_argv[i]; 1828 } 1829 // AC: revert change made in __kmpc_serialized_parallel() 1830 // because initial code in teams should have level=0 1831 team->t.t_level--; 1832 // AC: call special invoker for outer "parallel" of teams construct 1833 { 1834 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1835 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1836 invoker(gtid); 1837 } 1838 } else { 1839 #endif /* OMP_40_ENABLED */ 1840 argv = args; 1841 for (i = argc - 1; i >= 0; --i) 1842 // TODO: revert workaround for Intel(R) 64 tracker #96 1843 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1844 *argv++ = va_arg(*ap, void *); 1845 #else 1846 *argv++ = va_arg(ap, void *); 1847 #endif 1848 KMP_MB(); 1849 1850 #if OMPT_SUPPORT 1851 void *dummy; 1852 void **exit_runtime_p; 1853 ompt_task_info_t *task_info; 1854 1855 ompt_lw_taskteam_t lw_taskteam; 1856 1857 if (ompt_enabled.enabled) { 1858 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1859 &ompt_parallel_data, return_address); 1860 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1861 // don't use lw_taskteam after linking. 
content was swaped 1862 task_info = OMPT_CUR_TASK_INFO(master_th); 1863 exit_runtime_p = &(task_info->frame.exit_runtime_frame); 1864 1865 /* OMPT implicit task begin */ 1866 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1867 if (ompt_enabled.ompt_callback_implicit_task) { 1868 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1869 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1870 implicit_task_data, 1, __kmp_tid_from_gtid(gtid)); 1871 } 1872 1873 /* OMPT state */ 1874 master_th->th.ompt_thread_info.state = omp_state_work_parallel; 1875 } else { 1876 exit_runtime_p = &dummy; 1877 } 1878 #endif 1879 1880 { 1881 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1882 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1883 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1884 #if OMPT_SUPPORT 1885 , 1886 exit_runtime_p 1887 #endif 1888 ); 1889 } 1890 1891 #if OMPT_SUPPORT 1892 if (ompt_enabled.enabled) { 1893 *exit_runtime_p = NULL; 1894 if (ompt_enabled.ompt_callback_implicit_task) { 1895 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1896 ompt_scope_end, NULL, &(task_info->task_data), 1, 1897 __kmp_tid_from_gtid(gtid)); 1898 } 1899 1900 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1901 __ompt_lw_taskteam_unlink(master_th); 1902 if (ompt_enabled.ompt_callback_parallel_end) { 1903 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1904 &ompt_parallel_data, parent_task_data, 1905 OMPT_INVOKER(call_context), return_address); 1906 } 1907 master_th->th.ompt_thread_info.state = omp_state_overhead; 1908 } 1909 #endif 1910 #if OMP_40_ENABLED 1911 } 1912 #endif /* OMP_40_ENABLED */ 1913 } else if (call_context == fork_context_gnu) { 1914 #if OMPT_SUPPORT 1915 ompt_lw_taskteam_t lwt; 1916 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1917 return_address); 1918 1919 lwt.ompt_task_info.frame.exit_runtime_frame = NULL; 1920 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1921 // don't use lw_taskteam after linking. content was swaped 1922 #endif 1923 1924 // we were called from GNU native code 1925 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1926 return FALSE; 1927 } else { 1928 KMP_ASSERT2(call_context < fork_context_last, 1929 "__kmp_fork_call: unknown fork_context parameter"); 1930 } 1931 1932 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1933 KMP_MB(); 1934 return FALSE; 1935 } 1936 1937 // GEH: only modify the executing flag in the case when not serialized 1938 // serialized case is handled in kmpc_serialized_parallel 1939 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1940 "curtask=%p, curtask_max_aclevel=%d\n", 1941 parent_team->t.t_active_level, master_th, 1942 master_th->th.th_current_task, 1943 master_th->th.th_current_task->td_icvs.max_active_levels)); 1944 // TODO: GEH - cannot do this assertion because root thread not set up as 1945 // executing 1946 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1947 master_th->th.th_current_task->td_flags.executing = 0; 1948 1949 #if OMP_40_ENABLED 1950 if (!master_th->th.th_teams_microtask || level > teams_level) 1951 #endif /* OMP_40_ENABLED */ 1952 { 1953 /* Increment our nested depth level */ 1954 KMP_TEST_THEN_INC32((kmp_int32 *)&root->r.r_in_parallel); 1955 } 1956 1957 // See if we need to make a copy of the ICVs. 
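  /* Sketch of where the per-level ICVs checked below come from (illustrative;
     assumes the usual parsing of comma-separated OMP_NUM_THREADS and
     OMP_PROC_BIND lists into __kmp_nested_nth and __kmp_nested_proc_bind):

       // environment: OMP_NUM_THREADS=8,2  OMP_PROC_BIND=spread,close
       #include <omp.h>
       #include <stdio.h>
       void nested_demo(void) {
         omp_set_nested(1);
       #pragma omp parallel // outer region: 8 threads, spread
       #pragma omp parallel // inner regions: 2 threads each, close
         printf("%d/%d\n", omp_get_ancestor_thread_num(1), omp_get_thread_num());
       }

     For the outer fork level == 0, so nthreads_icv picks up nth[1] == 2 and
     proc_bind_icv picks up bind_types[1] == close; both get copied into the new
     team's ICVs so the inner parallel sees them. A value of 0 for nthreads_icv
     and proc_bind_default for proc_bind_icv mean "keep the parent's setting". */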
1958 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1959 if ((level + 1 < __kmp_nested_nth.used) && 1960 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1961 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1962 } else { 1963 nthreads_icv = 0; // don't update 1964 } 1965 1966 #if OMP_40_ENABLED 1967 // Figure out the proc_bind_policy for the new team. 1968 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1969 kmp_proc_bind_t proc_bind_icv = 1970 proc_bind_default; // proc_bind_default means don't update 1971 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1972 proc_bind = proc_bind_false; 1973 } else { 1974 if (proc_bind == proc_bind_default) { 1975 // No proc_bind clause specified; use current proc-bind-var for this 1976 // parallel region 1977 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1978 } 1979 /* else: The proc_bind policy was specified explicitly on parallel clause. 1980 This overrides proc-bind-var for this parallel region, but does not 1981 change proc-bind-var. */ 1982 // Figure the value of proc-bind-var for the child threads. 1983 if ((level + 1 < __kmp_nested_proc_bind.used) && 1984 (__kmp_nested_proc_bind.bind_types[level + 1] != 1985 master_th->th.th_current_task->td_icvs.proc_bind)) { 1986 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1987 } 1988 } 1989 1990 // Reset for next parallel region 1991 master_th->th.th_set_proc_bind = proc_bind_default; 1992 #endif /* OMP_40_ENABLED */ 1993 1994 if ((nthreads_icv > 0) 1995 #if OMP_40_ENABLED 1996 || (proc_bind_icv != proc_bind_default) 1997 #endif /* OMP_40_ENABLED */ 1998 ) { 1999 kmp_internal_control_t new_icvs; 2000 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2001 new_icvs.next = NULL; 2002 if (nthreads_icv > 0) { 2003 new_icvs.nproc = nthreads_icv; 2004 } 2005 2006 #if OMP_40_ENABLED 2007 if (proc_bind_icv != proc_bind_default) { 2008 new_icvs.proc_bind = proc_bind_icv; 2009 } 2010 #endif /* OMP_40_ENABLED */ 2011 2012 /* allocate a new parallel team */ 2013 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2014 team = __kmp_allocate_team(root, nthreads, nthreads, 2015 #if OMPT_SUPPORT 2016 ompt_parallel_data, 2017 #endif 2018 #if OMP_40_ENABLED 2019 proc_bind, 2020 #endif 2021 &new_icvs, argc USE_NESTED_HOT_ARG(master_th)); 2022 } else { 2023 /* allocate a new parallel team */ 2024 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2025 team = __kmp_allocate_team(root, nthreads, nthreads, 2026 #if OMPT_SUPPORT 2027 ompt_parallel_data, 2028 #endif 2029 #if OMP_40_ENABLED 2030 proc_bind, 2031 #endif 2032 &master_th->th.th_current_task->td_icvs, 2033 argc USE_NESTED_HOT_ARG(master_th)); 2034 } 2035 KF_TRACE( 2036 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2037 2038 /* setup the new team */ 2039 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2040 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2041 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2042 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2043 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2044 #if OMPT_SUPPORT 2045 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2046 return_address); 2047 #endif 2048 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2049 // TODO: parent_team->t.t_level == INT_MAX ??? 
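  /* Nesting-level bookkeeping sketch (illustrative; default settings assumed):

       #pragma omp parallel   // t_level 0 -> 1, t_active_level 0 -> 1
       #pragma omp parallel   // t_level 1 -> 2; reaches this point (and bumps
                              // t_active_level) only if it is not serialized

     The else branch below is the teams exception: the league's outer parallel
     keeps the parent's levels, so code in a teams region is intended to still
     report level 0 until its first nested parallel. */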
2050 #if OMP_40_ENABLED 2051 if (!master_th->th.th_teams_microtask || level > teams_level) { 2052 #endif /* OMP_40_ENABLED */ 2053 int new_level = parent_team->t.t_level + 1; 2054 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2055 new_level = parent_team->t.t_active_level + 1; 2056 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2057 #if OMP_40_ENABLED 2058 } else { 2059 // AC: Do not increase parallel level at start of the teams construct 2060 int new_level = parent_team->t.t_level; 2061 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2062 new_level = parent_team->t.t_active_level; 2063 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2064 } 2065 #endif /* OMP_40_ENABLED */ 2066 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2067 if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || 2068 team->t.t_sched.chunk != new_sched.chunk) 2069 team->t.t_sched = 2070 new_sched; // set master's schedule as new run-time schedule 2071 2072 #if OMP_40_ENABLED 2073 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2074 #endif 2075 2076 // Update the floating point rounding in the team if required. 2077 propagateFPControl(team); 2078 2079 if (__kmp_tasking_mode != tskm_immediate_exec) { 2080 // Set master's task team to team's task team. Unless this is hot team, it 2081 // should be NULL. 2082 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2083 parent_team->t.t_task_team[master_th->th.th_task_state]); 2084 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2085 "%p, new task_team %p / team %p\n", 2086 __kmp_gtid_from_thread(master_th), 2087 master_th->th.th_task_team, parent_team, 2088 team->t.t_task_team[master_th->th.th_task_state], team)); 2089 2090 if (active_level || master_th->th.th_task_team) { 2091 // Take a memo of master's task_state 2092 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2093 if (master_th->th.th_task_state_top >= 2094 master_th->th.th_task_state_stack_sz) { // increase size 2095 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2096 kmp_uint8 *old_stack, *new_stack; 2097 kmp_uint32 i; 2098 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2099 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2100 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2101 } 2102 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2103 ++i) { // zero-init rest of stack 2104 new_stack[i] = 0; 2105 } 2106 old_stack = master_th->th.th_task_state_memo_stack; 2107 master_th->th.th_task_state_memo_stack = new_stack; 2108 master_th->th.th_task_state_stack_sz = new_size; 2109 __kmp_free(old_stack); 2110 } 2111 // Store master's task_state on stack 2112 master_th->th 2113 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2114 master_th->th.th_task_state; 2115 master_th->th.th_task_state_top++; 2116 #if KMP_NESTED_HOT_TEAMS 2117 if (team == master_th->th.th_hot_teams[active_level].hot_team) { 2118 // Restore master's nested state if nested hot team 2119 master_th->th.th_task_state = 2120 master_th->th 2121 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2122 } else { 2123 #endif 2124 master_th->th.th_task_state = 0; 2125 #if KMP_NESTED_HOT_TEAMS 2126 } 2127 #endif 2128 } 2129 #if !KMP_NESTED_HOT_TEAMS 2130 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2131 (team == root->r.r_hot_team)); 2132 #endif 2133 } 2134 2135 KA_TRACE( 2136 20, 2137 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2138 gtid, parent_team->t.t_id, team->t.t_master_tid, 
team->t.t_id, 2139 team->t.t_nproc)); 2140 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2141 (team->t.t_master_tid == 0 && 2142 (team->t.t_parent == root->r.r_root_team || 2143 team->t.t_parent->t.t_serialized))); 2144 KMP_MB(); 2145 2146 /* now, setup the arguments */ 2147 argv = (void **)team->t.t_argv; 2148 #if OMP_40_ENABLED 2149 if (ap) { 2150 #endif /* OMP_40_ENABLED */ 2151 for (i = argc - 1; i >= 0; --i) { 2152 // TODO: revert workaround for Intel(R) 64 tracker #96 2153 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 2154 void *new_argv = va_arg(*ap, void *); 2155 #else 2156 void *new_argv = va_arg(ap, void *); 2157 #endif 2158 KMP_CHECK_UPDATE(*argv, new_argv); 2159 argv++; 2160 } 2161 #if OMP_40_ENABLED 2162 } else { 2163 for (i = 0; i < argc; ++i) { 2164 // Get args from parent team for teams construct 2165 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2166 } 2167 } 2168 #endif /* OMP_40_ENABLED */ 2169 2170 /* now actually fork the threads */ 2171 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2172 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2173 root->r.r_active = TRUE; 2174 2175 __kmp_fork_team_threads(root, team, master_th, gtid); 2176 __kmp_setup_icv_copy(team, nthreads, 2177 &master_th->th.th_current_task->td_icvs, loc); 2178 2179 #if OMPT_SUPPORT 2180 master_th->th.ompt_thread_info.state = omp_state_work_parallel; 2181 #endif 2182 2183 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2184 2185 #if USE_ITT_BUILD 2186 if (team->t.t_active_level == 1 // only report frames at level 1 2187 #if OMP_40_ENABLED 2188 && !master_th->th.th_teams_microtask // not in teams construct 2189 #endif /* OMP_40_ENABLED */ 2190 ) { 2191 #if USE_ITT_NOTIFY 2192 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2193 (__kmp_forkjoin_frames_mode == 3 || 2194 __kmp_forkjoin_frames_mode == 1)) { 2195 kmp_uint64 tmp_time = 0; 2196 if (__itt_get_timestamp_ptr) 2197 tmp_time = __itt_get_timestamp(); 2198 // Internal fork - report frame begin 2199 master_th->th.th_frame_time = tmp_time; 2200 if (__kmp_forkjoin_frames_mode == 3) 2201 team->t.t_region_time = tmp_time; 2202 } else 2203 // only one notification scheme (either "submit" or "forking/joined", not both) 2204 #endif /* USE_ITT_NOTIFY */ 2205 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2206 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2207 // Mark start of "parallel" region for VTune. 
2208 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2209 } 2210 } 2211 #endif /* USE_ITT_BUILD */ 2212 2213 /* now go on and do the work */ 2214 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2215 KMP_MB(); 2216 KF_TRACE(10, 2217 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2218 root, team, master_th, gtid)); 2219 2220 #if USE_ITT_BUILD 2221 if (__itt_stack_caller_create_ptr) { 2222 team->t.t_stack_id = 2223 __kmp_itt_stack_caller_create(); // create new stack stitching id 2224 // before entering fork barrier 2225 } 2226 #endif /* USE_ITT_BUILD */ 2227 2228 #if OMP_40_ENABLED 2229 // AC: skip __kmp_internal_fork at teams construct, let only master 2230 // threads execute 2231 if (ap) 2232 #endif /* OMP_40_ENABLED */ 2233 { 2234 __kmp_internal_fork(loc, gtid, team); 2235 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2236 "master_th=%p, gtid=%d\n", 2237 root, team, master_th, gtid)); 2238 } 2239 2240 if (call_context == fork_context_gnu) { 2241 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2242 return TRUE; 2243 } 2244 2245 /* Invoke microtask for MASTER thread */ 2246 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2247 team->t.t_id, team->t.t_pkfn)); 2248 } // END of timer KMP_fork_call block 2249 2250 { 2251 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 2252 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 2253 if (!team->t.t_invoke(gtid)) { 2254 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2255 } 2256 } 2257 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2258 team->t.t_id, team->t.t_pkfn)); 2259 KMP_MB(); /* Flush all pending memory write invalidates. */ 2260 2261 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2262 2263 #if OMPT_SUPPORT 2264 if (ompt_enabled.enabled) { 2265 master_th->th.ompt_thread_info.state = omp_state_overhead; 2266 } 2267 #endif 2268 2269 return TRUE; 2270 } 2271 2272 #if OMPT_SUPPORT 2273 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2274 kmp_team_t *team) { 2275 // restore state outside the region 2276 thread->th.ompt_thread_info.state = 2277 ((team->t.t_serialized) ? 
omp_state_work_serial 2278 : omp_state_work_parallel); 2279 } 2280 2281 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2282 kmp_team_t *team, ompt_data_t *parallel_data, 2283 fork_context_e fork_context, void *codeptr) { 2284 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2285 if (ompt_enabled.ompt_callback_parallel_end) { 2286 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2287 parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context), 2288 codeptr); 2289 } 2290 2291 task_info->frame.reenter_runtime_frame = NULL; 2292 __kmp_join_restore_state(thread, team); 2293 } 2294 #endif 2295 2296 void __kmp_join_call(ident_t *loc, int gtid 2297 #if OMPT_SUPPORT 2298 , 2299 enum fork_context_e fork_context 2300 #endif 2301 #if OMP_40_ENABLED 2302 , 2303 int exit_teams 2304 #endif /* OMP_40_ENABLED */ 2305 ) { 2306 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2307 kmp_team_t *team; 2308 kmp_team_t *parent_team; 2309 kmp_info_t *master_th; 2310 kmp_root_t *root; 2311 int master_active; 2312 int i; 2313 2314 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2315 2316 /* setup current data */ 2317 master_th = __kmp_threads[gtid]; 2318 root = master_th->th.th_root; 2319 team = master_th->th.th_team; 2320 parent_team = team->t.t_parent; 2321 2322 master_th->th.th_ident = loc; 2323 2324 #if OMPT_SUPPORT 2325 if (ompt_enabled.enabled) { 2326 master_th->th.ompt_thread_info.state = omp_state_overhead; 2327 } 2328 #endif 2329 2330 #if KMP_DEBUG 2331 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2332 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2333 "th_task_team = %p\n", 2334 __kmp_gtid_from_thread(master_th), team, 2335 team->t.t_task_team[master_th->th.th_task_state], 2336 master_th->th.th_task_team)); 2337 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2338 team->t.t_task_team[master_th->th.th_task_state]); 2339 } 2340 #endif 2341 2342 if (team->t.t_serialized) { 2343 #if OMP_40_ENABLED 2344 if (master_th->th.th_teams_microtask) { 2345 // We are in teams construct 2346 int level = team->t.t_level; 2347 int tlevel = master_th->th.th_teams_level; 2348 if (level == tlevel) { 2349 // AC: we haven't incremented it earlier at start of teams construct, 2350 // so do it here - at the end of teams construct 2351 team->t.t_level++; 2352 } else if (level == tlevel + 1) { 2353 // AC: we are exiting parallel inside teams, need to increment 2354 // serialization in order to restore it in the next call to 2355 // __kmpc_end_serialized_parallel 2356 team->t.t_serialized++; 2357 } 2358 } 2359 #endif /* OMP_40_ENABLED */ 2360 __kmpc_end_serialized_parallel(loc, gtid); 2361 2362 #if OMPT_SUPPORT 2363 if (ompt_enabled.enabled) { 2364 __kmp_join_restore_state(master_th, parent_team); 2365 } 2366 #endif 2367 2368 return; 2369 } 2370 2371 master_active = team->t.t_master_active; 2372 2373 #if OMP_40_ENABLED 2374 if (!exit_teams) 2375 #endif /* OMP_40_ENABLED */ 2376 { 2377 // AC: No barrier for internal teams at exit from teams construct. 2378 // But there is barrier for external team (league). 
2379 __kmp_internal_join(loc, gtid, team); 2380 } 2381 #if OMP_40_ENABLED 2382 else { 2383 master_th->th.th_task_state = 2384 0; // AC: no tasking in teams (out of any parallel) 2385 } 2386 #endif /* OMP_40_ENABLED */ 2387 2388 KMP_MB(); 2389 2390 #if OMPT_SUPPORT 2391 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2392 void *codeptr = team->t.ompt_team_info.master_return_address; 2393 #endif 2394 2395 #if USE_ITT_BUILD 2396 if (__itt_stack_caller_create_ptr) { 2397 __kmp_itt_stack_caller_destroy( 2398 (__itt_caller)team->t 2399 .t_stack_id); // destroy the stack stitching id after join barrier 2400 } 2401 2402 // Mark end of "parallel" region for VTune. 2403 if (team->t.t_active_level == 1 2404 #if OMP_40_ENABLED 2405 && !master_th->th.th_teams_microtask /* not in teams construct */ 2406 #endif /* OMP_40_ENABLED */ 2407 ) { 2408 master_th->th.th_ident = loc; 2409 // only one notification scheme (either "submit" or "forking/joined", not 2410 // both) 2411 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2412 __kmp_forkjoin_frames_mode == 3) 2413 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2414 master_th->th.th_frame_time, 0, loc, 2415 master_th->th.th_team_nproc, 1); 2416 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2417 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2418 __kmp_itt_region_joined(gtid); 2419 } // active_level == 1 2420 #endif /* USE_ITT_BUILD */ 2421 2422 #if OMP_40_ENABLED 2423 if (master_th->th.th_teams_microtask && !exit_teams && 2424 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2425 team->t.t_level == master_th->th.th_teams_level + 1) { 2426 // AC: We need to leave the team structure intact at the end of parallel 2427 // inside the teams construct, so that at the next parallel same (hot) team 2428 // works, only adjust nesting levels 2429 2430 /* Decrement our nested depth level */ 2431 team->t.t_level--; 2432 team->t.t_active_level--; 2433 KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel); 2434 2435 /* Restore number of threads in the team if needed */ 2436 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2437 int old_num = master_th->th.th_team_nproc; 2438 int new_num = master_th->th.th_teams_size.nth; 2439 kmp_info_t **other_threads = team->t.t_threads; 2440 team->t.t_nproc = new_num; 2441 for (i = 0; i < old_num; ++i) { 2442 other_threads[i]->th.th_team_nproc = new_num; 2443 } 2444 // Adjust states of non-used threads of the team 2445 for (i = old_num; i < new_num; ++i) { 2446 // Re-initialize thread's barrier data. 
2447 int b; 2448 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2449 for (b = 0; b < bs_last_barrier; ++b) { 2450 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2451 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2452 #if USE_DEBUGGER 2453 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2454 #endif 2455 } 2456 if (__kmp_tasking_mode != tskm_immediate_exec) { 2457 // Synchronize thread's task state 2458 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2459 } 2460 } 2461 } 2462 2463 #if OMPT_SUPPORT 2464 if (ompt_enabled.enabled) { 2465 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2466 codeptr); 2467 } 2468 #endif 2469 2470 return; 2471 } 2472 #endif /* OMP_40_ENABLED */ 2473 2474 /* do cleanup and restore the parent team */ 2475 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2476 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2477 2478 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2479 2480 /* jc: The following lock has instructions with REL and ACQ semantics, 2481 separating the parallel user code called in this parallel region 2482 from the serial user code called after this function returns. */ 2483 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2484 2485 #if OMP_40_ENABLED 2486 if (!master_th->th.th_teams_microtask || 2487 team->t.t_level > master_th->th.th_teams_level) 2488 #endif /* OMP_40_ENABLED */ 2489 { 2490 /* Decrement our nested depth level */ 2491 KMP_TEST_THEN_DEC32((kmp_int32 *)&root->r.r_in_parallel); 2492 } 2493 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2494 2495 #if OMPT_SUPPORT 2496 if (ompt_enabled.enabled) { 2497 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2498 if (ompt_enabled.ompt_callback_implicit_task) { 2499 int ompt_team_size = team->t.t_nproc; 2500 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2501 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2502 __kmp_tid_from_gtid(gtid)); 2503 } 2504 2505 task_info->frame.exit_runtime_frame = NULL; 2506 task_info->task_data = ompt_data_none; 2507 } 2508 #endif 2509 2510 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2511 master_th, team)); 2512 __kmp_pop_current_task_from_thread(master_th); 2513 2514 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 2515 // Restore master thread's partition. 2516 master_th->th.th_first_place = team->t.t_first_place; 2517 master_th->th.th_last_place = team->t.t_last_place; 2518 #endif /* OMP_40_ENABLED */ 2519 2520 updateHWFPControl(team); 2521 2522 if (root->r.r_active != master_active) 2523 root->r.r_active = master_active; 2524 2525 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2526 master_th)); // this will free worker threads 2527 2528 /* this race was fun to find. make sure the following is in the critical 2529 region otherwise assertions may fail occasionally since the old team may be 2530 reallocated and the hierarchy appears inconsistent. it is actually safe to 2531 run and won't cause any bugs, but will cause those assertion failures. 
it's 2532 only one deref&assign so might as well put this in the critical region */ 2533 master_th->th.th_team = parent_team; 2534 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2535 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2536 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2537 2538 /* restore serialized team, if need be */ 2539 if (parent_team->t.t_serialized && 2540 parent_team != master_th->th.th_serial_team && 2541 parent_team != root->r.r_root_team) { 2542 __kmp_free_team(root, 2543 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2544 master_th->th.th_serial_team = parent_team; 2545 } 2546 2547 if (__kmp_tasking_mode != tskm_immediate_exec) { 2548 if (master_th->th.th_task_state_top > 2549 0) { // Restore task state from memo stack 2550 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2551 // Remember master's state if we re-use this nested hot team 2552 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2553 master_th->th.th_task_state; 2554 --master_th->th.th_task_state_top; // pop 2555 // Now restore state at this level 2556 master_th->th.th_task_state = 2557 master_th->th 2558 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2559 } 2560 // Copy the task team from the parent team to the master thread 2561 master_th->th.th_task_team = 2562 parent_team->t.t_task_team[master_th->th.th_task_state]; 2563 KA_TRACE(20, 2564 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2565 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2566 parent_team)); 2567 } 2568 2569 // TODO: GEH - cannot do this assertion because root thread not set up as 2570 // executing 2571 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2572 master_th->th.th_current_task->td_flags.executing = 1; 2573 2574 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2575 2576 #if OMPT_SUPPORT 2577 if (ompt_enabled.enabled) { 2578 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2579 codeptr); 2580 } 2581 #endif 2582 2583 KMP_MB(); 2584 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2585 } 2586 2587 /* Check whether we should push an internal control record onto the 2588 serial team stack. If so, do it. 
*/ 2589 void __kmp_save_internal_controls(kmp_info_t *thread) { 2590 2591 if (thread->th.th_team != thread->th.th_serial_team) { 2592 return; 2593 } 2594 if (thread->th.th_team->t.t_serialized > 1) { 2595 int push = 0; 2596 2597 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2598 push = 1; 2599 } else { 2600 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2601 thread->th.th_team->t.t_serialized) { 2602 push = 1; 2603 } 2604 } 2605 if (push) { /* push a record on the serial team's stack */ 2606 kmp_internal_control_t *control = 2607 (kmp_internal_control_t *)__kmp_allocate( 2608 sizeof(kmp_internal_control_t)); 2609 2610 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2611 2612 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2613 2614 control->next = thread->th.th_team->t.t_control_stack_top; 2615 thread->th.th_team->t.t_control_stack_top = control; 2616 } 2617 } 2618 } 2619 2620 /* Changes set_nproc */ 2621 void __kmp_set_num_threads(int new_nth, int gtid) { 2622 kmp_info_t *thread; 2623 kmp_root_t *root; 2624 2625 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2626 KMP_DEBUG_ASSERT(__kmp_init_serial); 2627 2628 if (new_nth < 1) 2629 new_nth = 1; 2630 else if (new_nth > __kmp_max_nth) 2631 new_nth = __kmp_max_nth; 2632 2633 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2634 thread = __kmp_threads[gtid]; 2635 2636 __kmp_save_internal_controls(thread); 2637 2638 set__nproc(thread, new_nth); 2639 2640 // If this omp_set_num_threads() call will cause the hot team size to be 2641 // reduced (in the absence of a num_threads clause), then reduce it now, 2642 // rather than waiting for the next parallel region. 2643 root = thread->th.th_root; 2644 if (__kmp_init_parallel && (!root->r.r_active) && 2645 (root->r.r_hot_team->t.t_nproc > new_nth) 2646 #if KMP_NESTED_HOT_TEAMS 2647 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2648 #endif 2649 ) { 2650 kmp_team_t *hot_team = root->r.r_hot_team; 2651 int f; 2652 2653 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2654 2655 // Release the extra threads we don't need any more. 2656 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2657 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2658 if (__kmp_tasking_mode != tskm_immediate_exec) { 2659 // When decreasing team size, threads no longer in the team should unref 2660 // task team. 2661 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2662 } 2663 __kmp_free_thread(hot_team->t.t_threads[f]); 2664 hot_team->t.t_threads[f] = NULL; 2665 } 2666 hot_team->t.t_nproc = new_nth; 2667 #if KMP_NESTED_HOT_TEAMS 2668 if (thread->th.th_hot_teams) { 2669 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2670 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2671 } 2672 #endif 2673 2674 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2675 2676 // Update the t_nproc field in the threads that are still active. 
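    /* Illustrative call pattern for this hot-team resizing (assumes the
       standard omp_set_num_threads() entry point forwards here and default
       hot-team settings):

         #include <omp.h>
         void resize_demo(void) {
           omp_set_num_threads(8);
         #pragma omp parallel
           { } // hot team grows to 8 threads (master + 7 workers)
           omp_set_num_threads(2); // surplus workers are freed right here,
                                   // not at the start of the next parallel
         #pragma omp parallel
           { } // reuses the trimmed 2-thread hot team
         }
    */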
2677 for (f = 0; f < new_nth; f++) { 2678 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2679 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2680 } 2681 // Special flag in case omp_set_num_threads() call 2682 hot_team->t.t_size_changed = -1; 2683 } 2684 } 2685 2686 /* Changes max_active_levels */ 2687 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2688 kmp_info_t *thread; 2689 2690 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2691 "%d = (%d)\n", 2692 gtid, max_active_levels)); 2693 KMP_DEBUG_ASSERT(__kmp_init_serial); 2694 2695 // validate max_active_levels 2696 if (max_active_levels < 0) { 2697 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2698 // We ignore this call if the user has specified a negative value. 2699 // The current setting won't be changed. The last valid setting will be 2700 // used. A warning will be issued (if warnings are allowed as controlled by 2701 // the KMP_WARNINGS env var). 2702 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2703 "max_active_levels for thread %d = (%d)\n", 2704 gtid, max_active_levels)); 2705 return; 2706 } 2707 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2708 // it's OK, the max_active_levels is within the valid range: [ 0; 2709 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2710 // We allow a zero value. (implementation defined behavior) 2711 } else { 2712 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2713 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2714 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2715 // Current upper limit is MAX_INT. (implementation defined behavior) 2716 // If the input exceeds the upper limit, we correct the input to be the 2717 // upper limit. (implementation defined behavior) 2718 // Actually, the flow should never get here until we use MAX_INT limit. 2719 } 2720 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2721 "max_active_levels for thread %d = (%d)\n", 2722 gtid, max_active_levels)); 2723 2724 thread = __kmp_threads[gtid]; 2725 2726 __kmp_save_internal_controls(thread); 2727 2728 set__max_active_levels(thread, max_active_levels); 2729 } 2730 2731 /* Gets max_active_levels */ 2732 int __kmp_get_max_active_levels(int gtid) { 2733 kmp_info_t *thread; 2734 2735 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2736 KMP_DEBUG_ASSERT(__kmp_init_serial); 2737 2738 thread = __kmp_threads[gtid]; 2739 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2740 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2741 "curtask_maxaclevel=%d\n", 2742 gtid, thread->th.th_current_task, 2743 thread->th.th_current_task->td_icvs.max_active_levels)); 2744 return thread->th.th_current_task->td_icvs.max_active_levels; 2745 } 2746 2747 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2748 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2749 kmp_info_t *thread; 2750 // kmp_team_t *team; 2751 2752 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2753 gtid, (int)kind, chunk)); 2754 KMP_DEBUG_ASSERT(__kmp_init_serial); 2755 2756 // Check if the kind parameter is valid, correct if needed. 
2757 // Valid parameters should fit in one of two intervals - standard or extended: 2758 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2759 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2760 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2761 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2762 // TODO: Hint needs attention in case we change the default schedule. 2763 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2764 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2765 __kmp_msg_null); 2766 kind = kmp_sched_default; 2767 chunk = 0; // ignore chunk value in case of bad kind 2768 } 2769 2770 thread = __kmp_threads[gtid]; 2771 2772 __kmp_save_internal_controls(thread); 2773 2774 if (kind < kmp_sched_upper_std) { 2775 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2776 // differ static chunked vs. unchunked: chunk should be invalid to 2777 // indicate unchunked schedule (which is the default) 2778 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2779 } else { 2780 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2781 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2782 } 2783 } else { 2784 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2785 // kmp_sched_lower - 2 ]; 2786 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2787 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2788 kmp_sched_lower - 2]; 2789 } 2790 if (kind == kmp_sched_auto || chunk < 1) { 2791 // ignore parameter chunk for schedule auto 2792 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2793 } else { 2794 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2795 } 2796 } 2797 2798 /* Gets def_sched_var ICV values */ 2799 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2800 kmp_info_t *thread; 2801 enum sched_type th_type; 2802 2803 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2804 KMP_DEBUG_ASSERT(__kmp_init_serial); 2805 2806 thread = __kmp_threads[gtid]; 2807 2808 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2809 2810 switch (th_type) { 2811 case kmp_sch_static: 2812 case kmp_sch_static_greedy: 2813 case kmp_sch_static_balanced: 2814 *kind = kmp_sched_static; 2815 *chunk = 0; // chunk was not set, try to show this fact via zero value 2816 return; 2817 case kmp_sch_static_chunked: 2818 *kind = kmp_sched_static; 2819 break; 2820 case kmp_sch_dynamic_chunked: 2821 *kind = kmp_sched_dynamic; 2822 break; 2823 case kmp_sch_guided_chunked: 2824 case kmp_sch_guided_iterative_chunked: 2825 case kmp_sch_guided_analytical_chunked: 2826 *kind = kmp_sched_guided; 2827 break; 2828 case kmp_sch_auto: 2829 *kind = kmp_sched_auto; 2830 break; 2831 case kmp_sch_trapezoidal: 2832 *kind = kmp_sched_trapezoidal; 2833 break; 2834 #if KMP_STATIC_STEAL_ENABLED 2835 case kmp_sch_static_steal: 2836 *kind = kmp_sched_static_steal; 2837 break; 2838 #endif 2839 default: 2840 KMP_FATAL(UnknownSchedulingType, th_type); 2841 } 2842 2843 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2844 } 2845 2846 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2847 2848 int ii, dd; 2849 kmp_team_t *team; 2850 kmp_info_t *thr; 2851 2852 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2853 KMP_DEBUG_ASSERT(__kmp_init_serial); 2854 2855 // validate level 2856 if (level == 0) 2857 return 0; 2858 if (level < 0) 2859 return -1; 2860 thr = __kmp_threads[gtid]; 2861 team = thr->th.th_team; 2862 ii 
= team->t.t_level; 2863 if (level > ii) 2864 return -1; 2865 2866 #if OMP_40_ENABLED 2867 if (thr->th.th_teams_microtask) { 2868 // AC: we are in teams region where multiple nested teams have same level 2869 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2870 if (level <= 2871 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2872 KMP_DEBUG_ASSERT(ii >= tlevel); 2873 // AC: As we need to pass by the teams league, we need to artificially 2874 // increase ii 2875 if (ii == tlevel) { 2876 ii += 2; // three teams have same level 2877 } else { 2878 ii++; // two teams have same level 2879 } 2880 } 2881 } 2882 #endif 2883 2884 if (ii == level) 2885 return __kmp_tid_from_gtid(gtid); 2886 2887 dd = team->t.t_serialized; 2888 level++; 2889 while (ii > level) { 2890 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2891 } 2892 if ((team->t.t_serialized) && (!dd)) { 2893 team = team->t.t_parent; 2894 continue; 2895 } 2896 if (ii > level) { 2897 team = team->t.t_parent; 2898 dd = team->t.t_serialized; 2899 ii--; 2900 } 2901 } 2902 2903 return (dd > 1) ? (0) : (team->t.t_master_tid); 2904 } 2905 2906 int __kmp_get_team_size(int gtid, int level) { 2907 2908 int ii, dd; 2909 kmp_team_t *team; 2910 kmp_info_t *thr; 2911 2912 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2913 KMP_DEBUG_ASSERT(__kmp_init_serial); 2914 2915 // validate level 2916 if (level == 0) 2917 return 1; 2918 if (level < 0) 2919 return -1; 2920 thr = __kmp_threads[gtid]; 2921 team = thr->th.th_team; 2922 ii = team->t.t_level; 2923 if (level > ii) 2924 return -1; 2925 2926 #if OMP_40_ENABLED 2927 if (thr->th.th_teams_microtask) { 2928 // AC: we are in teams region where multiple nested teams have same level 2929 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2930 if (level <= 2931 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2932 KMP_DEBUG_ASSERT(ii >= tlevel); 2933 // AC: As we need to pass by the teams league, we need to artificially 2934 // increase ii 2935 if (ii == tlevel) { 2936 ii += 2; // three teams have same level 2937 } else { 2938 ii++; // two teams have same level 2939 } 2940 } 2941 } 2942 #endif 2943 2944 while (ii > level) { 2945 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2946 } 2947 if (team->t.t_serialized && (!dd)) { 2948 team = team->t.t_parent; 2949 continue; 2950 } 2951 if (ii > level) { 2952 team = team->t.t_parent; 2953 ii--; 2954 } 2955 } 2956 2957 return team->t.t_nproc; 2958 } 2959 2960 kmp_r_sched_t __kmp_get_schedule_global() { 2961 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2962 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2963 // independently. So one can get the updated schedule here. 2964 2965 kmp_r_sched_t r_sched; 2966 2967 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2968 // __kmp_guided. 
__kmp_sched should keep original value, so that user can set 2969 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2970 // different roots (even in OMP 2.5) 2971 if (__kmp_sched == kmp_sch_static) { 2972 r_sched.r_sched_type = __kmp_static; // replace STATIC with more detailed 2973 // schedule (balanced or greedy) 2974 } else if (__kmp_sched == kmp_sch_guided_chunked) { 2975 r_sched.r_sched_type = __kmp_guided; // replace GUIDED with more detailed 2976 // schedule (iterative or analytical) 2977 } else { 2978 r_sched.r_sched_type = 2979 __kmp_sched; // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2980 } 2981 2982 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { // __kmp_chunk may be wrong here (if it 2983 // was not ever set) 2984 r_sched.chunk = KMP_DEFAULT_CHUNK; 2985 } else { 2986 r_sched.chunk = __kmp_chunk; 2987 } 2988 2989 return r_sched; 2990 } 2991 2992 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2993 at least argc number of *t_argv entries for the requested team. */ 2994 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2995 2996 KMP_DEBUG_ASSERT(team); 2997 if (!realloc || argc > team->t.t_max_argc) { 2998 2999 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3000 "current entries=%d\n", 3001 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3002 /* if previously allocated heap space for args, free them */ 3003 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3004 __kmp_free((void *)team->t.t_argv); 3005 3006 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3007 /* use unused space in the cache line for arguments */ 3008 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3009 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3010 "argv entries\n", 3011 team->t.t_id, team->t.t_max_argc)); 3012 team->t.t_argv = &team->t.t_inline_argv[0]; 3013 if (__kmp_storage_map) { 3014 __kmp_print_storage_map_gtid( 3015 -1, &team->t.t_inline_argv[0], 3016 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3017 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3018 team->t.t_id); 3019 } 3020 } else { 3021 /* allocate space for arguments in the heap */ 3022 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3023 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3024 : 2 * argc; 3025 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3026 "argv entries\n", 3027 team->t.t_id, team->t.t_max_argc)); 3028 team->t.t_argv = 3029 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3030 if (__kmp_storage_map) { 3031 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3032 &team->t.t_argv[team->t.t_max_argc], 3033 sizeof(void *) * team->t.t_max_argc, 3034 "team_%d.t_argv", team->t.t_id); 3035 } 3036 } 3037 } 3038 } 3039 3040 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3041 int i; 3042 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3043 team->t.t_threads = 3044 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3045 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3046 sizeof(dispatch_shared_info_t) * num_disp_buff); 3047 team->t.t_dispatch = 3048 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3049 team->t.t_implicit_task_taskdata = 3050 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3051 team->t.t_max_nproc = max_nth; 3052 3053 /* setup dispatch buffers */ 3054 for (i = 0; i < num_disp_buff; ++i) { 3055 team->t.t_disp_buffer[i].buffer_index = i; 3056 #if OMP_45_ENABLED 3057 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3058 #endif 3059 } 3060 } 3061 3062 static void __kmp_free_team_arrays(kmp_team_t *team) { 3063 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3064 int i; 3065 for (i = 0; i < team->t.t_max_nproc; ++i) { 3066 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3067 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3068 team->t.t_dispatch[i].th_disp_buffer = NULL; 3069 } 3070 } 3071 __kmp_free(team->t.t_threads); 3072 __kmp_free(team->t.t_disp_buffer); 3073 __kmp_free(team->t.t_dispatch); 3074 __kmp_free(team->t.t_implicit_task_taskdata); 3075 team->t.t_threads = NULL; 3076 team->t.t_disp_buffer = NULL; 3077 team->t.t_dispatch = NULL; 3078 team->t.t_implicit_task_taskdata = 0; 3079 } 3080 3081 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3082 kmp_info_t **oldThreads = team->t.t_threads; 3083 3084 __kmp_free(team->t.t_disp_buffer); 3085 __kmp_free(team->t.t_dispatch); 3086 __kmp_free(team->t.t_implicit_task_taskdata); 3087 __kmp_allocate_team_arrays(team, max_nth); 3088 3089 KMP_MEMCPY(team->t.t_threads, oldThreads, 3090 team->t.t_nproc * sizeof(kmp_info_t *)); 3091 3092 __kmp_free(oldThreads); 3093 } 3094 3095 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3096 3097 kmp_r_sched_t r_sched = 3098 __kmp_get_schedule_global(); // get current state of scheduling globals 3099 3100 #if OMP_40_ENABLED 3101 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3102 #endif /* OMP_40_ENABLED */ 3103 3104 kmp_internal_control_t g_icvs = { 3105 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3106 (kmp_int8)__kmp_dflt_nested, // int nested; //internal control 3107 // for nested parallelism (per thread) 3108 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3109 // adjustment of threads (per thread) 3110 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3111 // whether blocktime is explicitly set 3112 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3113 #if KMP_USE_MONITOR 3114 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3115 // intervals 3116 #endif 3117 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3118 // next parallel region (per thread) 3119 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3120 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3121 // for max_active_levels 3122 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3123 // {sched,chunk} pair 3124 #if OMP_40_ENABLED 3125 __kmp_nested_proc_bind.bind_types[0], 3126 __kmp_default_device, 3127 #endif /* OMP_40_ENABLED */ 3128 NULL // struct kmp_internal_control *next; 3129 }; 3130 3131 return g_icvs; 3132 } 3133 3134 static kmp_internal_control_t __kmp_get_x_global_icvs(const 
kmp_team_t *team) { 3135 3136 kmp_internal_control_t gx_icvs; 3137 gx_icvs.serial_nesting_level = 3138 0; // probably =team->t.t_serial like in save_inter_controls 3139 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3140 gx_icvs.next = NULL; 3141 3142 return gx_icvs; 3143 } 3144 3145 static void __kmp_initialize_root(kmp_root_t *root) { 3146 int f; 3147 kmp_team_t *root_team; 3148 kmp_team_t *hot_team; 3149 int hot_team_max_nth; 3150 kmp_r_sched_t r_sched = 3151 __kmp_get_schedule_global(); // get current state of scheduling globals 3152 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3153 KMP_DEBUG_ASSERT(root); 3154 KMP_ASSERT(!root->r.r_begin); 3155 3156 /* setup the root state structure */ 3157 __kmp_init_lock(&root->r.r_begin_lock); 3158 root->r.r_begin = FALSE; 3159 root->r.r_active = FALSE; 3160 root->r.r_in_parallel = 0; 3161 root->r.r_blocktime = __kmp_dflt_blocktime; 3162 root->r.r_nested = __kmp_dflt_nested; 3163 root->r.r_cg_nthreads = 1; 3164 3165 /* setup the root team for this task */ 3166 /* allocate the root team structure */ 3167 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3168 3169 root_team = 3170 __kmp_allocate_team(root, 3171 1, // new_nproc 3172 1, // max_nproc 3173 #if OMPT_SUPPORT 3174 ompt_data_none, // root parallel id 3175 #endif 3176 #if OMP_40_ENABLED 3177 __kmp_nested_proc_bind.bind_types[0], 3178 #endif 3179 &r_icvs, 3180 0 // argc 3181 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3182 ); 3183 #if USE_DEBUGGER 3184 // Non-NULL value should be assigned to make the debugger display the root 3185 // team. 3186 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3187 #endif 3188 3189 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3190 3191 root->r.r_root_team = root_team; 3192 root_team->t.t_control_stack_top = NULL; 3193 3194 /* initialize root team */ 3195 root_team->t.t_threads[0] = NULL; 3196 root_team->t.t_nproc = 1; 3197 root_team->t.t_serialized = 1; 3198 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3199 root_team->t.t_sched.r_sched_type = r_sched.r_sched_type; 3200 root_team->t.t_sched.chunk = r_sched.chunk; 3201 KA_TRACE( 3202 20, 3203 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3204 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3205 3206 /* setup the hot team for this task */ 3207 /* allocate the hot team structure */ 3208 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3209 3210 hot_team = 3211 __kmp_allocate_team(root, 3212 1, // new_nproc 3213 __kmp_dflt_team_nth_ub * 2, // max_nproc 3214 #if OMPT_SUPPORT 3215 ompt_data_none, // root parallel id 3216 #endif 3217 #if OMP_40_ENABLED 3218 __kmp_nested_proc_bind.bind_types[0], 3219 #endif 3220 &r_icvs, 3221 0 // argc 3222 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3223 ); 3224 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3225 3226 root->r.r_hot_team = hot_team; 3227 root_team->t.t_control_stack_top = NULL; 3228 3229 /* first-time initialization */ 3230 hot_team->t.t_parent = root_team; 3231 3232 /* initialize hot team */ 3233 hot_team_max_nth = hot_team->t.t_max_nproc; 3234 for (f = 0; f < hot_team_max_nth; ++f) { 3235 hot_team->t.t_threads[f] = NULL; 3236 } 3237 hot_team->t.t_nproc = 1; 3238 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3239 hot_team->t.t_sched.r_sched_type = r_sched.r_sched_type; 3240 hot_team->t.t_sched.chunk = r_sched.chunk; 3241 
hot_team->t.t_size_changed = 0; 3242 } 3243 3244 #ifdef KMP_DEBUG 3245 3246 typedef struct kmp_team_list_item { 3247 kmp_team_p const *entry; 3248 struct kmp_team_list_item *next; 3249 } kmp_team_list_item_t; 3250 typedef kmp_team_list_item_t *kmp_team_list_t; 3251 3252 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3253 kmp_team_list_t list, // List of teams. 3254 kmp_team_p const *team // Team to add. 3255 ) { 3256 3257 // List must terminate with item where both entry and next are NULL. 3258 // Team is added to the list only once. 3259 // List is sorted in ascending order by team id. 3260 // Team id is *not* a key. 3261 3262 kmp_team_list_t l; 3263 3264 KMP_DEBUG_ASSERT(list != NULL); 3265 if (team == NULL) { 3266 return; 3267 } 3268 3269 __kmp_print_structure_team_accum(list, team->t.t_parent); 3270 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3271 3272 // Search list for the team. 3273 l = list; 3274 while (l->next != NULL && l->entry != team) { 3275 l = l->next; 3276 } 3277 if (l->next != NULL) { 3278 return; // Team has been added before, exit. 3279 } 3280 3281 // Team is not found. Search list again for insertion point. 3282 l = list; 3283 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3284 l = l->next; 3285 } 3286 3287 // Insert team. 3288 { 3289 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3290 sizeof(kmp_team_list_item_t)); 3291 *item = *l; 3292 l->entry = team; 3293 l->next = item; 3294 } 3295 } 3296 3297 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3298 3299 ) { 3300 __kmp_printf("%s", title); 3301 if (team != NULL) { 3302 __kmp_printf("%2x %p\n", team->t.t_id, team); 3303 } else { 3304 __kmp_printf(" - (nil)\n"); 3305 } 3306 } 3307 3308 static void __kmp_print_structure_thread(char const *title, 3309 kmp_info_p const *thread) { 3310 __kmp_printf("%s", title); 3311 if (thread != NULL) { 3312 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3313 } else { 3314 __kmp_printf(" - (nil)\n"); 3315 } 3316 } 3317 3318 void __kmp_print_structure(void) { 3319 3320 kmp_team_list_t list; 3321 3322 // Initialize list of teams. 3323 list = 3324 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3325 list->entry = NULL; 3326 list->next = NULL; 3327 3328 __kmp_printf("\n------------------------------\nGlobal Thread " 3329 "Table\n------------------------------\n"); 3330 { 3331 int gtid; 3332 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3333 __kmp_printf("%2d", gtid); 3334 if (__kmp_threads != NULL) { 3335 __kmp_printf(" %p", __kmp_threads[gtid]); 3336 } 3337 if (__kmp_root != NULL) { 3338 __kmp_printf(" %p", __kmp_root[gtid]); 3339 } 3340 __kmp_printf("\n"); 3341 } 3342 } 3343 3344 // Print out __kmp_threads array. 
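// Note added for clarity: while dumping each live thread below, its team and
// serial team are also fed to __kmp_print_structure_team_accum(), which keeps
// 'list' sorted by team id and free of duplicates; the accumulated teams are
// printed later in the "Teams" section.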
3345 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3346 "----------\n"); 3347 if (__kmp_threads != NULL) { 3348 int gtid; 3349 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3350 kmp_info_t const *thread = __kmp_threads[gtid]; 3351 if (thread != NULL) { 3352 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3353 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3354 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3355 __kmp_print_structure_team(" Serial Team: ", 3356 thread->th.th_serial_team); 3357 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3358 __kmp_print_structure_thread(" Master: ", 3359 thread->th.th_team_master); 3360 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3361 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3362 #if OMP_40_ENABLED 3363 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3364 #endif 3365 __kmp_print_structure_thread(" Next in pool: ", 3366 thread->th.th_next_pool); 3367 __kmp_printf("\n"); 3368 __kmp_print_structure_team_accum(list, thread->th.th_team); 3369 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3370 } 3371 } 3372 } else { 3373 __kmp_printf("Threads array is not allocated.\n"); 3374 } 3375 3376 // Print out __kmp_root array. 3377 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3378 "--------\n"); 3379 if (__kmp_root != NULL) { 3380 int gtid; 3381 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3382 kmp_root_t const *root = __kmp_root[gtid]; 3383 if (root != NULL) { 3384 __kmp_printf("GTID %2d %p:\n", gtid, root); 3385 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3386 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3387 __kmp_print_structure_thread(" Uber Thread: ", 3388 root->r.r_uber_thread); 3389 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3390 __kmp_printf(" Nested?: %2d\n", root->r.r_nested); 3391 __kmp_printf(" In Parallel: %2d\n", root->r.r_in_parallel); 3392 __kmp_printf("\n"); 3393 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3394 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3395 } 3396 } 3397 } else { 3398 __kmp_printf("Ubers array is not allocated.\n"); 3399 } 3400 3401 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3402 "--------\n"); 3403 while (list->next != NULL) { 3404 kmp_team_p const *team = list->entry; 3405 int i; 3406 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3407 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3408 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3409 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3410 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3411 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3412 for (i = 0; i < team->t.t_nproc; ++i) { 3413 __kmp_printf(" Thread %2d: ", i); 3414 __kmp_print_structure_thread("", team->t.t_threads[i]); 3415 } 3416 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3417 __kmp_printf("\n"); 3418 list = list->next; 3419 } 3420 3421 // Print out __kmp_thread_pool and __kmp_team_pool. 
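// Note added for clarity: __kmp_thread_pool and __kmp_team_pool are free lists
// of idle threads/teams chained through th.th_next_pool and t.t_next_pool
// respectively; only the head of each list is printed here.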
3422 __kmp_printf("\n------------------------------\nPools\n----------------------" 3423 "--------\n"); 3424 __kmp_print_structure_thread("Thread pool: ", 3425 CCAST(kmp_info_t *, __kmp_thread_pool)); 3426 __kmp_print_structure_team("Team pool: ", 3427 CCAST(kmp_team_t *, __kmp_team_pool)); 3428 __kmp_printf("\n"); 3429 3430 // Free team list. 3431 while (list != NULL) { 3432 kmp_team_list_item_t *item = list; 3433 list = list->next; 3434 KMP_INTERNAL_FREE(item); 3435 } 3436 } 3437 3438 #endif 3439 3440 //--------------------------------------------------------------------------- 3441 // Stuff for per-thread fast random number generator 3442 // Table of primes 3443 static const unsigned __kmp_primes[] = { 3444 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3445 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3446 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3447 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3448 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3449 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3450 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3451 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3452 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3453 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3454 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3455 3456 //--------------------------------------------------------------------------- 3457 // __kmp_get_random: Get a random number using a linear congruential method. 3458 unsigned short __kmp_get_random(kmp_info_t *thread) { 3459 unsigned x = thread->th.th_x; 3460 unsigned short r = x >> 16; 3461 3462 thread->th.th_x = x * thread->th.th_a + 1; 3463 3464 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3465 thread->th.th_info.ds.ds_tid, r)); 3466 3467 return r; 3468 } 3469 //-------------------------------------------------------- 3470 // __kmp_init_random: Initialize a random number generator 3471 void __kmp_init_random(kmp_info_t *thread) { 3472 unsigned seed = thread->th.th_info.ds.ds_tid; 3473 3474 thread->th.th_a = 3475 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3476 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3477 KA_TRACE(30, 3478 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3479 } 3480 3481 #if KMP_OS_WINDOWS 3482 /* reclaim array entries for root threads that are already dead, returns number 3483 * reclaimed */ 3484 static int __kmp_reclaim_dead_roots(void) { 3485 int i, r = 0; 3486 3487 for (i = 0; i < __kmp_threads_capacity; ++i) { 3488 if (KMP_UBER_GTID(i) && 3489 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3490 !__kmp_root[i] 3491 ->r.r_active) { // AC: reclaim only roots died in non-active state 3492 r += __kmp_unregister_root_other_thread(i); 3493 } 3494 } 3495 return r; 3496 } 3497 #endif 3498 3499 /* This function attempts to create free entries in __kmp_threads and 3500 __kmp_root, and returns the number of free entries generated. 3501 3502 For Windows* OS static library, the first mechanism used is to reclaim array 3503 entries for root threads that are already dead. 3504 3505 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3506 __kmp_root, with appropriate update to __kmp_threads_capacity. 
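   As a rough illustration of the policy described below (the numbers are
   hypothetical): with __kmp_threads_capacity == 64, nWish == 100 and no
   threadprivate cache, the capacity is doubled 64 -> 128 -> 256, which covers
   the 164 slots required; with a threadprivate cache of 128 entries the same
   request would be clipped at 128, so only nNeed (or a best-effort amount)
   could be satisfied.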
Array 3507 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3508 threadprivate cache array has been created. Synchronization with 3509 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 3510 3511 After any dead root reclamation, if the clipping value allows array expansion 3512 to result in the generation of a total of nWish free slots, the function does 3513 that expansion. If not, but the clipping value allows array expansion to 3514 result in the generation of a total of nNeed free slots, the function does 3515 that expansion. Otherwise, nothing is done beyond the possible initial root 3516 thread reclamation. However, if nNeed is zero, a best-effort attempt is made 3517 to fulfil nWish as far as possible, i.e. the function will attempt to create 3518 as many free slots as possible up to nWish. 3519 3520 If any argument is negative, the behavior is undefined. */ 3521 static int __kmp_expand_threads(int nWish, int nNeed) { 3522 int added = 0; 3523 int old_tp_cached; 3524 int __kmp_actual_max_nth; 3525 3526 if (nNeed > nWish) /* normalize the arguments */ 3527 nWish = nNeed; 3528 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB 3529 /* only for Windows static library */ 3530 /* reclaim array entries for root threads that are already dead */ 3531 added = __kmp_reclaim_dead_roots(); 3532 3533 if (nNeed) { 3534 nNeed -= added; 3535 if (nNeed < 0) 3536 nNeed = 0; 3537 } 3538 if (nWish) { 3539 nWish -= added; 3540 if (nWish < 0) 3541 nWish = 0; 3542 } 3543 #endif 3544 if (nWish <= 0) 3545 return added; 3546 3547 while (1) { 3548 int nTarget; 3549 int minimumRequiredCapacity; 3550 int newCapacity; 3551 kmp_info_t **newThreads; 3552 kmp_root_t **newRoot; 3553 3554 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3555 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3556 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3557 // > __kmp_max_nth in one of two ways: 3558 // 3559 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3560 // may not be resused by another thread, so we may need to increase 3561 // __kmp_threads_capacity to __kmp_max_nth + 1. 3562 // 3563 // 2) New foreign root(s) are encountered. We always register new foreign 3564 // roots. This may cause a smaller # of threads to be allocated at 3565 // subsequent parallel regions, but the worker threads hang around (and 3566 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3567 // 3568 // Anyway, that is the reason for moving the check to see if 3569 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3570 // instead of having it performed here. -BB 3571 old_tp_cached = __kmp_tp_cached; 3572 __kmp_actual_max_nth = 3573 old_tp_cached ? 
__kmp_tp_capacity : __kmp_sys_max_nth; 3574 KMP_DEBUG_ASSERT(__kmp_actual_max_nth >= __kmp_threads_capacity); 3575 3576 /* compute expansion headroom to check if we can expand and whether to aim 3577 for nWish or nNeed */ 3578 nTarget = nWish; 3579 if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) { 3580 /* can't fulfil nWish, so try nNeed */ 3581 if (nNeed) { 3582 nTarget = nNeed; 3583 if (__kmp_actual_max_nth - __kmp_threads_capacity < nTarget) { 3584 /* possible expansion too small -- give up */ 3585 break; 3586 } 3587 } else { 3588 /* best-effort */ 3589 nTarget = __kmp_actual_max_nth - __kmp_threads_capacity; 3590 if (!nTarget) { 3591 /* cannot expand at all -- give up */ 3592 break; 3593 } 3594 } 3595 } 3596 minimumRequiredCapacity = __kmp_threads_capacity + nTarget; 3597 3598 newCapacity = __kmp_threads_capacity; 3599 do { 3600 newCapacity = newCapacity <= (__kmp_actual_max_nth >> 1) 3601 ? (newCapacity << 1) 3602 : __kmp_actual_max_nth; 3603 } while (newCapacity < minimumRequiredCapacity); 3604 newThreads = (kmp_info_t **)__kmp_allocate( 3605 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + 3606 CACHE_LINE); 3607 newRoot = (kmp_root_t **)((char *)newThreads + 3608 sizeof(kmp_info_t *) * newCapacity); 3609 KMP_MEMCPY(newThreads, __kmp_threads, 3610 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3611 KMP_MEMCPY(newRoot, __kmp_root, 3612 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3613 memset(newThreads + __kmp_threads_capacity, 0, 3614 (newCapacity - __kmp_threads_capacity) * sizeof(kmp_info_t *)); 3615 memset(newRoot + __kmp_threads_capacity, 0, 3616 (newCapacity - __kmp_threads_capacity) * sizeof(kmp_root_t *)); 3617 3618 if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3619 /* __kmp_tp_cached has changed, i.e. __kmpc_threadprivate_cached has 3620 allocated a threadprivate cache while we were allocating the expanded 3621 array, and our new capacity is larger than the threadprivate cache 3622 capacity, so we should deallocate the expanded arrays and try again. 3623 This is the first check of a double-check pair. */ 3624 __kmp_free(newThreads); 3625 continue; /* start over and try again */ 3626 } 3627 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3628 if (!old_tp_cached && __kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3629 /* Same check as above, but this time with the lock so we can be sure whether 3630 we can succeed. */ 3631 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3632 __kmp_free(newThreads); 3633 continue; /* start over and try again */ 3634 } else { 3635 /* success */ 3636 // __kmp_free( __kmp_threads ); // ATT: It leads to crash. Need to be 3637 // investigated. 3638 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3639 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3640 added += newCapacity - __kmp_threads_capacity; 3641 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3642 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3643 break; /* succeeded, so we can exit the loop */ 3644 } 3645 } 3646 return added; 3647 } 3648 3649 /* Register the current thread as a root thread and obtain our gtid. We must 3650 have the __kmp_initz_lock held at this point.
Argument TRUE only if we are the 3651 thread that calls from __kmp_do_serial_initialize() */ 3652 int __kmp_register_root(int initial_thread) { 3653 kmp_info_t *root_thread; 3654 kmp_root_t *root; 3655 int gtid; 3656 int capacity; 3657 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3658 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3659 KMP_MB(); 3660 3661 /* 2007-03-02: 3662 If the initial thread did not invoke the OpenMP RTL yet, and this thread is not an 3663 initial one, the "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3664 work as expected -- it may return false (which means there is at least one 3665 empty slot in the __kmp_threads array), but it is possible the only free slot 3666 is #0, which is reserved for the initial thread and so cannot be used for this 3667 one. The following code works around this bug. 3668 3669 However, the right solution seems to be not reserving slot #0 for the initial 3670 thread because: 3671 (1) there is no magic in slot #0, 3672 (2) we cannot detect the initial thread reliably (the first thread which does 3673 serial initialization may not be a real initial thread). 3674 */ 3675 capacity = __kmp_threads_capacity; 3676 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3677 --capacity; 3678 } 3679 3680 /* see if there are too many threads */ 3681 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1, 1)) { 3682 if (__kmp_tp_cached) { 3683 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3684 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3685 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3686 } else { 3687 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3688 __kmp_msg_null); 3689 } 3690 } 3691 3692 /* find an available thread slot */ 3693 /* Don't reassign the zero slot since we need that to only be used by the initial 3694 thread */ 3695 for (gtid = (initial_thread ? 0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; 3696 gtid++) 3697 ; 3698 KA_TRACE(1, 3699 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3700 KMP_ASSERT(gtid < __kmp_threads_capacity); 3701 3702 /* update global accounting */ 3703 __kmp_all_nth++; 3704 TCW_4(__kmp_nth, __kmp_nth + 1); 3705 3706 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3707 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3708 if (__kmp_adjust_gtid_mode) { 3709 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3710 if (TCR_4(__kmp_gtid_mode) != 2) { 3711 TCW_4(__kmp_gtid_mode, 2); 3712 } 3713 } else { 3714 if (TCR_4(__kmp_gtid_mode) != 1) { 3715 TCW_4(__kmp_gtid_mode, 1); 3716 } 3717 } 3718 } 3719 3720 #ifdef KMP_ADJUST_BLOCKTIME 3721 /* Adjust blocktime to zero if necessary */ 3722 /* Middle initialization might not have occurred yet */ 3723 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3724 if (__kmp_nth > __kmp_avail_proc) { 3725 __kmp_zero_bt = TRUE; 3726 } 3727 } 3728 #endif /* KMP_ADJUST_BLOCKTIME */ 3729 3730 /* setup this new hierarchy */ 3731 if (!(root = __kmp_root[gtid])) { 3732 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3733 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3734 } 3735 3736 #if KMP_STATS_ENABLED 3737 // Initialize stats as soon as possible (right after gtid assignment).
3738 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3739 KMP_START_EXPLICIT_TIMER(OMP_worker_thread_life); 3740 KMP_SET_THREAD_STATE(SERIAL_REGION); 3741 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3742 #endif 3743 __kmp_initialize_root(root); 3744 3745 /* setup new root thread structure */ 3746 if (root->r.r_uber_thread) { 3747 root_thread = root->r.r_uber_thread; 3748 } else { 3749 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3750 if (__kmp_storage_map) { 3751 __kmp_print_thread_storage_map(root_thread, gtid); 3752 } 3753 root_thread->th.th_info.ds.ds_gtid = gtid; 3754 #if OMPT_SUPPORT 3755 root_thread->th.ompt_thread_info.thread_data.ptr = NULL; 3756 #endif 3757 root_thread->th.th_root = root; 3758 if (__kmp_env_consistency_check) { 3759 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3760 } 3761 #if USE_FAST_MEMORY 3762 __kmp_initialize_fast_memory(root_thread); 3763 #endif /* USE_FAST_MEMORY */ 3764 3765 #if KMP_USE_BGET 3766 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3767 __kmp_initialize_bget(root_thread); 3768 #endif 3769 __kmp_init_random(root_thread); // Initialize random number generator 3770 } 3771 3772 /* setup the serial team held in reserve by the root thread */ 3773 if (!root_thread->th.th_serial_team) { 3774 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3775 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3776 root_thread->th.th_serial_team = 3777 __kmp_allocate_team(root, 1, 1, 3778 #if OMPT_SUPPORT 3779 ompt_data_none, // root parallel id 3780 #endif 3781 #if OMP_40_ENABLED 3782 proc_bind_default, 3783 #endif 3784 &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3785 } 3786 KMP_ASSERT(root_thread->th.th_serial_team); 3787 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3788 root_thread->th.th_serial_team)); 3789 3790 /* drop root_thread into place */ 3791 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3792 3793 root->r.r_root_team->t.t_threads[0] = root_thread; 3794 root->r.r_hot_team->t.t_threads[0] = root_thread; 3795 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3796 // AC: the team created in reserve, not for execution (it is unused for now). 3797 root_thread->th.th_serial_team->t.t_serialized = 0; 3798 root->r.r_uber_thread = root_thread; 3799 3800 /* initialize the thread, get it ready to go */ 3801 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3802 TCW_4(__kmp_init_gtid, TRUE); 3803 3804 /* prepare the master thread for get_gtid() */ 3805 __kmp_gtid_set_specific(gtid); 3806 3807 #if USE_ITT_BUILD 3808 __kmp_itt_thread_name(gtid); 3809 #endif /* USE_ITT_BUILD */ 3810 3811 #ifdef KMP_TDATA_GTID 3812 __kmp_gtid = gtid; 3813 #endif 3814 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3815 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3816 3817 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3818 "plain=%u\n", 3819 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3820 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3821 KMP_INIT_BARRIER_STATE)); 3822 { // Initialize barrier data. 
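// Note added for clarity: the loop below seeds the b_arrived counter of every
// barrier type (plain, fork/join, ...) for the new root with
// KMP_INIT_BARRIER_STATE, matching the "arrived: join/plain" values reported
// by the trace above.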
3823 int b; 3824 for (b = 0; b < bs_last_barrier; ++b) { 3825 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3826 #if USE_DEBUGGER 3827 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3828 #endif 3829 } 3830 } 3831 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3832 KMP_INIT_BARRIER_STATE); 3833 3834 #if KMP_AFFINITY_SUPPORTED 3835 #if OMP_40_ENABLED 3836 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3837 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3838 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3839 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3840 #endif 3841 3842 if (TCR_4(__kmp_init_middle)) { 3843 __kmp_affinity_set_init_mask(gtid, TRUE); 3844 } 3845 #endif /* KMP_AFFINITY_SUPPORTED */ 3846 3847 __kmp_root_counter++; 3848 3849 #if OMPT_SUPPORT 3850 if (!initial_thread && ompt_enabled.enabled) { 3851 3852 ompt_thread_t *root_thread = ompt_get_thread(); 3853 3854 ompt_set_thread_state(root_thread, omp_state_overhead); 3855 3856 if (ompt_enabled.ompt_callback_thread_begin) { 3857 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3858 ompt_thread_initial, __ompt_get_thread_data_internal()); 3859 } 3860 ompt_data_t *task_data; 3861 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 3862 if (ompt_enabled.ompt_callback_task_create) { 3863 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 3864 NULL, NULL, task_data, ompt_task_initial, 0, NULL); 3865 // initial task has nothing to return to 3866 } 3867 3868 ompt_set_thread_state(root_thread, omp_state_work_serial); 3869 } 3870 #endif 3871 3872 KMP_MB(); 3873 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3874 3875 return gtid; 3876 } 3877 3878 #if KMP_NESTED_HOT_TEAMS 3879 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3880 const int max_level) { 3881 int i, n, nth; 3882 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3883 if (!hot_teams || !hot_teams[level].hot_team) { 3884 return 0; 3885 } 3886 KMP_DEBUG_ASSERT(level < max_level); 3887 kmp_team_t *team = hot_teams[level].hot_team; 3888 nth = hot_teams[level].hot_team_nth; 3889 n = nth - 1; // master is not freed 3890 if (level < max_level - 1) { 3891 for (i = 0; i < nth; ++i) { 3892 kmp_info_t *th = team->t.t_threads[i]; 3893 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3894 if (i > 0 && th->th.th_hot_teams) { 3895 __kmp_free(th->th.th_hot_teams); 3896 th->th.th_hot_teams = NULL; 3897 } 3898 } 3899 } 3900 __kmp_free_team(root, team, NULL); 3901 return n; 3902 } 3903 #endif 3904 3905 // Resets a root thread and clear its root and hot teams. 3906 // Returns the number of __kmp_threads entries directly and indirectly freed. 3907 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3908 kmp_team_t *root_team = root->r.r_root_team; 3909 kmp_team_t *hot_team = root->r.r_hot_team; 3910 int n = hot_team->t.t_nproc; 3911 int i; 3912 3913 KMP_DEBUG_ASSERT(!root->r.r_active); 3914 3915 root->r.r_root_team = NULL; 3916 root->r.r_hot_team = NULL; 3917 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3918 // before call to __kmp_free_team(). 
__kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3920 #if KMP_NESTED_HOT_TEAMS 3921 if (__kmp_hot_teams_max_level > 3922 0) { // need to free nested hot teams and their threads if any 3923 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3924 kmp_info_t *th = hot_team->t.t_threads[i]; 3925 if (__kmp_hot_teams_max_level > 1) { 3926 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3927 } 3928 if (th->th.th_hot_teams) { 3929 __kmp_free(th->th.th_hot_teams); 3930 th->th.th_hot_teams = NULL; 3931 } 3932 } 3933 } 3934 #endif 3935 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3936 3937 // Before we can reap the thread, we need to make certain that all other 3938 // threads in the teams that had this root as ancestor have stopped trying to 3939 // steal tasks. 3940 if (__kmp_tasking_mode != tskm_immediate_exec) { 3941 __kmp_wait_to_unref_task_teams(); 3942 } 3943 3944 #if KMP_OS_WINDOWS 3945 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3946 KA_TRACE( 3947 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3948 "\n", 3949 (LPVOID) & (root->r.r_uber_thread->th), 3950 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3951 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3952 #endif /* KMP_OS_WINDOWS */ 3953 3954 #if OMPT_SUPPORT 3955 if (ompt_enabled.ompt_callback_thread_end) { 3956 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3957 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3958 } 3959 #endif 3960 3961 TCW_4(__kmp_nth, 3962 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3963 root->r.r_cg_nthreads--; 3964 3965 __kmp_reap_thread(root->r.r_uber_thread, 1); 3966 3967 // We cannot put the root thread into __kmp_thread_pool, so we have to reap it 3968 // instead of freeing it. 3969 root->r.r_uber_thread = NULL; 3970 /* mark root as no longer in use */ 3971 root->r.r_begin = FALSE; 3972 3973 return n; 3974 } 3975 3976 void __kmp_unregister_root_current_thread(int gtid) { 3977 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3978 /* this lock should be ok, since unregister_root_current_thread is never 3979 called during an abort, only during a normal close.
furthermore, if you 3980 have the forkjoin lock, you should never try to get the initz lock */ 3981 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3982 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3983 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3984 "exiting T#%d\n", 3985 gtid)); 3986 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3987 return; 3988 } 3989 kmp_root_t *root = __kmp_root[gtid]; 3990 3991 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3992 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3993 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3994 KMP_ASSERT(root->r.r_active == FALSE); 3995 3996 KMP_MB(); 3997 3998 #if OMP_45_ENABLED 3999 kmp_info_t *thread = __kmp_threads[gtid]; 4000 kmp_team_t *team = thread->th.th_team; 4001 kmp_task_team_t *task_team = thread->th.th_task_team; 4002 4003 // we need to wait for the proxy tasks before finishing the thread 4004 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4005 #if OMPT_SUPPORT 4006 // the runtime is shutting down so we won't report any events 4007 thread->th.ompt_thread_info.state = omp_state_undefined; 4008 #endif 4009 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4010 } 4011 #endif 4012 4013 __kmp_reset_root(gtid, root); 4014 4015 /* free up this thread slot */ 4016 __kmp_gtid_set_specific(KMP_GTID_DNE); 4017 #ifdef KMP_TDATA_GTID 4018 __kmp_gtid = KMP_GTID_DNE; 4019 #endif 4020 4021 KMP_MB(); 4022 KC_TRACE(10, 4023 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4024 4025 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4026 } 4027 4028 #if KMP_OS_WINDOWS 4029 /* __kmp_forkjoin_lock must be already held 4030 Unregisters a root thread that is not the current thread. Returns the number 4031 of __kmp_threads entries freed as a result. */ 4032 static int __kmp_unregister_root_other_thread(int gtid) { 4033 kmp_root_t *root = __kmp_root[gtid]; 4034 int r; 4035 4036 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4037 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4038 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4039 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4040 KMP_ASSERT(root->r.r_active == FALSE); 4041 4042 r = __kmp_reset_root(gtid, root); 4043 KC_TRACE(10, 4044 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4045 return r; 4046 } 4047 #endif 4048 4049 #if KMP_DEBUG 4050 void __kmp_task_info() { 4051 4052 kmp_int32 gtid = __kmp_entry_gtid(); 4053 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4054 kmp_info_t *this_thr = __kmp_threads[gtid]; 4055 kmp_team_t *steam = this_thr->th.th_serial_team; 4056 kmp_team_t *team = this_thr->th.th_team; 4057 4058 __kmp_printf("__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p curtask=%p " 4059 "ptask=%p\n", 4060 gtid, tid, this_thr, team, this_thr->th.th_current_task, 4061 team->t.t_implicit_task_taskdata[tid].td_parent); 4062 } 4063 #endif // KMP_DEBUG 4064 4065 /* TODO optimize with one big memclr, take out what isn't needed, split 4066 responsibility to workers as much as possible, and delay initialization of 4067 features as much as possible */ 4068 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4069 int tid, int gtid) { 4070 /* this_thr->th.th_info.ds.ds_gtid is setup in 4071 kmp_allocate_thread/create_worker. 
4072 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4073 kmp_info_t *master = team->t.t_threads[0]; 4074 KMP_DEBUG_ASSERT(this_thr != NULL); 4075 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4076 KMP_DEBUG_ASSERT(team); 4077 KMP_DEBUG_ASSERT(team->t.t_threads); 4078 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4079 KMP_DEBUG_ASSERT(master); 4080 KMP_DEBUG_ASSERT(master->th.th_root); 4081 4082 KMP_MB(); 4083 4084 TCW_SYNC_PTR(this_thr->th.th_team, team); 4085 4086 this_thr->th.th_info.ds.ds_tid = tid; 4087 this_thr->th.th_set_nproc = 0; 4088 if (__kmp_tasking_mode != tskm_immediate_exec) 4089 // When tasking is possible, threads are not safe to reap until they are 4090 // done tasking; this will be set when tasking code is exited in wait 4091 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4092 else // no tasking --> always safe to reap 4093 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4094 #if OMP_40_ENABLED 4095 this_thr->th.th_set_proc_bind = proc_bind_default; 4096 #if KMP_AFFINITY_SUPPORTED 4097 this_thr->th.th_new_place = this_thr->th.th_current_place; 4098 #endif 4099 #endif 4100 this_thr->th.th_root = master->th.th_root; 4101 4102 /* setup the thread's cache of the team structure */ 4103 this_thr->th.th_team_nproc = team->t.t_nproc; 4104 this_thr->th.th_team_master = master; 4105 this_thr->th.th_team_serialized = team->t.t_serialized; 4106 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4107 4108 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4109 4110 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4111 tid, gtid, this_thr, this_thr->th.th_current_task)); 4112 4113 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4114 team, tid, TRUE); 4115 4116 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4117 tid, gtid, this_thr, this_thr->th.th_current_task)); 4118 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4119 // __kmp_initialize_team()? 4120 4121 /* TODO no worksharing in speculative threads */ 4122 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4123 4124 this_thr->th.th_local.this_construct = 0; 4125 4126 if (!this_thr->th.th_pri_common) { 4127 this_thr->th.th_pri_common = 4128 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4129 if (__kmp_storage_map) { 4130 __kmp_print_storage_map_gtid( 4131 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4132 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4133 } 4134 this_thr->th.th_pri_head = NULL; 4135 } 4136 4137 /* Initialize dynamic dispatch */ 4138 { 4139 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4140 // Use team max_nproc since this will never change for the team. 4141 size_t disp_size = 4142 sizeof(dispatch_private_info_t) * 4143 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4144 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4145 team->t.t_max_nproc)); 4146 KMP_ASSERT(dispatch); 4147 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4148 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4149 4150 dispatch->th_disp_index = 0; 4151 #if OMP_45_ENABLED 4152 dispatch->th_doacross_buf_idx = 0; 4153 #endif 4154 if (!dispatch->th_disp_buffer) { 4155 dispatch->th_disp_buffer = 4156 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4157 4158 if (__kmp_storage_map) { 4159 __kmp_print_storage_map_gtid( 4160 gtid, &dispatch->th_disp_buffer[0], 4161 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4162 ? 
1 4163 : __kmp_dispatch_num_buffers], 4164 disp_size, "th_%d.th_dispatch.th_disp_buffer " 4165 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4166 gtid, team->t.t_id, gtid); 4167 } 4168 } else { 4169 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4170 } 4171 4172 dispatch->th_dispatch_pr_current = 0; 4173 dispatch->th_dispatch_sh_current = 0; 4174 4175 dispatch->th_deo_fcn = 0; /* ORDERED */ 4176 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4177 } 4178 4179 this_thr->th.th_next_pool = NULL; 4180 4181 if (!this_thr->th.th_task_state_memo_stack) { 4182 size_t i; 4183 this_thr->th.th_task_state_memo_stack = 4184 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4185 this_thr->th.th_task_state_top = 0; 4186 this_thr->th.th_task_state_stack_sz = 4; 4187 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4188 ++i) // zero init the stack 4189 this_thr->th.th_task_state_memo_stack[i] = 0; 4190 } 4191 4192 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4193 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4194 4195 KMP_MB(); 4196 } 4197 4198 /* allocate a new thread for the requesting team. this is only called from 4199 within a forkjoin critical section. we will first try to get an available 4200 thread from the thread pool. if none is available, we will fork a new one 4201 assuming we are able to create a new one. this should be assured, as the 4202 caller should check on this first. */ 4203 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4204 int new_tid) { 4205 kmp_team_t *serial_team; 4206 kmp_info_t *new_thr; 4207 int new_gtid; 4208 4209 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4210 KMP_DEBUG_ASSERT(root && team); 4211 #if !KMP_NESTED_HOT_TEAMS 4212 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4213 #endif 4214 KMP_MB(); 4215 4216 /* first, try to get one from the thread pool */ 4217 if (__kmp_thread_pool) { 4218 4219 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4220 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4221 if (new_thr == __kmp_thread_pool_insert_pt) { 4222 __kmp_thread_pool_insert_pt = NULL; 4223 } 4224 TCW_4(new_thr->th.th_in_pool, FALSE); 4225 // Don't touch th_active_in_pool or th_active. 4226 // The worker thread adjusts those flags as it sleeps/awakens. 4227 __kmp_thread_pool_nth--; 4228 4229 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4230 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4231 KMP_ASSERT(!new_thr->th.th_team); 4232 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4233 KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0); 4234 4235 /* setup the thread structure */ 4236 __kmp_initialize_info(new_thr, team, new_tid, 4237 new_thr->th.th_info.ds.ds_gtid); 4238 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4239 4240 TCW_4(__kmp_nth, __kmp_nth + 1); 4241 root->r.r_cg_nthreads++; 4242 4243 new_thr->th.th_task_state = 0; 4244 new_thr->th.th_task_state_top = 0; 4245 new_thr->th.th_task_state_stack_sz = 4; 4246 4247 #ifdef KMP_ADJUST_BLOCKTIME 4248 /* Adjust blocktime back to zero if necessary */ 4249 /* Middle initialization might not have occurred yet */ 4250 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4251 if (__kmp_nth > __kmp_avail_proc) { 4252 __kmp_zero_bt = TRUE; 4253 } 4254 } 4255 #endif /* KMP_ADJUST_BLOCKTIME */ 4256 4257 #if KMP_DEBUG 4258 // If thread entered pool via __kmp_free_thread, wait_flag should != 4259 // KMP_BARRIER_PARENT_FLAG. 
4260 int b; 4261 kmp_balign_t *balign = new_thr->th.th_bar; 4262 for (b = 0; b < bs_last_barrier; ++b) 4263 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4264 #endif 4265 4266 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4267 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4268 4269 KMP_MB(); 4270 return new_thr; 4271 } 4272 4273 /* no, well fork a new one */ 4274 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4275 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4276 4277 #if KMP_USE_MONITOR 4278 // If this is the first worker thread the RTL is creating, then also 4279 // launch the monitor thread. We try to do this as early as possible. 4280 if (!TCR_4(__kmp_init_monitor)) { 4281 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4282 if (!TCR_4(__kmp_init_monitor)) { 4283 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4284 TCW_4(__kmp_init_monitor, 1); 4285 __kmp_create_monitor(&__kmp_monitor); 4286 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4287 #if KMP_OS_WINDOWS 4288 // AC: wait until monitor has started. This is a fix for CQ232808. 4289 // The reason is that if the library is loaded/unloaded in a loop with 4290 // small (parallel) work in between, then there is high probability that 4291 // monitor thread started after the library shutdown. At shutdown it is 4292 // too late to cope with the problem, because when the master is in 4293 // DllMain (process detach) the monitor has no chances to start (it is 4294 // blocked), and master has no means to inform the monitor that the 4295 // library has gone, because all the memory which the monitor can access 4296 // is going to be released/reset. 4297 while (TCR_4(__kmp_init_monitor) < 2) { 4298 KMP_YIELD(TRUE); 4299 } 4300 KF_TRACE(10, ("after monitor thread has started\n")); 4301 #endif 4302 } 4303 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4304 } 4305 #endif 4306 4307 KMP_MB(); 4308 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) { 4309 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4310 } 4311 4312 /* allocate space for it. */ 4313 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4314 4315 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4316 4317 if (__kmp_storage_map) { 4318 __kmp_print_thread_storage_map(new_thr, new_gtid); 4319 } 4320 4321 // add the reserve serialized team, initialized from the team's master thread 4322 { 4323 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4324 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4325 new_thr->th.th_serial_team = serial_team = 4326 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4327 #if OMPT_SUPPORT 4328 ompt_data_none, // root parallel id 4329 #endif 4330 #if OMP_40_ENABLED 4331 proc_bind_default, 4332 #endif 4333 &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 4334 } 4335 KMP_ASSERT(serial_team); 4336 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4337 // execution (it is unused for now). 
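// Note added for clarity: each thread keeps this serial team in reserve so
// that a serialized (nested or one-thread) parallel region encountered later
// by the thread can reuse it instead of allocating a new team.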
4338 serial_team->t.t_threads[0] = new_thr; 4339 KF_TRACE(10, 4340 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4341 new_thr)); 4342 4343 /* setup the thread structures */ 4344 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4345 4346 #if USE_FAST_MEMORY 4347 __kmp_initialize_fast_memory(new_thr); 4348 #endif /* USE_FAST_MEMORY */ 4349 4350 #if KMP_USE_BGET 4351 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4352 __kmp_initialize_bget(new_thr); 4353 #endif 4354 4355 __kmp_init_random(new_thr); // Initialize random number generator 4356 4357 /* Initialize these only once when thread is grabbed for a team allocation */ 4358 KA_TRACE(20, 4359 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4360 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4361 4362 int b; 4363 kmp_balign_t *balign = new_thr->th.th_bar; 4364 for (b = 0; b < bs_last_barrier; ++b) { 4365 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4366 balign[b].bb.team = NULL; 4367 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4368 balign[b].bb.use_oncore_barrier = 0; 4369 } 4370 4371 new_thr->th.th_spin_here = FALSE; 4372 new_thr->th.th_next_waiting = 0; 4373 4374 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 4375 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4376 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4377 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4378 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4379 #endif 4380 4381 TCW_4(new_thr->th.th_in_pool, FALSE); 4382 new_thr->th.th_active_in_pool = FALSE; 4383 TCW_4(new_thr->th.th_active, TRUE); 4384 4385 /* adjust the global counters */ 4386 __kmp_all_nth++; 4387 __kmp_nth++; 4388 4389 root->r.r_cg_nthreads++; 4390 4391 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4392 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4393 if (__kmp_adjust_gtid_mode) { 4394 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4395 if (TCR_4(__kmp_gtid_mode) != 2) { 4396 TCW_4(__kmp_gtid_mode, 2); 4397 } 4398 } else { 4399 if (TCR_4(__kmp_gtid_mode) != 1) { 4400 TCW_4(__kmp_gtid_mode, 1); 4401 } 4402 } 4403 } 4404 4405 #ifdef KMP_ADJUST_BLOCKTIME 4406 /* Adjust blocktime back to zero if necessary */ 4407 /* Middle initialization might not have occurred yet */ 4408 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4409 if (__kmp_nth > __kmp_avail_proc) { 4410 __kmp_zero_bt = TRUE; 4411 } 4412 } 4413 #endif /* KMP_ADJUST_BLOCKTIME */ 4414 4415 /* actually fork it and create the new worker thread */ 4416 KF_TRACE( 4417 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4418 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4419 KF_TRACE(10, 4420 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4421 4422 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4423 new_gtid)); 4424 KMP_MB(); 4425 return new_thr; 4426 } 4427 4428 /* Reinitialize team for reuse. 4429 The hot team code calls this case at every fork barrier, so EPCC barrier 4430 test are extremely sensitive to changes in it, esp. writes to the team 4431 struct, which cause a cache invalidation in all threads. 4432 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
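   (Note added for clarity: this is also why the body below prefers
   KMP_CHECK_UPDATE() over plain assignments -- a field is written only when
   its value actually changes, so unchanged cache lines of the team structure
   are not needlessly invalidated in the worker threads.)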
*/ 4433 static void __kmp_reinitialize_team(kmp_team_t *team, 4434 kmp_internal_control_t *new_icvs, 4435 ident_t *loc) { 4436 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4437 team->t.t_threads[0], team)); 4438 KMP_DEBUG_ASSERT(team && new_icvs); 4439 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4440 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4441 4442 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4443 // Copy ICVs to the master thread's implicit taskdata 4444 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4445 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4446 4447 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4448 team->t.t_threads[0], team)); 4449 } 4450 4451 /* Initialize the team data structure. 4452 This assumes the t_threads and t_max_nproc are already set. 4453 Also, we don't touch the arguments */ 4454 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4455 kmp_internal_control_t *new_icvs, 4456 ident_t *loc) { 4457 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4458 4459 /* verify */ 4460 KMP_DEBUG_ASSERT(team); 4461 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4462 KMP_DEBUG_ASSERT(team->t.t_threads); 4463 KMP_MB(); 4464 4465 team->t.t_master_tid = 0; /* not needed */ 4466 /* team->t.t_master_bar; not needed */ 4467 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4468 team->t.t_nproc = new_nproc; 4469 4470 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4471 team->t.t_next_pool = NULL; 4472 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4473 * up hot team */ 4474 4475 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4476 team->t.t_invoke = NULL; /* not needed */ 4477 4478 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4479 team->t.t_sched = new_icvs->sched; 4480 4481 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4482 team->t.t_fp_control_saved = FALSE; /* not needed */ 4483 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4484 team->t.t_mxcsr = 0; /* not needed */ 4485 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4486 4487 team->t.t_construct = 0; 4488 4489 team->t.t_ordered.dt.t_value = 0; 4490 team->t.t_master_active = FALSE; 4491 4492 memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t)); 4493 4494 #ifdef KMP_DEBUG 4495 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4496 #endif 4497 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4498 4499 team->t.t_control_stack_top = NULL; 4500 4501 __kmp_reinitialize_team(team, new_icvs, loc); 4502 4503 KMP_MB(); 4504 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4505 } 4506 4507 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 4508 /* Sets full mask for thread and returns old mask, no changes to structures. */ 4509 static void 4510 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4511 if (KMP_AFFINITY_CAPABLE()) { 4512 int status; 4513 if (old_mask != NULL) { 4514 status = __kmp_get_system_affinity(old_mask, TRUE); 4515 int error = errno; 4516 if (status != 0) { 4517 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4518 __kmp_msg_null); 4519 } 4520 } 4521 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4522 } 4523 } 4524 #endif 4525 4526 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 4527 4528 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 
// It calculates the worker + master thread's partition based upon the parent 4530 // thread's partition, and binds each worker to a place in its partition. 4531 // The master thread's partition should already include its current binding. 4532 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4533 // Copy the master thread's place partition to the team struct 4534 kmp_info_t *master_th = team->t.t_threads[0]; 4535 KMP_DEBUG_ASSERT(master_th != NULL); 4536 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4537 int first_place = master_th->th.th_first_place; 4538 int last_place = master_th->th.th_last_place; 4539 int masters_place = master_th->th.th_current_place; 4540 team->t.t_first_place = first_place; 4541 team->t.t_last_place = last_place; 4542 4543 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4544 "bound to place %d partition = [%d,%d]\n", 4545 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4546 team->t.t_id, masters_place, first_place, last_place)); 4547 4548 switch (proc_bind) { 4549 4550 case proc_bind_default: 4551 // serial teams might have the proc_bind policy set to proc_bind_default. It 4552 // doesn't matter, as we don't rebind the master thread for any proc_bind policy 4553 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4554 break; 4555 4556 case proc_bind_master: { 4557 int f; 4558 int n_th = team->t.t_nproc; 4559 for (f = 1; f < n_th; f++) { 4560 kmp_info_t *th = team->t.t_threads[f]; 4561 KMP_DEBUG_ASSERT(th != NULL); 4562 th->th.th_first_place = first_place; 4563 th->th.th_last_place = last_place; 4564 th->th.th_new_place = masters_place; 4565 4566 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4567 "partition = [%d,%d]\n", 4568 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4569 f, masters_place, first_place, last_place)); 4570 } 4571 } break; 4572 4573 case proc_bind_close: { 4574 int f; 4575 int n_th = team->t.t_nproc; 4576 int n_places; 4577 if (first_place <= last_place) { 4578 n_places = last_place - first_place + 1; 4579 } else { 4580 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4581 } 4582 if (n_th <= n_places) { 4583 int place = masters_place; 4584 for (f = 1; f < n_th; f++) { 4585 kmp_info_t *th = team->t.t_threads[f]; 4586 KMP_DEBUG_ASSERT(th != NULL); 4587 4588 if (place == last_place) { 4589 place = first_place; 4590 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4591 place = 0; 4592 } else { 4593 place++; 4594 } 4595 th->th.th_first_place = first_place; 4596 th->th.th_last_place = last_place; 4597 th->th.th_new_place = place; 4598 4599 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4600 "partition = [%d,%d]\n", 4601 __kmp_gtid_from_thread(team->t.t_threads[f]), 4602 team->t.t_id, f, place, first_place, last_place)); 4603 } 4604 } else { 4605 int S, rem, gap, s_count; 4606 S = n_th / n_places; 4607 s_count = 0; 4608 rem = n_th - (S * n_places); 4609 gap = rem > 0 ?
n_places / rem : n_places; 4610 int place = masters_place; 4611 int gap_ct = gap; 4612 for (f = 0; f < n_th; f++) { 4613 kmp_info_t *th = team->t.t_threads[f]; 4614 KMP_DEBUG_ASSERT(th != NULL); 4615 4616 th->th.th_first_place = first_place; 4617 th->th.th_last_place = last_place; 4618 th->th.th_new_place = place; 4619 s_count++; 4620 4621 if ((s_count == S) && rem && (gap_ct == gap)) { 4622 // do nothing, add an extra thread to place on next iteration 4623 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4624 // we added an extra thread to this place; move to next place 4625 if (place == last_place) { 4626 place = first_place; 4627 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4628 place = 0; 4629 } else { 4630 place++; 4631 } 4632 s_count = 0; 4633 gap_ct = 1; 4634 rem--; 4635 } else if (s_count == S) { // place full; don't add extra 4636 if (place == last_place) { 4637 place = first_place; 4638 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4639 place = 0; 4640 } else { 4641 place++; 4642 } 4643 gap_ct++; 4644 s_count = 0; 4645 } 4646 4647 KA_TRACE(100, 4648 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4649 "partition = [%d,%d]\n", 4650 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4651 th->th.th_new_place, first_place, last_place)); 4652 } 4653 KMP_DEBUG_ASSERT(place == masters_place); 4654 } 4655 } break; 4656 4657 case proc_bind_spread: { 4658 int f; 4659 int n_th = team->t.t_nproc; 4660 int n_places; 4661 int thidx; 4662 if (first_place <= last_place) { 4663 n_places = last_place - first_place + 1; 4664 } else { 4665 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4666 } 4667 if (n_th <= n_places) { 4668 int place = -1; 4669 4670 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4671 int S = n_places / n_th; 4672 int s_count, rem, gap, gap_ct; 4673 4674 place = masters_place; 4675 rem = n_places - n_th * S; 4676 gap = rem ? n_th / rem : 1; 4677 gap_ct = gap; 4678 thidx = n_th; 4679 if (update_master_only == 1) 4680 thidx = 1; 4681 for (f = 0; f < thidx; f++) { 4682 kmp_info_t *th = team->t.t_threads[f]; 4683 KMP_DEBUG_ASSERT(th != NULL); 4684 4685 th->th.th_first_place = place; 4686 th->th.th_new_place = place; 4687 s_count = 1; 4688 while (s_count < S) { 4689 if (place == last_place) { 4690 place = first_place; 4691 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4692 place = 0; 4693 } else { 4694 place++; 4695 } 4696 s_count++; 4697 } 4698 if (rem && (gap_ct == gap)) { 4699 if (place == last_place) { 4700 place = first_place; 4701 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4702 place = 0; 4703 } else { 4704 place++; 4705 } 4706 rem--; 4707 gap_ct = 0; 4708 } 4709 th->th.th_last_place = place; 4710 gap_ct++; 4711 4712 if (place == last_place) { 4713 place = first_place; 4714 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4715 place = 0; 4716 } else { 4717 place++; 4718 } 4719 4720 KA_TRACE(100, 4721 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4722 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4723 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4724 f, th->th.th_new_place, th->th.th_first_place, 4725 th->th.th_last_place, __kmp_affinity_num_masks)); 4726 } 4727 } else { 4728 /* Having uniform space of available computation places I can create 4729 T partitions of round(P/T) size and put threads into the first 4730 place of each partition. 
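      An illustrative example with hypothetical numbers: for a master partition
      of 8 places [0,7], the master currently on place 0 and 3 threads, the
      spacing is (8 + 1) / 3 = 3.0, so the threads get sub-partitions [0,2],
      [3,5] and [6,7] and are bound to the first place of each, i.e. places
      0, 3 and 6.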
*/ 4731 double current = static_cast<double>(masters_place); 4732 double spacing = 4733 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4734 int first, last; 4735 kmp_info_t *th; 4736 4737 thidx = n_th + 1; 4738 if (update_master_only == 1) 4739 thidx = 1; 4740 for (f = 0; f < thidx; f++) { 4741 first = static_cast<int>(current); 4742 last = static_cast<int>(current + spacing) - 1; 4743 KMP_DEBUG_ASSERT(last >= first); 4744 if (first >= n_places) { 4745 if (masters_place) { 4746 first -= n_places; 4747 last -= n_places; 4748 if (first == (masters_place + 1)) { 4749 KMP_DEBUG_ASSERT(f == n_th); 4750 first--; 4751 } 4752 if (last == masters_place) { 4753 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4754 last--; 4755 } 4756 } else { 4757 KMP_DEBUG_ASSERT(f == n_th); 4758 first = 0; 4759 last = 0; 4760 } 4761 } 4762 if (last >= n_places) { 4763 last = (n_places - 1); 4764 } 4765 place = first; 4766 current += spacing; 4767 if (f < n_th) { 4768 KMP_DEBUG_ASSERT(0 <= first); 4769 KMP_DEBUG_ASSERT(n_places > first); 4770 KMP_DEBUG_ASSERT(0 <= last); 4771 KMP_DEBUG_ASSERT(n_places > last); 4772 KMP_DEBUG_ASSERT(last_place >= first_place); 4773 th = team->t.t_threads[f]; 4774 KMP_DEBUG_ASSERT(th); 4775 th->th.th_first_place = first; 4776 th->th.th_new_place = place; 4777 th->th.th_last_place = last; 4778 4779 KA_TRACE(100, 4780 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4781 "partition = [%d,%d], spacing = %.4f\n", 4782 __kmp_gtid_from_thread(team->t.t_threads[f]), 4783 team->t.t_id, f, th->th.th_new_place, 4784 th->th.th_first_place, th->th.th_last_place, spacing)); 4785 } 4786 } 4787 } 4788 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4789 } else { 4790 int S, rem, gap, s_count; 4791 S = n_th / n_places; 4792 s_count = 0; 4793 rem = n_th - (S * n_places); 4794 gap = rem > 0 ? 
n_places / rem : n_places; 4795 int place = masters_place; 4796 int gap_ct = gap; 4797 thidx = n_th; 4798 if (update_master_only == 1) 4799 thidx = 1; 4800 for (f = 0; f < thidx; f++) { 4801 kmp_info_t *th = team->t.t_threads[f]; 4802 KMP_DEBUG_ASSERT(th != NULL); 4803 4804 th->th.th_first_place = place; 4805 th->th.th_last_place = place; 4806 th->th.th_new_place = place; 4807 s_count++; 4808 4809 if ((s_count == S) && rem && (gap_ct == gap)) { 4810 // do nothing, add an extra thread to place on next iteration 4811 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4812 // we added an extra thread to this place; move on to next place 4813 if (place == last_place) { 4814 place = first_place; 4815 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4816 place = 0; 4817 } else { 4818 place++; 4819 } 4820 s_count = 0; 4821 gap_ct = 1; 4822 rem--; 4823 } else if (s_count == S) { // place is full; don't add extra thread 4824 if (place == last_place) { 4825 place = first_place; 4826 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4827 place = 0; 4828 } else { 4829 place++; 4830 } 4831 gap_ct++; 4832 s_count = 0; 4833 } 4834 4835 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4836 "partition = [%d,%d]\n", 4837 __kmp_gtid_from_thread(team->t.t_threads[f]), 4838 team->t.t_id, f, th->th.th_new_place, 4839 th->th.th_first_place, th->th.th_last_place)); 4840 } 4841 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4842 } 4843 } break; 4844 4845 default: 4846 break; 4847 } 4848 4849 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4850 } 4851 4852 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */ 4853 4854 /* allocate a new team data structure to use. take one off of the free pool if 4855 available */ 4856 kmp_team_t * 4857 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4858 #if OMPT_SUPPORT 4859 ompt_data_t ompt_parallel_data, 4860 #endif 4861 #if OMP_40_ENABLED 4862 kmp_proc_bind_t new_proc_bind, 4863 #endif 4864 kmp_internal_control_t *new_icvs, 4865 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4866 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4867 int f; 4868 kmp_team_t *team; 4869 int use_hot_team = !root->r.r_active; 4870 int level = 0; 4871 4872 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4873 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4874 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4875 KMP_MB(); 4876 4877 #if KMP_NESTED_HOT_TEAMS 4878 kmp_hot_team_ptr_t *hot_teams; 4879 if (master) { 4880 team = master->th.th_team; 4881 level = team->t.t_active_level; 4882 if (master->th.th_teams_microtask) { // in teams construct? 
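// Note added for clarity: 'level' indexes master->th.th_hot_teams below; the
// check that follows bumps it by one for the inner fork of a teams construct
// (or a parallel nested inside it) when more than one team is requested, so
// those forks do not reuse the hot team of the enclosing level.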
4883 if (master->th.th_teams_size.nteams > 1 && 4884 ( // #teams > 1 4885 team->t.t_pkfn == 4886 (microtask_t)__kmp_teams_master || // inner fork of the teams 4887 master->th.th_teams_level < 4888 team->t.t_level)) { // or nested parallel inside the teams 4889 ++level; // not increment if #teams==1, or for outer fork of the teams; 4890 // increment otherwise 4891 } 4892 } 4893 hot_teams = master->th.th_hot_teams; 4894 if (level < __kmp_hot_teams_max_level && hot_teams && 4895 hot_teams[level] 4896 .hot_team) { // hot team has already been allocated for given level 4897 use_hot_team = 1; 4898 } else { 4899 use_hot_team = 0; 4900 } 4901 } 4902 #endif 4903 // Optimization to use a "hot" team 4904 if (use_hot_team && new_nproc > 1) { 4905 KMP_DEBUG_ASSERT(new_nproc == max_nproc); 4906 #if KMP_NESTED_HOT_TEAMS 4907 team = hot_teams[level].hot_team; 4908 #else 4909 team = root->r.r_hot_team; 4910 #endif 4911 #if KMP_DEBUG 4912 if (__kmp_tasking_mode != tskm_immediate_exec) { 4913 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4914 "task_team[1] = %p before reinit\n", 4915 team->t.t_task_team[0], team->t.t_task_team[1])); 4916 } 4917 #endif 4918 4919 // Has the number of threads changed? 4920 /* Let's assume the most common case is that the number of threads is 4921 unchanged, and put that case first. */ 4922 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4923 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4924 // This case can mean that omp_set_num_threads() was called and the hot 4925 // team size was already reduced, so we check the special flag 4926 if (team->t.t_size_changed == -1) { 4927 team->t.t_size_changed = 1; 4928 } else { 4929 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4930 } 4931 4932 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4933 kmp_r_sched_t new_sched = new_icvs->sched; 4934 if (team->t.t_sched.r_sched_type != new_sched.r_sched_type || 4935 team->t.t_sched.chunk != new_sched.chunk) 4936 team->t.t_sched = 4937 new_sched; // set master's schedule as new run-time schedule 4938 4939 __kmp_reinitialize_team(team, new_icvs, 4940 root->r.r_uber_thread->th.th_ident); 4941 4942 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 4943 team->t.t_threads[0], team)); 4944 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4945 4946 #if OMP_40_ENABLED 4947 #if KMP_AFFINITY_SUPPORTED 4948 if ((team->t.t_size_changed == 0) && 4949 (team->t.t_proc_bind == new_proc_bind)) { 4950 if (new_proc_bind == proc_bind_spread) { 4951 __kmp_partition_places( 4952 team, 1); // add flag to update only master for spread 4953 } 4954 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 4955 "proc_bind = %d, partition = [%d,%d]\n", 4956 team->t.t_id, new_proc_bind, team->t.t_first_place, 4957 team->t.t_last_place)); 4958 } else { 4959 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4960 __kmp_partition_places(team); 4961 } 4962 #else 4963 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4964 #endif /* KMP_AFFINITY_SUPPORTED */ 4965 #endif /* OMP_40_ENABLED */ 4966 } else if (team->t.t_nproc > new_nproc) { 4967 KA_TRACE(20, 4968 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 4969 new_nproc)); 4970 4971 team->t.t_size_changed = 1; 4972 #if KMP_NESTED_HOT_TEAMS 4973 if (__kmp_hot_teams_mode == 0) { 4974 // AC: saved number of threads should correspond to team's value in this 4975 // mode, can be bigger in mode 1, when hot team has threads in reserve 4976 
KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 4977 hot_teams[level].hot_team_nth = new_nproc; 4978 #endif // KMP_NESTED_HOT_TEAMS 4979 /* release the extra threads we don't need any more */ 4980 for (f = new_nproc; f < team->t.t_nproc; f++) { 4981 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 4982 if (__kmp_tasking_mode != tskm_immediate_exec) { 4983 // When decreasing team size, threads no longer in the team should 4984 // unref task team. 4985 team->t.t_threads[f]->th.th_task_team = NULL; 4986 } 4987 __kmp_free_thread(team->t.t_threads[f]); 4988 team->t.t_threads[f] = NULL; 4989 } 4990 #if KMP_NESTED_HOT_TEAMS 4991 } // (__kmp_hot_teams_mode == 0) 4992 else { 4993 // When keeping extra threads in team, switch threads to wait on own 4994 // b_go flag 4995 for (f = new_nproc; f < team->t.t_nproc; ++f) { 4996 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 4997 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 4998 for (int b = 0; b < bs_last_barrier; ++b) { 4999 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5000 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5001 } 5002 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5003 } 5004 } 5005 } 5006 #endif // KMP_NESTED_HOT_TEAMS 5007 team->t.t_nproc = new_nproc; 5008 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5009 if (team->t.t_sched.r_sched_type != new_icvs->sched.r_sched_type || 5010 team->t.t_sched.chunk != new_icvs->sched.chunk) 5011 team->t.t_sched = new_icvs->sched; 5012 __kmp_reinitialize_team(team, new_icvs, 5013 root->r.r_uber_thread->th.th_ident); 5014 5015 /* update the remaining threads */ 5016 for (f = 0; f < new_nproc; ++f) { 5017 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5018 } 5019 // restore the current task state of the master thread: should be the 5020 // implicit task 5021 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5022 team->t.t_threads[0], team)); 5023 5024 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5025 5026 #ifdef KMP_DEBUG 5027 for (f = 0; f < team->t.t_nproc; f++) { 5028 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5029 team->t.t_threads[f]->th.th_team_nproc == 5030 team->t.t_nproc); 5031 } 5032 #endif 5033 5034 #if OMP_40_ENABLED 5035 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5036 #if KMP_AFFINITY_SUPPORTED 5037 __kmp_partition_places(team); 5038 #endif 5039 #endif 5040 } else { // team->t.t_nproc < new_nproc 5041 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5042 kmp_affin_mask_t *old_mask; 5043 if (KMP_AFFINITY_CAPABLE()) { 5044 KMP_CPU_ALLOC(old_mask); 5045 } 5046 #endif 5047 5048 KA_TRACE(20, 5049 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5050 new_nproc)); 5051 5052 team->t.t_size_changed = 1; 5053 5054 #if KMP_NESTED_HOT_TEAMS 5055 int avail_threads = hot_teams[level].hot_team_nth; 5056 if (new_nproc < avail_threads) 5057 avail_threads = new_nproc; 5058 kmp_info_t **other_threads = team->t.t_threads; 5059 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5060 // Adjust barrier data of reserved threads (if any) of the team 5061 // Other data will be set in __kmp_initialize_info() below. 
5062 int b; 5063 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5064 for (b = 0; b < bs_last_barrier; ++b) { 5065 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5066 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5067 #if USE_DEBUGGER 5068 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5069 #endif 5070 } 5071 } 5072 if (hot_teams[level].hot_team_nth >= new_nproc) { 5073 // we have all needed threads in reserve, no need to allocate any 5074 // this only possible in mode 1, cannot have reserved threads in mode 0 5075 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5076 team->t.t_nproc = new_nproc; // just get reserved threads involved 5077 } else { 5078 // we may have some threads in reserve, but not enough 5079 team->t.t_nproc = 5080 hot_teams[level] 5081 .hot_team_nth; // get reserved threads involved if any 5082 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5083 #endif // KMP_NESTED_HOT_TEAMS 5084 if (team->t.t_max_nproc < new_nproc) { 5085 /* reallocate larger arrays */ 5086 __kmp_reallocate_team_arrays(team, new_nproc); 5087 __kmp_reinitialize_team(team, new_icvs, NULL); 5088 } 5089 5090 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5091 /* Temporarily set full mask for master thread before creation of 5092 workers. The reason is that workers inherit the affinity from master, 5093 so if a lot of workers are created on the single core quickly, they 5094 don't get a chance to set their own affinity for a long time. */ 5095 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5096 #endif 5097 5098 /* allocate new threads for the hot team */ 5099 for (f = team->t.t_nproc; f < new_nproc; f++) { 5100 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5101 KMP_DEBUG_ASSERT(new_worker); 5102 team->t.t_threads[f] = new_worker; 5103 5104 KA_TRACE(20, 5105 ("__kmp_allocate_team: team %d init T#%d arrived: " 5106 "join=%llu, plain=%llu\n", 5107 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5108 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5109 team->t.t_bar[bs_plain_barrier].b_arrived)); 5110 5111 { // Initialize barrier data for new threads. 5112 int b; 5113 kmp_balign_t *balign = new_worker->th.th_bar; 5114 for (b = 0; b < bs_last_barrier; ++b) { 5115 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5116 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5117 KMP_BARRIER_PARENT_FLAG); 5118 #if USE_DEBUGGER 5119 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5120 #endif 5121 } 5122 } 5123 } 5124 5125 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5126 if (KMP_AFFINITY_CAPABLE()) { 5127 /* Restore initial master thread's affinity mask */ 5128 __kmp_set_system_affinity(old_mask, TRUE); 5129 KMP_CPU_FREE(old_mask); 5130 } 5131 #endif 5132 #if KMP_NESTED_HOT_TEAMS 5133 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5134 #endif // KMP_NESTED_HOT_TEAMS 5135 /* make sure everyone is syncronized */ 5136 int old_nproc = team->t.t_nproc; // save old value and use to update only 5137 // new threads below 5138 __kmp_initialize_team(team, new_nproc, new_icvs, 5139 root->r.r_uber_thread->th.th_ident); 5140 5141 /* reinitialize the threads */ 5142 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5143 for (f = 0; f < team->t.t_nproc; ++f) 5144 __kmp_initialize_info(team->t.t_threads[f], team, f, 5145 __kmp_gtid_from_tid(f, team)); 5146 if (level) { // set th_task_state for new threads in nested hot team 5147 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5148 // only need to set the th_task_state for the new threads. th_task_state 5149 // for master thread will not be accurate until after this in 5150 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5151 // correct value. 5152 for (f = old_nproc; f < team->t.t_nproc; ++f) 5153 team->t.t_threads[f]->th.th_task_state = 5154 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5155 } else { // set th_task_state for new threads in non-nested hot team 5156 int old_state = 5157 team->t.t_threads[0]->th.th_task_state; // copy master's state 5158 for (f = old_nproc; f < team->t.t_nproc; ++f) 5159 team->t.t_threads[f]->th.th_task_state = old_state; 5160 } 5161 5162 #ifdef KMP_DEBUG 5163 for (f = 0; f < team->t.t_nproc; ++f) { 5164 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5165 team->t.t_threads[f]->th.th_team_nproc == 5166 team->t.t_nproc); 5167 } 5168 #endif 5169 5170 #if OMP_40_ENABLED 5171 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5172 #if KMP_AFFINITY_SUPPORTED 5173 __kmp_partition_places(team); 5174 #endif 5175 #endif 5176 } // Check changes in number of threads 5177 5178 #if OMP_40_ENABLED 5179 kmp_info_t *master = team->t.t_threads[0]; 5180 if (master->th.th_teams_microtask) { 5181 for (f = 1; f < new_nproc; ++f) { 5182 // propagate teams construct specific info to workers 5183 kmp_info_t *thr = team->t.t_threads[f]; 5184 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5185 thr->th.th_teams_level = master->th.th_teams_level; 5186 thr->th.th_teams_size = master->th.th_teams_size; 5187 } 5188 } 5189 #endif /* OMP_40_ENABLED */ 5190 #if KMP_NESTED_HOT_TEAMS 5191 if (level) { 5192 // Sync barrier state for nested hot teams, not needed for outermost hot 5193 // team. 5194 for (f = 1; f < new_nproc; ++f) { 5195 kmp_info_t *thr = team->t.t_threads[f]; 5196 int b; 5197 kmp_balign_t *balign = thr->th.th_bar; 5198 for (b = 0; b < bs_last_barrier; ++b) { 5199 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5200 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5201 #if USE_DEBUGGER 5202 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5203 #endif 5204 } 5205 } 5206 } 5207 #endif // KMP_NESTED_HOT_TEAMS 5208 5209 /* reallocate space for arguments if necessary */ 5210 __kmp_alloc_argv_entries(argc, team, TRUE); 5211 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5212 // The hot team re-uses the previous task team, 5213 // if untouched during the previous release->gather phase. 
5214 5215 KF_TRACE(10, (" hot_team = %p\n", team)); 5216 5217 #if KMP_DEBUG 5218 if (__kmp_tasking_mode != tskm_immediate_exec) { 5219 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5220 "task_team[1] = %p after reinit\n", 5221 team->t.t_task_team[0], team->t.t_task_team[1])); 5222 } 5223 #endif 5224 5225 #if OMPT_SUPPORT 5226 __ompt_team_assign_id(team, ompt_parallel_data); 5227 #endif 5228 5229 KMP_MB(); 5230 5231 return team; 5232 } 5233 5234 /* next, let's try to take one from the team pool */ 5235 KMP_MB(); 5236 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5237 /* TODO: consider resizing undersized teams instead of reaping them, now 5238 that we have a resizing mechanism */ 5239 if (team->t.t_max_nproc >= max_nproc) { 5240 /* take this team from the team pool */ 5241 __kmp_team_pool = team->t.t_next_pool; 5242 5243 /* setup the team for fresh use */ 5244 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5245 5246 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5247 "task_team[1] %p to NULL\n", 5248 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5249 team->t.t_task_team[0] = NULL; 5250 team->t.t_task_team[1] = NULL; 5251 5252 /* reallocate space for arguments if necessary */ 5253 __kmp_alloc_argv_entries(argc, team, TRUE); 5254 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5255 5256 KA_TRACE( 5257 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5258 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5259 { // Initialize barrier data. 5260 int b; 5261 for (b = 0; b < bs_last_barrier; ++b) { 5262 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5263 #if USE_DEBUGGER 5264 team->t.t_bar[b].b_master_arrived = 0; 5265 team->t.t_bar[b].b_team_arrived = 0; 5266 #endif 5267 } 5268 } 5269 5270 #if OMP_40_ENABLED 5271 team->t.t_proc_bind = new_proc_bind; 5272 #endif 5273 5274 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5275 team->t.t_id)); 5276 5277 #if OMPT_SUPPORT 5278 __ompt_team_assign_id(team, ompt_parallel_data); 5279 #endif 5280 5281 KMP_MB(); 5282 5283 return team; 5284 } 5285 5286 /* reap team if it is too small, then loop back and check the next one */ 5287 // not sure if this is wise, but, will be redone during the hot-teams 5288 // rewrite. 5289 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5290 team = __kmp_reap_team(team); 5291 __kmp_team_pool = team; 5292 } 5293 5294 /* nothing available in the pool, no matter, make a new team! 
*/ 5295 KMP_MB(); 5296 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5297 5298 /* and set it up */ 5299 team->t.t_max_nproc = max_nproc; 5300 /* NOTE well, for some reason allocating one big buffer and dividing it up 5301 seems to really hurt performance a lot on the P4, so, let's not use this */ 5302 __kmp_allocate_team_arrays(team, max_nproc); 5303 5304 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5305 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5306 5307 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5308 "%p to NULL\n", 5309 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5310 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5311 // memory, no need to duplicate 5312 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5313 // memory, no need to duplicate 5314 5315 if (__kmp_storage_map) { 5316 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5317 } 5318 5319 /* allocate space for arguments */ 5320 __kmp_alloc_argv_entries(argc, team, FALSE); 5321 team->t.t_argc = argc; 5322 5323 KA_TRACE(20, 5324 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5325 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5326 { // Initialize barrier data. 5327 int b; 5328 for (b = 0; b < bs_last_barrier; ++b) { 5329 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5330 #if USE_DEBUGGER 5331 team->t.t_bar[b].b_master_arrived = 0; 5332 team->t.t_bar[b].b_team_arrived = 0; 5333 #endif 5334 } 5335 } 5336 5337 #if OMP_40_ENABLED 5338 team->t.t_proc_bind = new_proc_bind; 5339 #endif 5340 5341 #if OMPT_SUPPORT 5342 __ompt_team_assign_id(team, ompt_parallel_data); 5343 team->t.ompt_serialized_team_info = NULL; 5344 #endif 5345 5346 KMP_MB(); 5347 5348 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5349 team->t.t_id)); 5350 5351 return team; 5352 } 5353 5354 /* TODO implement hot-teams at all levels */ 5355 /* TODO implement lazy thread release on demand (disband request) */ 5356 5357 /* free the team. return it to the team pool. release all the threads 5358 * associated with it */ 5359 void __kmp_free_team(kmp_root_t *root, 5360 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5361 int f; 5362 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5363 team->t.t_id)); 5364 5365 /* verify state */ 5366 KMP_DEBUG_ASSERT(root); 5367 KMP_DEBUG_ASSERT(team); 5368 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5369 KMP_DEBUG_ASSERT(team->t.t_threads); 5370 5371 int use_hot_team = team == root->r.r_hot_team; 5372 #if KMP_NESTED_HOT_TEAMS 5373 int level; 5374 kmp_hot_team_ptr_t *hot_teams; 5375 if (master) { 5376 level = team->t.t_active_level - 1; 5377 if (master->th.th_teams_microtask) { // in teams construct? 
5378 if (master->th.th_teams_size.nteams > 1) { 5379 ++level; // level was not increased in teams construct for 5380 // team_of_masters 5381 } 5382 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5383 master->th.th_teams_level == team->t.t_level) { 5384 ++level; // level was not increased in teams construct for 5385 // team_of_workers before the parallel 5386 } // team->t.t_level will be increased inside parallel 5387 } 5388 hot_teams = master->th.th_hot_teams; 5389 if (level < __kmp_hot_teams_max_level) { 5390 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5391 use_hot_team = 1; 5392 } 5393 } 5394 #endif // KMP_NESTED_HOT_TEAMS 5395 5396 /* team is done working */ 5397 TCW_SYNC_PTR(team->t.t_pkfn, 5398 NULL); // Important for Debugging Support Library. 5399 team->t.t_copyin_counter = 0; // init counter for possible reuse 5400 // Do not reset pointer to parent team to NULL for hot teams. 5401 5402 /* if we are non-hot team, release our threads */ 5403 if (!use_hot_team) { 5404 if (__kmp_tasking_mode != tskm_immediate_exec) { 5405 // Wait for threads to reach reapable state 5406 for (f = 1; f < team->t.t_nproc; ++f) { 5407 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5408 kmp_info_t *th = team->t.t_threads[f]; 5409 volatile kmp_uint32 *state = &th->th.th_reap_state; 5410 while (*state != KMP_SAFE_TO_REAP) { 5411 #if KMP_OS_WINDOWS 5412 // On Windows a thread can be killed at any time, check this 5413 DWORD ecode; 5414 if (!__kmp_is_thread_alive(th, &ecode)) { 5415 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5416 break; 5417 } 5418 #endif 5419 // first check if thread is sleeping 5420 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5421 if (fl.is_sleeping()) 5422 fl.resume(__kmp_gtid_from_thread(th)); 5423 KMP_CPU_PAUSE(); 5424 } 5425 } 5426 5427 // Delete task teams 5428 int tt_idx; 5429 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5430 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5431 if (task_team != NULL) { 5432 for (f = 0; f < team->t.t_nproc; 5433 ++f) { // Have all threads unref task teams 5434 team->t.t_threads[f]->th.th_task_team = NULL; 5435 } 5436 KA_TRACE( 5437 20, 5438 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5439 __kmp_get_gtid(), task_team, team->t.t_id)); 5440 #if KMP_NESTED_HOT_TEAMS 5441 __kmp_free_task_team(master, task_team); 5442 #endif 5443 team->t.t_task_team[tt_idx] = NULL; 5444 } 5445 } 5446 } 5447 5448 // Reset pointer to parent team only for non-hot teams. 5449 team->t.t_parent = NULL; 5450 team->t.t_level = 0; 5451 team->t.t_active_level = 0; 5452 5453 /* free the worker threads */ 5454 for (f = 1; f < team->t.t_nproc; ++f) { 5455 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5456 __kmp_free_thread(team->t.t_threads[f]); 5457 team->t.t_threads[f] = NULL; 5458 } 5459 5460 /* put the team back in the team pool */ 5461 /* TODO limit size of team pool, call reap_team if pool too large */ 5462 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5463 __kmp_team_pool = (volatile kmp_team_t *)team; 5464 } 5465 5466 KMP_MB(); 5467 } 5468 5469 /* reap the team. destroy it, reclaim all its resources and free its memory */ 5470 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5471 kmp_team_t *next_pool = team->t.t_next_pool; 5472 5473 KMP_DEBUG_ASSERT(team); 5474 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5475 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5476 KMP_DEBUG_ASSERT(team->t.t_threads); 5477 KMP_DEBUG_ASSERT(team->t.t_argv); 5478 5479 /* TODO clean the threads that are a part of this? 
*/
5480
5481 /* free stuff */
5482 __kmp_free_team_arrays(team);
5483 if (team->t.t_argv != &team->t.t_inline_argv[0])
5484 __kmp_free((void *)team->t.t_argv);
5485 __kmp_free(team);
5486
5487 KMP_MB();
5488 return next_pool;
5489 }
5490
5491 // Free the thread. Don't reap it, just place it on the pool of available
5492 // threads.
5493 //
5494 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
5495 // binding for the affinity mechanism to be useful.
5496 //
5497 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
5498 // However, we want to avoid a potential performance problem by always
5499 // scanning through the list to find the correct point at which to insert
5500 // the thread (potential N**2 behavior). To do this we keep track of the
5501 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
5502 // With single-level parallelism, threads will always be added to the tail
5503 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
5504 // parallelism, all bets are off and we may need to scan through the entire
5505 // free list.
5506 //
5507 // This change also has a potentially large performance benefit, for some
5508 // applications. Previously, as threads were freed from the hot team, they
5509 // would be placed back on the free list in inverse order. If the hot team
5510 // grew back to its original size, then the freed threads would be placed
5511 // back on the hot team in reverse order. This could cause bad cache
5512 // locality problems on programs where the size of the hot team regularly
5513 // grew and shrank.
5514 //
5515 // Now, for single-level parallelism, the OMP tid is always == gtid.
5516 void __kmp_free_thread(kmp_info_t *this_th) {
5517 int gtid;
5518 kmp_info_t **scan;
5519 kmp_root_t *root = this_th->th.th_root;
5520
5521 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
5522 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));
5523
5524 KMP_DEBUG_ASSERT(this_th);
5525
5526 // When moving a thread to the pool, switch it to wait on its own b_go flag,
5527 // and leave it with no team (NULL team).
5528 int b;
5529 kmp_balign_t *balign = this_th->th.th_bar;
5530 for (b = 0; b < bs_last_barrier; ++b) {
5531 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
5532 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
5533 balign[b].bb.team = NULL;
5534 balign[b].bb.leaf_kids = 0;
5535 }
5536 this_th->th.th_task_state = 0;
5537
5538 /* put thread back on the free pool */
5539 TCW_PTR(this_th->th.th_team, NULL);
5540 TCW_PTR(this_th->th.th_root, NULL);
5541 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */
5542
5543 // If the __kmp_thread_pool_insert_pt is already past the new insert
5544 // point, then we need to re-scan the entire list.
5545 gtid = this_th->th.th_info.ds.ds_gtid;
5546 if (__kmp_thread_pool_insert_pt != NULL) {
5547 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
5548 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
5549 __kmp_thread_pool_insert_pt = NULL;
5550 }
5551 }
5552
5553 // Scan down the list to find the place to insert the thread.
5554 // scan is the address of a link in the list, possibly the address of
5555 // __kmp_thread_pool itself.
5556 //
5557 // In the absence of nested parallelism, the for loop will have 0 iterations.
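// The scan below is an insertion into a singly linked list kept sorted by
// ascending gtid. A minimal standalone sketch of the same scheme follows,
// for illustration only and excluded from the build; Node and insert_sorted
// are illustrative names, not part of the runtime.
#if 0
struct Node {
  int gtid;
  Node *next;
};
// 'head' is the address of the list head pointer. The loop walks link
// addresses, so inserting at the head, middle, or tail is the same code.
static void insert_sorted(Node **head, Node *n) {
  Node **scan = head;
  while (*scan != NULL && (*scan)->gtid < n->gtid)
    scan = &(*scan)->next;
  n->next = *scan; // splice in front of the first larger gtid (or NULL)
  *scan = n;
}
// E.g. with gtids {2, 3, 7} in the pool, inserting gtid 5 yields {2, 3, 5, 7}.
#endif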
5558 if (__kmp_thread_pool_insert_pt != NULL) { 5559 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5560 } else { 5561 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5562 } 5563 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5564 scan = &((*scan)->th.th_next_pool)) 5565 ; 5566 5567 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5568 // to its address. 5569 TCW_PTR(this_th->th.th_next_pool, *scan); 5570 __kmp_thread_pool_insert_pt = *scan = this_th; 5571 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5572 (this_th->th.th_info.ds.ds_gtid < 5573 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5574 TCW_4(this_th->th.th_in_pool, TRUE); 5575 __kmp_thread_pool_nth++; 5576 5577 TCW_4(__kmp_nth, __kmp_nth - 1); 5578 root->r.r_cg_nthreads--; 5579 5580 #ifdef KMP_ADJUST_BLOCKTIME 5581 /* Adjust blocktime back to user setting or default if necessary */ 5582 /* Middle initialization might never have occurred */ 5583 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5584 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5585 if (__kmp_nth <= __kmp_avail_proc) { 5586 __kmp_zero_bt = FALSE; 5587 } 5588 } 5589 #endif /* KMP_ADJUST_BLOCKTIME */ 5590 5591 KMP_MB(); 5592 } 5593 5594 /* ------------------------------------------------------------------------ */ 5595 5596 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5597 int gtid = this_thr->th.th_info.ds.ds_gtid; 5598 /* void *stack_data;*/ 5599 kmp_team_t *(*volatile pteam); 5600 5601 KMP_MB(); 5602 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5603 5604 if (__kmp_env_consistency_check) { 5605 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 5606 } 5607 5608 #if OMPT_SUPPORT 5609 ompt_data_t *thread_data; 5610 if (ompt_enabled.enabled) { 5611 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5612 thread_data->ptr = NULL; 5613 5614 this_thr->th.ompt_thread_info.state = omp_state_overhead; 5615 this_thr->th.ompt_thread_info.wait_id = 0; 5616 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5617 if (ompt_enabled.ompt_callback_thread_begin) { 5618 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5619 ompt_thread_worker, thread_data); 5620 } 5621 } 5622 #endif 5623 5624 #if OMPT_SUPPORT 5625 if (ompt_enabled.enabled) { 5626 this_thr->th.ompt_thread_info.state = omp_state_idle; 5627 } 5628 #endif 5629 /* This is the place where threads wait for work */ 5630 while (!TCR_4(__kmp_global.g.g_done)) { 5631 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5632 KMP_MB(); 5633 5634 /* wait for work to do */ 5635 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5636 5637 /* No tid yet since not part of a team */ 5638 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5639 5640 #if OMPT_SUPPORT 5641 if (ompt_enabled.enabled) { 5642 this_thr->th.ompt_thread_info.state = omp_state_overhead; 5643 } 5644 #endif 5645 5646 pteam = (kmp_team_t * (*))(&this_thr->th.th_team); 5647 5648 /* have we been allocated? 
*/ 5649 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5650 /* we were just woken up, so run our new task */ 5651 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5652 int rc; 5653 KA_TRACE(20, 5654 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5655 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5656 (*pteam)->t.t_pkfn)); 5657 5658 updateHWFPControl(*pteam); 5659 5660 #if OMPT_SUPPORT 5661 if (ompt_enabled.enabled) { 5662 this_thr->th.ompt_thread_info.state = omp_state_work_parallel; 5663 } 5664 #endif 5665 5666 { 5667 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 5668 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 5669 rc = (*pteam)->t.t_invoke(gtid); 5670 } 5671 KMP_ASSERT(rc); 5672 5673 KMP_MB(); 5674 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5675 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5676 (*pteam)->t.t_pkfn)); 5677 } 5678 #if OMPT_SUPPORT 5679 if (ompt_enabled.enabled) { 5680 /* no frame set while outside task */ 5681 __ompt_get_task_info_object(0)->frame.exit_runtime_frame = NULL; 5682 5683 this_thr->th.ompt_thread_info.state = omp_state_overhead; 5684 this_thr->th.ompt_thread_info.task_data = *OMPT_CUR_TASK_DATA(this_thr); 5685 } 5686 #endif 5687 /* join barrier after parallel region */ 5688 __kmp_join_barrier(gtid); 5689 } 5690 } 5691 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5692 5693 #if OMPT_SUPPORT 5694 if (ompt_enabled.ompt_callback_thread_end) { 5695 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5696 } 5697 #endif 5698 5699 this_thr->th.th_task_team = NULL; 5700 /* run the destructors for the threadprivate data for this thread */ 5701 __kmp_common_destroy_gtid(gtid); 5702 5703 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5704 KMP_MB(); 5705 return this_thr; 5706 } 5707 5708 /* ------------------------------------------------------------------------ */ 5709 5710 void __kmp_internal_end_dest(void *specific_gtid) { 5711 #if KMP_COMPILER_ICC 5712 #pragma warning(push) 5713 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose 5714 // significant bits 5715 #endif 5716 // Make sure no significant bits are lost 5717 int gtid = (kmp_intptr_t)specific_gtid - 1; 5718 #if KMP_COMPILER_ICC 5719 #pragma warning(pop) 5720 #endif 5721 5722 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5723 /* NOTE: the gtid is stored as gitd+1 in the thread-local-storage 5724 * this is because 0 is reserved for the nothing-stored case */ 5725 5726 /* josh: One reason for setting the gtid specific data even when it is being 5727 destroyed by pthread is to allow gtid lookup through thread specific data 5728 (__kmp_gtid_get_specific). Some of the code, especially stat code, 5729 that gets executed in the call to __kmp_internal_end_thread, actually 5730 gets the gtid through the thread specific data. Setting it here seems 5731 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread 5732 to run smoothly. 5733 todo: get rid of this after we remove the dependence on 5734 __kmp_gtid_get_specific */ 5735 if (gtid >= 0 && KMP_UBER_GTID(gtid)) 5736 __kmp_gtid_set_specific(gtid); 5737 #ifdef KMP_TDATA_GTID 5738 __kmp_gtid = gtid; 5739 #endif 5740 __kmp_internal_end_thread(gtid); 5741 } 5742 5743 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5744 5745 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases 5746 // destructors work perfectly, but in real libomp.so I have no evidence it is 5747 // ever called. 
However, -fini linker option in makefile.mk works fine. 5748 5749 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5750 __kmp_internal_end_atexit(); 5751 } 5752 5753 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); } 5754 5755 #endif 5756 5757 /* [Windows] josh: when the atexit handler is called, there may still be more 5758 than one thread alive */ 5759 void __kmp_internal_end_atexit(void) { 5760 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5761 /* [Windows] 5762 josh: ideally, we want to completely shutdown the library in this atexit 5763 handler, but stat code that depends on thread specific data for gtid fails 5764 because that data becomes unavailable at some point during the shutdown, so 5765 we call __kmp_internal_end_thread instead. We should eventually remove the 5766 dependency on __kmp_get_specific_gtid in the stat code and use 5767 __kmp_internal_end_library to cleanly shutdown the library. 5768 5769 // TODO: Can some of this comment about GVS be removed? 5770 I suspect that the offending stat code is executed when the calling thread 5771 tries to clean up a dead root thread's data structures, resulting in GVS 5772 code trying to close the GVS structures for that thread, but since the stat 5773 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5774 the calling thread is cleaning up itself instead of another thread, it get 5775 confused. This happens because allowing a thread to unregister and cleanup 5776 another thread is a recent modification for addressing an issue. 5777 Based on the current design (20050722), a thread may end up 5778 trying to unregister another thread only if thread death does not trigger 5779 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5780 thread specific data destructor function to detect thread death. For 5781 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5782 is nothing. Thus, the workaround is applicable only for Windows static 5783 stat library. */ 5784 __kmp_internal_end_library(-1); 5785 #if KMP_OS_WINDOWS 5786 __kmp_close_console(); 5787 #endif 5788 } 5789 5790 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5791 // It is assumed __kmp_forkjoin_lock is acquired. 5792 5793 int gtid; 5794 5795 KMP_DEBUG_ASSERT(thread != NULL); 5796 5797 gtid = thread->th.th_info.ds.ds_gtid; 5798 5799 if (!is_root) { 5800 5801 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5802 /* Assume the threads are at the fork barrier here */ 5803 KA_TRACE( 5804 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5805 gtid)); 5806 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5807 * (GEH) */ 5808 ANNOTATE_HAPPENS_BEFORE(thread); 5809 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); 5810 __kmp_release_64(&flag); 5811 } 5812 5813 // Terminate OS thread. 5814 __kmp_reap_worker(thread); 5815 5816 // The thread was killed asynchronously. If it was actively 5817 // spinning in the thread pool, decrement the global count. 5818 // 5819 // There is a small timing hole here - if the worker thread was just waking 5820 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5821 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5822 // the global counter might not get updated. 5823 // 5824 // Currently, this can only happen as the library is unloaded, 5825 // so there are no harmful side effects. 
5826 if (thread->th.th_active_in_pool) { 5827 thread->th.th_active_in_pool = FALSE; 5828 KMP_TEST_THEN_DEC32(&__kmp_thread_pool_active_nth); 5829 KMP_DEBUG_ASSERT(TCR_4(__kmp_thread_pool_active_nth) >= 0); 5830 } 5831 5832 // Decrement # of [worker] threads in the pool. 5833 KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0); 5834 --__kmp_thread_pool_nth; 5835 } 5836 5837 __kmp_free_implicit_task(thread); 5838 5839 // Free the fast memory for tasking 5840 #if USE_FAST_MEMORY 5841 __kmp_free_fast_memory(thread); 5842 #endif /* USE_FAST_MEMORY */ 5843 5844 __kmp_suspend_uninitialize_thread(thread); 5845 5846 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5847 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5848 5849 --__kmp_all_nth; 5850 // __kmp_nth was decremented when thread is added to the pool. 5851 5852 #ifdef KMP_ADJUST_BLOCKTIME 5853 /* Adjust blocktime back to user setting or default if necessary */ 5854 /* Middle initialization might never have occurred */ 5855 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5856 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5857 if (__kmp_nth <= __kmp_avail_proc) { 5858 __kmp_zero_bt = FALSE; 5859 } 5860 } 5861 #endif /* KMP_ADJUST_BLOCKTIME */ 5862 5863 /* free the memory being used */ 5864 if (__kmp_env_consistency_check) { 5865 if (thread->th.th_cons) { 5866 __kmp_free_cons_stack(thread->th.th_cons); 5867 thread->th.th_cons = NULL; 5868 } 5869 } 5870 5871 if (thread->th.th_pri_common != NULL) { 5872 __kmp_free(thread->th.th_pri_common); 5873 thread->th.th_pri_common = NULL; 5874 } 5875 5876 if (thread->th.th_task_state_memo_stack != NULL) { 5877 __kmp_free(thread->th.th_task_state_memo_stack); 5878 thread->th.th_task_state_memo_stack = NULL; 5879 } 5880 5881 #if KMP_USE_BGET 5882 if (thread->th.th_local.bget_data != NULL) { 5883 __kmp_finalize_bget(thread); 5884 } 5885 #endif 5886 5887 #if KMP_AFFINITY_SUPPORTED 5888 if (thread->th.th_affin_mask != NULL) { 5889 KMP_CPU_FREE(thread->th.th_affin_mask); 5890 thread->th.th_affin_mask = NULL; 5891 } 5892 #endif /* KMP_AFFINITY_SUPPORTED */ 5893 5894 __kmp_reap_team(thread->th.th_serial_team); 5895 thread->th.th_serial_team = NULL; 5896 __kmp_free(thread); 5897 5898 KMP_MB(); 5899 5900 } // __kmp_reap_thread 5901 5902 static void __kmp_internal_end(void) { 5903 int i; 5904 5905 /* First, unregister the library */ 5906 __kmp_unregister_library(); 5907 5908 #if KMP_OS_WINDOWS 5909 /* In Win static library, we can't tell when a root actually dies, so we 5910 reclaim the data structures for any root threads that have died but not 5911 unregistered themselves, in order to shut down cleanly. 5912 In Win dynamic library we also can't tell when a thread dies. */ 5913 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 5914 // dead roots 5915 #endif 5916 5917 for (i = 0; i < __kmp_threads_capacity; i++) 5918 if (__kmp_root[i]) 5919 if (__kmp_root[i]->r.r_active) 5920 break; 5921 KMP_MB(); /* Flush all pending memory write invalidates. */ 5922 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 5923 5924 if (i < __kmp_threads_capacity) { 5925 #if KMP_USE_MONITOR 5926 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 5927 KMP_MB(); /* Flush all pending memory write invalidates. */ 5928 5929 // Need to check that monitor was initialized before reaping it. If we are 5930 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 5931 // __kmp_monitor will appear to contain valid data, but it is only valid in 5932 // the parent process, not the child. 
5933 // New behavior (201008): instead of keying off of the flag 5934 // __kmp_init_parallel, the monitor thread creation is keyed off 5935 // of the new flag __kmp_init_monitor. 5936 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 5937 if (TCR_4(__kmp_init_monitor)) { 5938 __kmp_reap_monitor(&__kmp_monitor); 5939 TCW_4(__kmp_init_monitor, 0); 5940 } 5941 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 5942 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 5943 #endif // KMP_USE_MONITOR 5944 } else { 5945 /* TODO move this to cleanup code */ 5946 #ifdef KMP_DEBUG 5947 /* make sure that everything has properly ended */ 5948 for (i = 0; i < __kmp_threads_capacity; i++) { 5949 if (__kmp_root[i]) { 5950 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 5951 // there can be uber threads alive here 5952 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 5953 } 5954 } 5955 #endif 5956 5957 KMP_MB(); 5958 5959 // Reap the worker threads. 5960 // This is valid for now, but be careful if threads are reaped sooner. 5961 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 5962 // Get the next thread from the pool. 5963 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 5964 __kmp_thread_pool = thread->th.th_next_pool; 5965 // Reap it. 5966 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 5967 thread->th.th_next_pool = NULL; 5968 thread->th.th_in_pool = FALSE; 5969 __kmp_reap_thread(thread, 0); 5970 } 5971 __kmp_thread_pool_insert_pt = NULL; 5972 5973 // Reap teams. 5974 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 5975 // Get the next team from the pool. 5976 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 5977 __kmp_team_pool = team->t.t_next_pool; 5978 // Reap it. 5979 team->t.t_next_pool = NULL; 5980 __kmp_reap_team(team); 5981 } 5982 5983 __kmp_reap_task_teams(); 5984 5985 for (i = 0; i < __kmp_threads_capacity; ++i) { 5986 // TBD: Add some checking... 5987 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 5988 } 5989 5990 /* Make sure all threadprivate destructors get run by joining with all 5991 worker threads before resetting this flag */ 5992 TCW_SYNC_4(__kmp_init_common, FALSE); 5993 5994 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 5995 KMP_MB(); 5996 5997 #if KMP_USE_MONITOR 5998 // See note above: One of the possible fixes for CQ138434 / CQ140126 5999 // 6000 // FIXME: push both code fragments down and CSE them? 6001 // push them into __kmp_cleanup() ? 6002 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6003 if (TCR_4(__kmp_init_monitor)) { 6004 __kmp_reap_monitor(&__kmp_monitor); 6005 TCW_4(__kmp_init_monitor, 0); 6006 } 6007 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6008 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6009 #endif 6010 } /* else !__kmp_global.t_active */ 6011 TCW_4(__kmp_init_gtid, FALSE); 6012 KMP_MB(); /* Flush all pending memory write invalidates. */ 6013 6014 __kmp_cleanup(); 6015 #if OMPT_SUPPORT 6016 ompt_fini(); 6017 #endif 6018 } 6019 6020 void __kmp_internal_end_library(int gtid_req) { 6021 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6022 /* this shouldn't be a race condition because __kmp_internal_end() is the 6023 only place to clear __kmp_serial_init */ 6024 /* we'll check this later too, after we get the lock */ 6025 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6026 // redundaant, because the next check will work in any case. 
6027 if (__kmp_global.g.g_abort) { 6028 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6029 /* TODO abort? */ 6030 return; 6031 } 6032 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6033 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6034 return; 6035 } 6036 6037 KMP_MB(); /* Flush all pending memory write invalidates. */ 6038 6039 /* find out who we are and what we should do */ 6040 { 6041 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6042 KA_TRACE( 6043 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6044 if (gtid == KMP_GTID_SHUTDOWN) { 6045 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6046 "already shutdown\n")); 6047 return; 6048 } else if (gtid == KMP_GTID_MONITOR) { 6049 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6050 "registered, or system shutdown\n")); 6051 return; 6052 } else if (gtid == KMP_GTID_DNE) { 6053 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6054 "shutdown\n")); 6055 /* we don't know who we are, but we may still shutdown the library */ 6056 } else if (KMP_UBER_GTID(gtid)) { 6057 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6058 if (__kmp_root[gtid]->r.r_active) { 6059 __kmp_global.g.g_abort = -1; 6060 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6061 KA_TRACE(10, 6062 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6063 gtid)); 6064 return; 6065 } else { 6066 KA_TRACE( 6067 10, 6068 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6069 __kmp_unregister_root_current_thread(gtid); 6070 } 6071 } else { 6072 /* worker threads may call this function through the atexit handler, if they 6073 * call exit() */ 6074 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6075 TODO: do a thorough shutdown instead */ 6076 #ifdef DUMP_DEBUG_ON_EXIT 6077 if (__kmp_debug_buf) 6078 __kmp_dump_debug_buffer(); 6079 #endif 6080 return; 6081 } 6082 } 6083 /* synchronize the termination process */ 6084 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6085 6086 /* have we already finished */ 6087 if (__kmp_global.g.g_abort) { 6088 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6089 /* TODO abort? */ 6090 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6091 return; 6092 } 6093 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6094 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6095 return; 6096 } 6097 6098 /* We need this lock to enforce mutex between this reading of 6099 __kmp_threads_capacity and the writing by __kmp_register_root. 6100 Alternatively, we can use a counter of roots that is atomically updated by 6101 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6102 __kmp_internal_end_*. 
*/ 6103 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6104 6105 /* now we can safely conduct the actual termination */ 6106 __kmp_internal_end(); 6107 6108 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6109 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6110 6111 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6112 6113 #ifdef DUMP_DEBUG_ON_EXIT 6114 if (__kmp_debug_buf) 6115 __kmp_dump_debug_buffer(); 6116 #endif 6117 6118 #if KMP_OS_WINDOWS 6119 __kmp_close_console(); 6120 #endif 6121 6122 __kmp_fini_allocator(); 6123 6124 } // __kmp_internal_end_library 6125 6126 void __kmp_internal_end_thread(int gtid_req) { 6127 int i; 6128 6129 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6130 /* this shouldn't be a race condition because __kmp_internal_end() is the 6131 * only place to clear __kmp_serial_init */ 6132 /* we'll check this later too, after we get the lock */ 6133 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6134 // redundant, because the next check will work in any case. 6135 if (__kmp_global.g.g_abort) { 6136 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6137 /* TODO abort? */ 6138 return; 6139 } 6140 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6141 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6142 return; 6143 } 6144 6145 KMP_MB(); /* Flush all pending memory write invalidates. */ 6146 6147 /* find out who we are and what we should do */ 6148 { 6149 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6150 KA_TRACE(10, 6151 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6152 if (gtid == KMP_GTID_SHUTDOWN) { 6153 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6154 "already shutdown\n")); 6155 return; 6156 } else if (gtid == KMP_GTID_MONITOR) { 6157 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6158 "registered, or system shutdown\n")); 6159 return; 6160 } else if (gtid == KMP_GTID_DNE) { 6161 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6162 "shutdown\n")); 6163 return; 6164 /* we don't know who we are */ 6165 } else if (KMP_UBER_GTID(gtid)) { 6166 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6167 if (__kmp_root[gtid]->r.r_active) { 6168 __kmp_global.g.g_abort = -1; 6169 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6170 KA_TRACE(10, 6171 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6172 gtid)); 6173 return; 6174 } else { 6175 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6176 gtid)); 6177 __kmp_unregister_root_current_thread(gtid); 6178 } 6179 } else { 6180 /* just a worker thread, let's leave */ 6181 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6182 6183 if (gtid >= 0) { 6184 __kmp_threads[gtid]->th.th_task_team = NULL; 6185 } 6186 6187 KA_TRACE(10, 6188 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6189 gtid)); 6190 return; 6191 } 6192 } 6193 #if defined KMP_DYNAMIC_LIB 6194 // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber 6195 // thread, because we will better shutdown later in the library destructor. 6196 // The reason of this change is performance problem when non-openmp thread in 6197 // a loop forks and joins many openmp threads. We can save a lot of time 6198 // keeping worker threads alive until the program shutdown. 
6199 // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) 6200 // and Windows(DPD200287443) that occurs when using critical sections from 6201 // foreign threads. 6202 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6203 return; 6204 #endif 6205 /* synchronize the termination process */ 6206 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6207 6208 /* have we already finished */ 6209 if (__kmp_global.g.g_abort) { 6210 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6211 /* TODO abort? */ 6212 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6213 return; 6214 } 6215 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6216 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6217 return; 6218 } 6219 6220 /* We need this lock to enforce mutex between this reading of 6221 __kmp_threads_capacity and the writing by __kmp_register_root. 6222 Alternatively, we can use a counter of roots that is atomically updated by 6223 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6224 __kmp_internal_end_*. */ 6225 6226 /* should we finish the run-time? are all siblings done? */ 6227 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6228 6229 for (i = 0; i < __kmp_threads_capacity; ++i) { 6230 if (KMP_UBER_GTID(i)) { 6231 KA_TRACE( 6232 10, 6233 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6234 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6235 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6236 return; 6237 } 6238 } 6239 6240 /* now we can safely conduct the actual termination */ 6241 6242 __kmp_internal_end(); 6243 6244 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6245 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6246 6247 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6248 6249 #ifdef DUMP_DEBUG_ON_EXIT 6250 if (__kmp_debug_buf) 6251 __kmp_dump_debug_buffer(); 6252 #endif 6253 } // __kmp_internal_end_thread 6254 6255 // ----------------------------------------------------------------------------- 6256 // Library registration stuff. 6257 6258 static long __kmp_registration_flag = 0; 6259 // Random value used to indicate library initialization. 6260 static char *__kmp_registration_str = NULL; 6261 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6262 6263 static inline char *__kmp_reg_status_name() { 6264 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6265 each thread. If registration and unregistration go in different threads 6266 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6267 env var can not be found, because the name will contain different pid. */ 6268 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6269 } // __kmp_reg_status_get 6270 6271 void __kmp_register_library_startup(void) { 6272 6273 char *name = __kmp_reg_status_name(); // Name of the environment variable. 
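// For illustration only (the concrete values below are made up): the value
// stored under __KMP_REGISTERED_LIB_<pid> has the form
// "<flag address>-<flag value>-<library file>", e.g.
//   0x7f1234567890-cafe1a2b-libomp.so
// A second copy of the runtime that finds this variable splits it on '-'
// and, if the encoded address is still mapped and *address == value,
// concludes that the registered copy is alive.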
6274 int done = 0;
6275 union {
6276 double dtime;
6277 long ltime;
6278 } time;
6279 #if KMP_ARCH_X86 || KMP_ARCH_X86_64
6280 __kmp_initialize_system_tick();
6281 #endif
6282 __kmp_read_system_time(&time.dtime);
6283 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
6284 __kmp_registration_str =
6285 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
6286 __kmp_registration_flag, KMP_LIBRARY_FILE);
6287
6288 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
6289 __kmp_registration_str));
6290
6291 while (!done) {
6292
6293 char *value = NULL; // Actual value of the environment variable.
6294
6295 // Set the environment variable, but do not overwrite it if it already exists.
6296 __kmp_env_set(name, __kmp_registration_str, 0);
6297 // Check that the variable was actually written.
6298 value = __kmp_env_get(name);
6299 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {
6300
6301 done = 1; // Ok, environment variable set successfully, exit the loop.
6302
6303 } else {
6304
6305 // Oops. Write failed. Another copy of the OpenMP RTL is in memory.
6306 // Check whether it is alive or dead.
6307 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
6308 char *tail = value;
6309 char *flag_addr_str = NULL;
6310 char *flag_val_str = NULL;
6311 char const *file_name = NULL;
6312 __kmp_str_split(tail, '-', &flag_addr_str, &tail);
6313 __kmp_str_split(tail, '-', &flag_val_str, &tail);
6314 file_name = tail;
6315 if (tail != NULL) {
6316 long *flag_addr = 0;
6317 long flag_val = 0;
6318 KMP_SSCANF(flag_addr_str, "%p", &flag_addr);
6319 KMP_SSCANF(flag_val_str, "%lx", &flag_val);
6320 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
6321 // First, check whether the environment-encoded address is mapped into
6322 // the address space.
6323 // If so, dereference it to see if it still has the right value.
6324 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
6325 neighbor = 1;
6326 } else {
6327 // If not, then we know the other copy of the library is no longer
6328 // running.
6329 neighbor = 2;
6330 }
6331 }
6332 }
6333 switch (neighbor) {
6334 case 0: // Cannot parse environment variable -- neighbor status unknown.
6335 // Assume it is the incompatible format of a future version of the
6336 // library. Assume the other library is alive.
6337 // WARN( ... ); // TODO: Issue a warning.
6338 file_name = "unknown library";
6339 // Attention! Falling through to the next case. That's intentional.
6340 case 1: { // Neighbor is alive.
6341 // Check whether that is allowed.
6342 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
6343 if (!__kmp_str_match_true(duplicate_ok)) {
6344 // That's not allowed. Issue fatal error.
6345 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
6346 KMP_HNT(DuplicateLibrary), __kmp_msg_null);
6347 }
6348 KMP_INTERNAL_FREE(duplicate_ok);
6349 __kmp_duplicate_library_ok = 1;
6350 done = 1; // Exit the loop.
6351 } break;
6352 case 2: { // Neighbor is dead.
6353 // Clear the variable and try to register the library again.
6354 __kmp_env_unset(name); 6355 } break; 6356 default: { KMP_DEBUG_ASSERT(0); } break; 6357 } 6358 } 6359 KMP_INTERNAL_FREE((void *)value); 6360 } 6361 KMP_INTERNAL_FREE((void *)name); 6362 6363 } // func __kmp_register_library_startup 6364 6365 void __kmp_unregister_library(void) { 6366 6367 char *name = __kmp_reg_status_name(); 6368 char *value = __kmp_env_get(name); 6369 6370 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6371 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6372 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6373 // Ok, this is our variable. Delete it. 6374 __kmp_env_unset(name); 6375 } 6376 6377 KMP_INTERNAL_FREE(__kmp_registration_str); 6378 KMP_INTERNAL_FREE(value); 6379 KMP_INTERNAL_FREE(name); 6380 6381 __kmp_registration_flag = 0; 6382 __kmp_registration_str = NULL; 6383 6384 } // __kmp_unregister_library 6385 6386 // End of Library registration stuff. 6387 // ----------------------------------------------------------------------------- 6388 6389 #if KMP_MIC_SUPPORTED 6390 6391 static void __kmp_check_mic_type() { 6392 kmp_cpuid_t cpuid_state = {0}; 6393 kmp_cpuid_t *cs_p = &cpuid_state; 6394 __kmp_x86_cpuid(1, 0, cs_p); 6395 // We don't support mic1 at the moment 6396 if ((cs_p->eax & 0xff0) == 0xB10) { 6397 __kmp_mic_type = mic2; 6398 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6399 __kmp_mic_type = mic3; 6400 } else { 6401 __kmp_mic_type = non_mic; 6402 } 6403 } 6404 6405 #endif /* KMP_MIC_SUPPORTED */ 6406 6407 static void __kmp_do_serial_initialize(void) { 6408 int i, gtid; 6409 int size; 6410 6411 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6412 6413 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6414 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6415 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6416 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6417 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6418 6419 #if OMPT_SUPPORT 6420 ompt_pre_init(); 6421 #endif 6422 6423 __kmp_validate_locks(); 6424 6425 /* Initialize internal memory allocator */ 6426 __kmp_init_allocator(); 6427 6428 /* Register the library startup via an environment variable and check to see 6429 whether another copy of the library is already registered. 
*/ 6430 6431 __kmp_register_library_startup(); 6432 6433 /* TODO reinitialization of library */ 6434 if (TCR_4(__kmp_global.g.g_done)) { 6435 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6436 } 6437 6438 __kmp_global.g.g_abort = 0; 6439 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6440 6441 /* initialize the locks */ 6442 #if KMP_USE_ADAPTIVE_LOCKS 6443 #if KMP_DEBUG_ADAPTIVE_LOCKS 6444 __kmp_init_speculative_stats(); 6445 #endif 6446 #endif 6447 #if KMP_STATS_ENABLED 6448 __kmp_stats_init(); 6449 #endif 6450 __kmp_init_lock(&__kmp_global_lock); 6451 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6452 __kmp_init_lock(&__kmp_debug_lock); 6453 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6454 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6455 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6456 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6457 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6458 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6459 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6460 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6461 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6462 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6463 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6464 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6465 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6466 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6467 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6468 #if KMP_USE_MONITOR 6469 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6470 #endif 6471 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6472 6473 /* conduct initialization and initial setup of configuration */ 6474 6475 __kmp_runtime_initialize(); 6476 6477 #if KMP_MIC_SUPPORTED 6478 __kmp_check_mic_type(); 6479 #endif 6480 6481 // Some global variable initialization moved here from kmp_env_initialize() 6482 #ifdef KMP_DEBUG 6483 kmp_diag = 0; 6484 #endif 6485 __kmp_abort_delay = 0; 6486 6487 // From __kmp_init_dflt_team_nth() 6488 /* assume the entire machine will be used */ 6489 __kmp_dflt_team_nth_ub = __kmp_xproc; 6490 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6491 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6492 } 6493 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6494 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6495 } 6496 __kmp_max_nth = __kmp_sys_max_nth; 6497 __kmp_cg_max_nth = __kmp_sys_max_nth; 6498 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6499 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6500 __kmp_teams_max_nth = __kmp_sys_max_nth; 6501 } 6502 6503 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6504 // part 6505 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6506 #if KMP_USE_MONITOR 6507 __kmp_monitor_wakeups = 6508 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6509 __kmp_bt_intervals = 6510 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6511 #endif 6512 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6513 __kmp_library = library_throughput; 6514 // From KMP_SCHEDULE initialization 6515 __kmp_static = kmp_sch_static_balanced; 6516 // AC: do not use analytical here, because it is non-monotonous 6517 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6518 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6519 // need to repeat assignment 6520 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6521 // bit control and barrier method control parts 6522 #if KMP_FAST_REDUCTION_BARRIER 6523 #define kmp_reduction_barrier_gather_bb ((int)1) 6524 #define kmp_reduction_barrier_release_bb ((int)1) 6525 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6526 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6527 #endif // KMP_FAST_REDUCTION_BARRIER 6528 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6529 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6530 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6531 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6532 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6533 #if KMP_FAST_REDUCTION_BARRIER 6534 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6535 // lin_64 ): hyper,1 6536 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6537 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6538 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6539 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6540 } 6541 #endif // KMP_FAST_REDUCTION_BARRIER 6542 } 6543 #if KMP_FAST_REDUCTION_BARRIER 6544 #undef kmp_reduction_barrier_release_pat 6545 #undef kmp_reduction_barrier_gather_pat 6546 #undef kmp_reduction_barrier_release_bb 6547 #undef kmp_reduction_barrier_gather_bb 6548 #endif // KMP_FAST_REDUCTION_BARRIER 6549 #if KMP_MIC_SUPPORTED 6550 if (__kmp_mic_type == mic2) { // KNC 6551 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6552 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6553 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6554 1; // forkjoin release 6555 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6556 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6557 } 6558 #if KMP_FAST_REDUCTION_BARRIER 6559 if (__kmp_mic_type == mic2) { // KNC 6560 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6561 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6562 } 6563 #endif // KMP_FAST_REDUCTION_BARRIER 6564 #endif // KMP_MIC_SUPPORTED 6565 6566 // From KMP_CHECKS initialization 6567 #ifdef KMP_DEBUG 6568 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6569 #else 6570 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6571 #endif 6572 6573 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6574 __kmp_foreign_tp = TRUE; 6575 6576 __kmp_global.g.g_dynamic = FALSE; 6577 __kmp_global.g.g_dynamic_mode = dynamic_default; 6578 6579 __kmp_env_initialize(NULL); 6580 6581 // Print all messages in message catalog for testing purposes. 
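/* (Editorial aside, illustrative only.)  The catalog dump that follows is gated purely on the environment; in a debug (KMP_DEBUG) build a run such as

       KMP_DUMP_CATALOG=1 ./app        // "app" is a placeholder program name

   would print every message in the i18n catalog through the code below. __kmp_str_match_true() typically accepts the usual "1"/"on"/"true"/"yes" spellings. */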
6582 #ifdef KMP_DEBUG 6583 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6584 if (__kmp_str_match_true(val)) { 6585 kmp_str_buf_t buffer; 6586 __kmp_str_buf_init(&buffer); 6587 __kmp_i18n_dump_catalog(&buffer); 6588 __kmp_printf("%s", buffer.str); 6589 __kmp_str_buf_free(&buffer); 6590 } 6591 __kmp_env_free(&val); 6592 #endif 6593 6594 __kmp_threads_capacity = 6595 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6596 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6597 __kmp_tp_capacity = __kmp_default_tp_capacity( 6598 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6599 6600 // If the library is shut down properly, both pools must be NULL. Just in 6601 // case, set them to NULL -- some memory may leak, but subsequent code will 6602 // work even if pools are not freed. 6603 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6604 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6605 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6606 __kmp_thread_pool = NULL; 6607 __kmp_thread_pool_insert_pt = NULL; 6608 __kmp_team_pool = NULL; 6609 6610 /* Allocate all of the variable sized records */ 6611 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6612 * expandable */ 6613 /* Since allocation is cache-aligned, just add extra padding at the end */ 6614 size = 6615 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6616 CACHE_LINE; 6617 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6618 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6619 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6620 6621 /* init thread counts */ 6622 KMP_DEBUG_ASSERT(__kmp_all_nth == 6623 0); // Asserts fail if the library is reinitializing and 6624 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6625 __kmp_all_nth = 0; 6626 __kmp_nth = 0; 6627 6628 /* setup the uber master thread and hierarchy */ 6629 gtid = __kmp_register_root(TRUE); 6630 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6631 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6632 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6633 6634 KMP_MB(); /* Flush all pending memory write invalidates. */ 6635 6636 __kmp_common_initialize(); 6637 6638 #if KMP_OS_UNIX 6639 /* invoke the child fork handler */ 6640 __kmp_register_atfork(); 6641 #endif 6642 6643 #if !defined KMP_DYNAMIC_LIB 6644 { 6645 /* Invoke the exit handler when the program finishes, only for static 6646 library. For dynamic library, we already have _fini and DllMain. */ 6647 int rc = atexit(__kmp_internal_end_atexit); 6648 if (rc != 0) { 6649 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6650 __kmp_msg_null); 6651 } 6652 } 6653 #endif 6654 6655 #if KMP_HANDLE_SIGNALS 6656 #if KMP_OS_UNIX 6657 /* NOTE: make sure that this is called before the user installs their own 6658 signal handlers so that the user handlers are called first. this way they 6659 can return false, not call our handler, avoid terminating the library, and 6660 continue execution where they left off. 
*/ 6661 __kmp_install_signals(FALSE); 6662 #endif /* KMP_OS_UNIX */ 6663 #if KMP_OS_WINDOWS 6664 __kmp_install_signals(TRUE); 6665 #endif /* KMP_OS_WINDOWS */ 6666 #endif 6667 6668 /* we have finished the serial initialization */ 6669 __kmp_init_counter++; 6670 6671 __kmp_init_serial = TRUE; 6672 6673 if (__kmp_settings) { 6674 __kmp_env_print(); 6675 } 6676 6677 #if OMP_40_ENABLED 6678 if (__kmp_display_env || __kmp_display_env_verbose) { 6679 __kmp_env_print_2(); 6680 } 6681 #endif // OMP_40_ENABLED 6682 6683 #if OMPT_SUPPORT 6684 ompt_post_init(); 6685 #endif 6686 6687 KMP_MB(); 6688 6689 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6690 } 6691 6692 void __kmp_serial_initialize(void) { 6693 if (__kmp_init_serial) { 6694 return; 6695 } 6696 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6697 if (__kmp_init_serial) { 6698 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6699 return; 6700 } 6701 __kmp_do_serial_initialize(); 6702 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6703 } 6704 6705 static void __kmp_do_middle_initialize(void) { 6706 int i, j; 6707 int prev_dflt_team_nth; 6708 6709 if (!__kmp_init_serial) { 6710 __kmp_do_serial_initialize(); 6711 } 6712 6713 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6714 6715 // Save the previous value for the __kmp_dflt_team_nth so that 6716 // we can avoid some reinitialization if it hasn't changed. 6717 prev_dflt_team_nth = __kmp_dflt_team_nth; 6718 6719 #if KMP_AFFINITY_SUPPORTED 6720 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6721 // number of cores on the machine. 6722 __kmp_affinity_initialize(); 6723 6724 // Run through the __kmp_threads array and set the affinity mask 6725 // for each root thread that is currently registered with the RTL. 6726 for (i = 0; i < __kmp_threads_capacity; i++) { 6727 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6728 __kmp_affinity_set_init_mask(i, TRUE); 6729 } 6730 } 6731 #endif /* KMP_AFFINITY_SUPPORTED */ 6732 6733 KMP_ASSERT(__kmp_xproc > 0); 6734 if (__kmp_avail_proc == 0) { 6735 __kmp_avail_proc = __kmp_xproc; 6736 } 6737 6738 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6739 // correct them now 6740 j = 0; 6741 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6742 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6743 __kmp_avail_proc; 6744 j++; 6745 } 6746 6747 if (__kmp_dflt_team_nth == 0) { 6748 #ifdef KMP_DFLT_NTH_CORES 6749 // Default #threads = #cores 6750 __kmp_dflt_team_nth = __kmp_ncores; 6751 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6752 "__kmp_ncores (%d)\n", 6753 __kmp_dflt_team_nth)); 6754 #else 6755 // Default #threads = #available OS procs 6756 __kmp_dflt_team_nth = __kmp_avail_proc; 6757 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6758 "__kmp_avail_proc(%d)\n", 6759 __kmp_dflt_team_nth)); 6760 #endif /* KMP_DFLT_NTH_CORES */ 6761 } 6762 6763 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 6764 __kmp_dflt_team_nth = KMP_MIN_NTH; 6765 } 6766 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 6767 __kmp_dflt_team_nth = __kmp_sys_max_nth; 6768 } 6769 6770 // There's no harm in continuing if the following check fails, 6771 // but it indicates an error in the previous logic. 
6772 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 6773 6774 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 6775 // Run through the __kmp_threads array and set the num threads icv for each 6776 // root thread that is currently registered with the RTL (which has not 6777 // already explicitly set its nthreads-var with a call to 6778 // omp_set_num_threads()). 6779 for (i = 0; i < __kmp_threads_capacity; i++) { 6780 kmp_info_t *thread = __kmp_threads[i]; 6781 if (thread == NULL) 6782 continue; 6783 if (thread->th.th_current_task->td_icvs.nproc != 0) 6784 continue; 6785 6786 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 6787 } 6788 } 6789 KA_TRACE( 6790 20, 6791 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 6792 __kmp_dflt_team_nth)); 6793 6794 #ifdef KMP_ADJUST_BLOCKTIME 6795 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 6796 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6797 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6798 if (__kmp_nth > __kmp_avail_proc) { 6799 __kmp_zero_bt = TRUE; 6800 } 6801 } 6802 #endif /* KMP_ADJUST_BLOCKTIME */ 6803 6804 /* we have finished middle initialization */ 6805 TCW_SYNC_4(__kmp_init_middle, TRUE); 6806 6807 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 6808 } 6809 6810 void __kmp_middle_initialize(void) { 6811 if (__kmp_init_middle) { 6812 return; 6813 } 6814 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6815 if (__kmp_init_middle) { 6816 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6817 return; 6818 } 6819 __kmp_do_middle_initialize(); 6820 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6821 } 6822 6823 void __kmp_parallel_initialize(void) { 6824 int gtid = __kmp_entry_gtid(); // this might be a new root 6825 6826 /* synchronize parallel initialization (for sibling) */ 6827 if (TCR_4(__kmp_init_parallel)) 6828 return; 6829 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6830 if (TCR_4(__kmp_init_parallel)) { 6831 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6832 return; 6833 } 6834 6835 /* TODO reinitialization after we have already shut down */ 6836 if (TCR_4(__kmp_global.g.g_done)) { 6837 KA_TRACE( 6838 10, 6839 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 6840 __kmp_infinite_loop(); 6841 } 6842 6843 /* jc: The lock __kmp_initz_lock is already held, so calling 6844 __kmp_serial_initialize would cause a deadlock. So we call 6845 __kmp_do_serial_initialize directly. */ 6846 if (!__kmp_init_middle) { 6847 __kmp_do_middle_initialize(); 6848 } 6849 6850 /* begin initialization */ 6851 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 6852 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6853 6854 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6855 // Save the FP control regs. 6856 // Worker threads will set theirs to these values at thread startup. 
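/* (Editorial aside, illustrative only.)  Conceptually the three statements below take a one-time snapshot of the x87 control word and the SSE MXCSR in the initial thread; worker threads later load these values at startup so every thread begins with the same rounding/exception-mask state.  A rough user-level equivalent of the MXCSR half, using a standard intrinsic instead of the runtime's own helpers: */
#if 0 // illustrative sketch, not compiled into the runtime
#include <xmmintrin.h>
static unsigned int example_init_mxcsr; // hypothetical counterpart of __kmp_init_mxcsr
static void example_capture_mxcsr(void) {
  example_init_mxcsr = _mm_getcsr(); // read the current MXCSR
  // the runtime additionally masks the saved value with KMP_X86_MXCSR_MASK,
  // as done right below for __kmp_init_mxcsr
}
#endif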
6857 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 6858 __kmp_store_mxcsr(&__kmp_init_mxcsr); 6859 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 6860 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 6861 6862 #if KMP_OS_UNIX 6863 #if KMP_HANDLE_SIGNALS 6864 /* must be after __kmp_serial_initialize */ 6865 __kmp_install_signals(TRUE); 6866 #endif 6867 #endif 6868 6869 __kmp_suspend_initialize(); 6870 6871 #if defined(USE_LOAD_BALANCE) 6872 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6873 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 6874 } 6875 #else 6876 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6877 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 6878 } 6879 #endif 6880 6881 if (__kmp_version) { 6882 __kmp_print_version_2(); 6883 } 6884 6885 /* we have finished parallel initialization */ 6886 TCW_SYNC_4(__kmp_init_parallel, TRUE); 6887 6888 KMP_MB(); 6889 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 6890 6891 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6892 } 6893 6894 /* ------------------------------------------------------------------------ */ 6895 6896 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6897 kmp_team_t *team) { 6898 kmp_disp_t *dispatch; 6899 6900 KMP_MB(); 6901 6902 /* none of the threads have encountered any constructs, yet. */ 6903 this_thr->th.th_local.this_construct = 0; 6904 #if KMP_CACHE_MANAGE 6905 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 6906 #endif /* KMP_CACHE_MANAGE */ 6907 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 6908 KMP_DEBUG_ASSERT(dispatch); 6909 KMP_DEBUG_ASSERT(team->t.t_dispatch); 6910 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 6911 // this_thr->th.th_info.ds.ds_tid ] ); 6912 6913 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 6914 #if OMP_45_ENABLED 6915 dispatch->th_doacross_buf_idx = 6916 0; /* reset the doacross dispatch buffer counter */ 6917 #endif 6918 if (__kmp_env_consistency_check) 6919 __kmp_push_parallel(gtid, team->t.t_ident); 6920 6921 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6922 } 6923 6924 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6925 kmp_team_t *team) { 6926 if (__kmp_env_consistency_check) 6927 __kmp_pop_parallel(gtid, team->t.t_ident); 6928 6929 __kmp_finish_implicit_task(this_thr); 6930 } 6931 6932 int __kmp_invoke_task_func(int gtid) { 6933 int rc; 6934 int tid = __kmp_tid_from_gtid(gtid); 6935 kmp_info_t *this_thr = __kmp_threads[gtid]; 6936 kmp_team_t *team = this_thr->th.th_team; 6937 6938 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 6939 #if USE_ITT_BUILD 6940 if (__itt_stack_caller_create_ptr) { 6941 __kmp_itt_stack_callee_enter( 6942 (__itt_caller) 6943 team->t.t_stack_id); // inform ittnotify about entering user's code 6944 } 6945 #endif /* USE_ITT_BUILD */ 6946 #if INCLUDE_SSC_MARKS 6947 SSC_MARK_INVOKING(); 6948 #endif 6949 6950 #if OMPT_SUPPORT 6951 void *dummy; 6952 void **exit_runtime_p; 6953 ompt_data_t *my_task_data; 6954 ompt_data_t *my_parallel_data; 6955 int ompt_team_size; 6956 6957 if (ompt_enabled.enabled) { 6958 exit_runtime_p = &(team->t.t_implicit_task_taskdata[tid] 6959 .ompt_task_info.frame.exit_runtime_frame); 6960 } else { 6961 exit_runtime_p = &dummy; 6962 } 6963 6964 my_task_data = 6965 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 6966 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 6967 if (ompt_enabled.ompt_callback_implicit_task) { 6968 ompt_team_size = team->t.t_nproc; 6969 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 6970 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 6971 __kmp_tid_from_gtid(gtid)); 6972 } 6973 #endif 6974 6975 { 6976 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 6977 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 6978 rc = 6979 __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 6980 tid, (int)team->t.t_argc, (void **)team->t.t_argv 6981 #if OMPT_SUPPORT 6982 , 6983 exit_runtime_p 6984 #endif 6985 ); 6986 #if OMPT_SUPPORT 6987 *exit_runtime_p = NULL; 6988 #endif 6989 } 6990 6991 #if USE_ITT_BUILD 6992 if (__itt_stack_caller_create_ptr) { 6993 __kmp_itt_stack_callee_leave( 6994 (__itt_caller) 6995 team->t.t_stack_id); // inform ittnotify about leaving user's code 6996 } 6997 #endif /* USE_ITT_BUILD */ 6998 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 6999 7000 return rc; 7001 } 7002 7003 #if OMP_40_ENABLED 7004 void __kmp_teams_master(int gtid) { 7005 // This routine is called by all master threads in teams construct 7006 kmp_info_t *thr = __kmp_threads[gtid]; 7007 kmp_team_t *team = thr->th.th_team; 7008 ident_t *loc = team->t.t_ident; 7009 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7010 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7011 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7012 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7013 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7014 // Launch league of teams now, but not let workers execute 7015 // (they hang on fork barrier until next parallel) 7016 #if INCLUDE_SSC_MARKS 7017 SSC_MARK_FORKING(); 7018 #endif 7019 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7020 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7021 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7022 #if INCLUDE_SSC_MARKS 7023 SSC_MARK_JOINING(); 7024 #endif 7025 7026 // AC: last parameter "1" eliminates join barrier which won't work because 7027 // worker threads are in a fork barrier waiting for more parallel regions 7028 __kmp_join_call(loc, gtid 7029 #if 
OMPT_SUPPORT 7030 , 7031 fork_context_intel 7032 #endif 7033 , 7034 1); 7035 } 7036 7037 int __kmp_invoke_teams_master(int gtid) { 7038 kmp_info_t *this_thr = __kmp_threads[gtid]; 7039 kmp_team_t *team = this_thr->th.th_team; 7040 #if KMP_DEBUG 7041 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7042 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7043 (void *)__kmp_teams_master); 7044 #endif 7045 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7046 __kmp_teams_master(gtid); 7047 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7048 return 1; 7049 } 7050 #endif /* OMP_40_ENABLED */ 7051 7052 /* this sets the requested number of threads for the next parallel region 7053 encountered by this team. since this should be enclosed in the forkjoin 7054 critical section it should avoid race conditions with assymmetrical nested 7055 parallelism */ 7056 7057 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7058 kmp_info_t *thr = __kmp_threads[gtid]; 7059 7060 if (num_threads > 0) 7061 thr->th.th_set_nproc = num_threads; 7062 } 7063 7064 #if OMP_40_ENABLED 7065 7066 /* this sets the requested number of teams for the teams region and/or 7067 the number of threads for the next parallel region encountered */ 7068 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7069 int num_threads) { 7070 kmp_info_t *thr = __kmp_threads[gtid]; 7071 KMP_DEBUG_ASSERT(num_teams >= 0); 7072 KMP_DEBUG_ASSERT(num_threads >= 0); 7073 7074 if (num_teams == 0) 7075 num_teams = 1; // default number of teams is 1. 7076 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7077 if (!__kmp_reserve_warn) { 7078 __kmp_reserve_warn = 1; 7079 __kmp_msg(kmp_ms_warning, 7080 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7081 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7082 } 7083 num_teams = __kmp_teams_max_nth; 7084 } 7085 // Set number of teams (number of threads in the outer "parallel" of the 7086 // teams) 7087 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7088 7089 // Remember the number of threads for inner parallel regions 7090 if (num_threads == 0) { 7091 if (!TCR_4(__kmp_init_middle)) 7092 __kmp_middle_initialize(); // get __kmp_avail_proc calculated 7093 num_threads = __kmp_avail_proc / num_teams; 7094 if (num_teams * num_threads > __kmp_teams_max_nth) { 7095 // adjust num_threads w/o warning as it is not user setting 7096 num_threads = __kmp_teams_max_nth / num_teams; 7097 } 7098 } else { 7099 if (num_teams * num_threads > __kmp_teams_max_nth) { 7100 int new_threads = __kmp_teams_max_nth / num_teams; 7101 if (!__kmp_reserve_warn) { // user asked for too many threads 7102 __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT 7103 __kmp_msg(kmp_ms_warning, 7104 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7105 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7106 } 7107 num_threads = new_threads; 7108 } 7109 } 7110 thr->th.th_teams_size.nth = num_threads; 7111 } 7112 7113 // Set the proc_bind var to use in the following parallel region. 7114 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7115 kmp_info_t *thr = __kmp_threads[gtid]; 7116 thr->th.th_set_proc_bind = proc_bind; 7117 } 7118 7119 #endif /* OMP_40_ENABLED */ 7120 7121 /* Launch the worker threads into the microtask. 
*/ 7122 7123 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7124 kmp_info_t *this_thr = __kmp_threads[gtid]; 7125 7126 #ifdef KMP_DEBUG 7127 int f; 7128 #endif /* KMP_DEBUG */ 7129 7130 KMP_DEBUG_ASSERT(team); 7131 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7132 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7133 KMP_MB(); /* Flush all pending memory write invalidates. */ 7134 7135 team->t.t_construct = 0; /* no single directives seen yet */ 7136 team->t.t_ordered.dt.t_value = 7137 0; /* thread 0 enters the ordered section first */ 7138 7139 /* Reset the identifiers on the dispatch buffer */ 7140 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7141 if (team->t.t_max_nproc > 1) { 7142 int i; 7143 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7144 team->t.t_disp_buffer[i].buffer_index = i; 7145 #if OMP_45_ENABLED 7146 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7147 #endif 7148 } 7149 } else { 7150 team->t.t_disp_buffer[0].buffer_index = 0; 7151 #if OMP_45_ENABLED 7152 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7153 #endif 7154 } 7155 7156 KMP_MB(); /* Flush all pending memory write invalidates. */ 7157 KMP_ASSERT(this_thr->th.th_team == team); 7158 7159 #ifdef KMP_DEBUG 7160 for (f = 0; f < team->t.t_nproc; f++) { 7161 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7162 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7163 } 7164 #endif /* KMP_DEBUG */ 7165 7166 /* release the worker threads so they may begin working */ 7167 __kmp_fork_barrier(gtid, 0); 7168 } 7169 7170 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7171 kmp_info_t *this_thr = __kmp_threads[gtid]; 7172 7173 KMP_DEBUG_ASSERT(team); 7174 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7175 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7176 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7177 7178 /* Join barrier after fork */ 7179 7180 #ifdef KMP_DEBUG 7181 if (__kmp_threads[gtid] && 7182 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7183 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7184 __kmp_threads[gtid]); 7185 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7186 "team->t.t_nproc=%d\n", 7187 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7188 team->t.t_nproc); 7189 __kmp_print_structure(); 7190 } 7191 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7192 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7193 #endif /* KMP_DEBUG */ 7194 7195 __kmp_join_barrier(gtid); /* wait for everyone */ 7196 #if OMPT_SUPPORT 7197 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7198 if (this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) { 7199 ompt_data_t *tId = OMPT_CUR_TASK_DATA(this_thr); 7200 ompt_data_t *pId = OMPT_CUR_TEAM_DATA(this_thr); 7201 this_thr->th.ompt_thread_info.state = omp_state_overhead; 7202 #if OMPT_OPTIONAL 7203 void *codeptr = NULL; 7204 if (KMP_MASTER_TID(ds_tid) && 7205 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7206 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7207 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7208 7209 if (ompt_enabled.ompt_callback_sync_region_wait) { 7210 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7211 ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr); 7212 } 7213 if (ompt_enabled.ompt_callback_sync_region) { 7214 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7215 ompt_sync_region_barrier, ompt_scope_end, pId, tId, codeptr); 7216 } 7217 #endif 7218 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7219 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7220 ompt_scope_end, NULL, tId, 0, ds_tid); 7221 } 7222 // return to idle state 7223 this_thr->th.ompt_thread_info.state = omp_state_overhead; 7224 } 7225 #endif 7226 7227 KMP_MB(); /* Flush all pending memory write invalidates. */ 7228 KMP_ASSERT(this_thr->th.th_team == team); 7229 } 7230 7231 /* ------------------------------------------------------------------------ */ 7232 7233 #ifdef USE_LOAD_BALANCE 7234 7235 // Return the worker threads actively spinning in the hot team, if we 7236 // are at the outermost level of parallelism. Otherwise, return 0. 7237 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7238 int i; 7239 int retval; 7240 kmp_team_t *hot_team; 7241 7242 if (root->r.r_active) { 7243 return 0; 7244 } 7245 hot_team = root->r.r_hot_team; 7246 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7247 return hot_team->t.t_nproc - 1; // Don't count master thread 7248 } 7249 7250 // Skip the master thread - it is accounted for elsewhere. 7251 retval = 0; 7252 for (i = 1; i < hot_team->t.t_nproc; i++) { 7253 if (hot_team->t.t_threads[i]->th.th_active) { 7254 retval++; 7255 } 7256 } 7257 return retval; 7258 } 7259 7260 // Perform an automatic adjustment to the number of 7261 // threads used by the next parallel region. 
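/* (Editorial summary, not normative.)  In rough pseudo-code the routine below computes

       team_curr_active = pool_active + hot_team_active + 1
       system_active    = max(__kmp_get_load_balance(...), team_curr_active)
       retval           = clamp(__kmp_avail_proc - system_active + team_curr_active,
                                KMP_MIN_NTH, set_nproc)

   i.e. it hands out whatever share of __kmp_avail_proc is not already consumed by other system load, never more than the requested set_nproc and never fewer than KMP_MIN_NTH.  If the load information cannot be read (e.g. /proc is unavailable), it permanently falls back to the thread-limit heuristic. */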
7262 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7263 int retval; 7264 int pool_active; 7265 int hot_team_active; 7266 int team_curr_active; 7267 int system_active; 7268 7269 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7270 set_nproc)); 7271 KMP_DEBUG_ASSERT(root); 7272 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7273 ->th.th_current_task->td_icvs.dynamic == TRUE); 7274 KMP_DEBUG_ASSERT(set_nproc > 1); 7275 7276 if (set_nproc == 1) { 7277 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7278 return 1; 7279 } 7280 7281 // Threads that are active in the thread pool, active in the hot team for this 7282 // particular root (if we are at the outer par level), and the currently 7283 // executing thread (to become the master) are available to add to the new 7284 // team, but are currently contributing to the system load, and must be 7285 // accounted for. 7286 pool_active = TCR_4(__kmp_thread_pool_active_nth); 7287 hot_team_active = __kmp_active_hot_team_nproc(root); 7288 team_curr_active = pool_active + hot_team_active + 1; 7289 7290 // Check the system load. 7291 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7292 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7293 "hot team active = %d\n", 7294 system_active, pool_active, hot_team_active)); 7295 7296 if (system_active < 0) { 7297 // There was an error reading the necessary info from /proc, so use the 7298 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7299 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7300 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7301 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7302 7303 // Make this call behave like the thread limit algorithm. 7304 retval = __kmp_avail_proc - __kmp_nth + 7305 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7306 if (retval > set_nproc) { 7307 retval = set_nproc; 7308 } 7309 if (retval < KMP_MIN_NTH) { 7310 retval = KMP_MIN_NTH; 7311 } 7312 7313 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7314 retval)); 7315 return retval; 7316 } 7317 7318 // There is a slight delay in the load balance algorithm in detecting new 7319 // running procs. The real system load at this instant should be at least as 7320 // large as the #active omp thread that are available to add to the team. 7321 if (system_active < team_curr_active) { 7322 system_active = team_curr_active; 7323 } 7324 retval = __kmp_avail_proc - system_active + team_curr_active; 7325 if (retval > set_nproc) { 7326 retval = set_nproc; 7327 } 7328 if (retval < KMP_MIN_NTH) { 7329 retval = KMP_MIN_NTH; 7330 } 7331 7332 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. 
retval:%d\n", retval)); 7333 return retval; 7334 } // __kmp_load_balance_nproc() 7335 7336 #endif /* USE_LOAD_BALANCE */ 7337 7338 /* ------------------------------------------------------------------------ */ 7339 7340 /* NOTE: this is called with the __kmp_init_lock held */ 7341 void __kmp_cleanup(void) { 7342 int f; 7343 7344 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7345 7346 if (TCR_4(__kmp_init_parallel)) { 7347 #if KMP_HANDLE_SIGNALS 7348 __kmp_remove_signals(); 7349 #endif 7350 TCW_4(__kmp_init_parallel, FALSE); 7351 } 7352 7353 if (TCR_4(__kmp_init_middle)) { 7354 #if KMP_AFFINITY_SUPPORTED 7355 __kmp_affinity_uninitialize(); 7356 #endif /* KMP_AFFINITY_SUPPORTED */ 7357 __kmp_cleanup_hierarchy(); 7358 TCW_4(__kmp_init_middle, FALSE); 7359 } 7360 7361 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7362 7363 if (__kmp_init_serial) { 7364 __kmp_runtime_destroy(); 7365 __kmp_init_serial = FALSE; 7366 } 7367 7368 for (f = 0; f < __kmp_threads_capacity; f++) { 7369 if (__kmp_root[f] != NULL) { 7370 __kmp_free(__kmp_root[f]); 7371 __kmp_root[f] = NULL; 7372 } 7373 } 7374 __kmp_free(__kmp_threads); 7375 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7376 // there is no need in freeing __kmp_root. 7377 __kmp_threads = NULL; 7378 __kmp_root = NULL; 7379 __kmp_threads_capacity = 0; 7380 7381 #if KMP_USE_DYNAMIC_LOCK 7382 __kmp_cleanup_indirect_user_locks(); 7383 #else 7384 __kmp_cleanup_user_locks(); 7385 #endif 7386 7387 #if KMP_AFFINITY_SUPPORTED 7388 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7389 __kmp_cpuinfo_file = NULL; 7390 #endif /* KMP_AFFINITY_SUPPORTED */ 7391 7392 #if KMP_USE_ADAPTIVE_LOCKS 7393 #if KMP_DEBUG_ADAPTIVE_LOCKS 7394 __kmp_print_speculative_stats(); 7395 #endif 7396 #endif 7397 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7398 __kmp_nested_nth.nth = NULL; 7399 __kmp_nested_nth.size = 0; 7400 __kmp_nested_nth.used = 0; 7401 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7402 __kmp_nested_proc_bind.bind_types = NULL; 7403 __kmp_nested_proc_bind.size = 0; 7404 __kmp_nested_proc_bind.used = 0; 7405 7406 __kmp_i18n_catclose(); 7407 7408 #if KMP_STATS_ENABLED 7409 __kmp_stats_fini(); 7410 #endif 7411 7412 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7413 } 7414 7415 /* ------------------------------------------------------------------------ */ 7416 7417 int __kmp_ignore_mppbeg(void) { 7418 char *env; 7419 7420 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7421 if (__kmp_str_match_false(env)) 7422 return FALSE; 7423 } 7424 // By default __kmpc_begin() is no-op. 7425 return TRUE; 7426 } 7427 7428 int __kmp_ignore_mppend(void) { 7429 char *env; 7430 7431 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7432 if (__kmp_str_match_false(env)) 7433 return FALSE; 7434 } 7435 // By default __kmpc_end() is no-op. 
7436 return TRUE; 7437 } 7438 7439 void __kmp_internal_begin(void) { 7440 int gtid; 7441 kmp_root_t *root; 7442 7443 /* this is a very important step as it will register new sibling threads 7444 and assign these new uber threads a new gtid */ 7445 gtid = __kmp_entry_gtid(); 7446 root = __kmp_threads[gtid]->th.th_root; 7447 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7448 7449 if (root->r.r_begin) 7450 return; 7451 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7452 if (root->r.r_begin) { 7453 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7454 return; 7455 } 7456 7457 root->r.r_begin = TRUE; 7458 7459 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7460 } 7461 7462 /* ------------------------------------------------------------------------ */ 7463 7464 void __kmp_user_set_library(enum library_type arg) { 7465 int gtid; 7466 kmp_root_t *root; 7467 kmp_info_t *thread; 7468 7469 /* first, make sure we are initialized so we can get our gtid */ 7470 7471 gtid = __kmp_entry_gtid(); 7472 thread = __kmp_threads[gtid]; 7473 7474 root = thread->th.th_root; 7475 7476 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7477 library_serial)); 7478 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7479 thread */ 7480 KMP_WARNING(SetLibraryIncorrectCall); 7481 return; 7482 } 7483 7484 switch (arg) { 7485 case library_serial: 7486 thread->th.th_set_nproc = 0; 7487 set__nproc(thread, 1); 7488 break; 7489 case library_turnaround: 7490 thread->th.th_set_nproc = 0; 7491 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7492 : __kmp_dflt_team_nth_ub); 7493 break; 7494 case library_throughput: 7495 thread->th.th_set_nproc = 0; 7496 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7497 : __kmp_dflt_team_nth_ub); 7498 break; 7499 default: 7500 KMP_FATAL(UnknownLibraryType, arg); 7501 } 7502 7503 __kmp_aux_set_library(arg); 7504 } 7505 7506 void __kmp_aux_set_stacksize(size_t arg) { 7507 if (!__kmp_init_serial) 7508 __kmp_serial_initialize(); 7509 7510 #if KMP_OS_DARWIN 7511 if (arg & (0x1000 - 1)) { 7512 arg &= ~(0x1000 - 1); 7513 if (arg + 0x1000) /* check for overflow if we round up */ 7514 arg += 0x1000; 7515 } 7516 #endif 7517 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7518 7519 /* only change the default stacksize before the first parallel region */ 7520 if (!TCR_4(__kmp_init_parallel)) { 7521 size_t value = arg; /* argument is in bytes */ 7522 7523 if (value < __kmp_sys_min_stksize) 7524 value = __kmp_sys_min_stksize; 7525 else if (value > KMP_MAX_STKSIZE) 7526 value = KMP_MAX_STKSIZE; 7527 7528 __kmp_stksize = value; 7529 7530 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7531 } 7532 7533 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7534 } 7535 7536 /* set the behaviour of the runtime library */ 7537 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7538 void __kmp_aux_set_library(enum library_type arg) { 7539 __kmp_library = arg; 7540 7541 switch (__kmp_library) { 7542 case library_serial: { 7543 KMP_INFORM(LibraryIsSerial); 7544 (void)__kmp_change_library(TRUE); 7545 } break; 7546 case library_turnaround: 7547 (void)__kmp_change_library(TRUE); 7548 break; 7549 case library_throughput: 7550 (void)__kmp_change_library(FALSE); 7551 break; 7552 default: 7553 KMP_FATAL(UnknownLibraryType, arg); 7554 } 7555 } 7556 7557 /* ------------------------------------------------------------------------ */ 7558 7559 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 7560 int blocktime = arg; /* argument is in milliseconds */ 7561 #if KMP_USE_MONITOR 7562 int bt_intervals; 7563 #endif 7564 int bt_set; 7565 7566 __kmp_save_internal_controls(thread); 7567 7568 /* Normalize and set blocktime for the teams */ 7569 if (blocktime < KMP_MIN_BLOCKTIME) 7570 blocktime = KMP_MIN_BLOCKTIME; 7571 else if (blocktime > KMP_MAX_BLOCKTIME) 7572 blocktime = KMP_MAX_BLOCKTIME; 7573 7574 set__blocktime_team(thread->th.th_team, tid, blocktime); 7575 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 7576 7577 #if KMP_USE_MONITOR 7578 /* Calculate and set blocktime intervals for the teams */ 7579 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 7580 7581 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 7582 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 7583 #endif 7584 7585 /* Set whether blocktime has been set to "TRUE" */ 7586 bt_set = TRUE; 7587 7588 set__bt_set_team(thread->th.th_team, tid, bt_set); 7589 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 7590 #if KMP_USE_MONITOR 7591 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 7592 "bt_intervals=%d, monitor_updates=%d\n", 7593 __kmp_gtid_from_tid(tid, thread->th.th_team), 7594 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 7595 __kmp_monitor_wakeups)); 7596 #else 7597 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 7598 __kmp_gtid_from_tid(tid, thread->th.th_team), 7599 thread->th.th_team->t.t_id, tid, blocktime)); 7600 #endif 7601 } 7602 7603 void __kmp_aux_set_defaults(char const *str, int len) { 7604 if (!__kmp_init_serial) { 7605 __kmp_serial_initialize(); 7606 } 7607 __kmp_env_initialize(str); 7608 7609 if (__kmp_settings 7610 #if OMP_40_ENABLED 7611 || __kmp_display_env || __kmp_display_env_verbose 7612 #endif // OMP_40_ENABLED 7613 ) { 7614 __kmp_env_print(); 7615 } 7616 } // __kmp_aux_set_defaults 7617 7618 /* ------------------------------------------------------------------------ */ 7619 /* internal fast reduction routines */ 7620 7621 PACKED_REDUCTION_METHOD_T 7622 __kmp_determine_reduction_method( 7623 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 7624 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 7625 kmp_critical_name *lck) { 7626 7627 // Default reduction method: critical construct ( lck != NULL, like in current 7628 // PAROPT ) 7629 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 7630 // can be selected by RTL 7631 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 7632 // can be selected by RTL 7633 // Finally, it's up to OpenMP RTL to make a decision on which method to select 7634 // among generated by PAROPT. 
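/* (Editorial summary, not normative.)  The selection below boils down to:

       team_size == 1                        -> empty_reduce_block
       tree method generated, "large" team   -> tree reduce
                                                (+ reduction barrier if enabled)
       atomic method generated               -> atomic_reduce_block
       otherwise                             -> critical_reduce_block

   with the team-size cutoff, the num_vars/reduce_size thresholds and the exact ordering tuned per architecture and OS, and with the KMP_FORCE_REDUCTION override (handled at the end of this routine) applied only when the team is not serialized. */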
7635 7636 PACKED_REDUCTION_METHOD_T retval; 7637 7638 int team_size; 7639 7640 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 7641 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 7642 7643 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 7644 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 7645 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 7646 7647 retval = critical_reduce_block; 7648 7649 // another choice of getting a team size (with one dynamic dereference) is slower 7650 team_size = __kmp_get_team_num_threads(global_tid); 7651 if (team_size == 1) { 7652 7653 retval = empty_reduce_block; 7654 7655 } else { 7656 7657 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 7658 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 7659 7660 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 7661 7662 #if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || \ 7663 KMP_OS_DARWIN 7664 7665 int teamsize_cutoff = 4; 7666 7667 #if KMP_MIC_SUPPORTED 7668 if (__kmp_mic_type != non_mic) { 7669 teamsize_cutoff = 8; 7670 } 7671 #endif 7672 if (tree_available) { 7673 if (team_size <= teamsize_cutoff) { 7674 if (atomic_available) { 7675 retval = atomic_reduce_block; 7676 } 7677 } else { 7678 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 7679 } 7680 } else if (atomic_available) { 7681 retval = atomic_reduce_block; 7682 } 7683 #else 7684 #error "Unknown or unsupported OS" 7685 #endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS || 7686 // KMP_OS_DARWIN 7687 7688 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 7689 7690 #if KMP_OS_LINUX || KMP_OS_WINDOWS 7691 7692 // basic tuning 7693 7694 if (atomic_available) { 7695 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7696 retval = atomic_reduce_block; 7697 } 7698 } // otherwise: use critical section 7699 7700 #elif KMP_OS_DARWIN 7701 7702 if (atomic_available && (num_vars <= 3)) { 7703 retval = atomic_reduce_block; 7704 } else if (tree_available) { 7705 if ((reduce_size > (9 * sizeof(kmp_real64))) && 7706 (reduce_size < (2000 * sizeof(kmp_real64)))) { 7707 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 7708 } 7709 } // otherwise: use critical section 7710 7711 #else 7712 #error "Unknown or unsupported OS" 7713 #endif 7714 7715 #else 7716 #error "Unknown or unsupported architecture" 7717 #endif 7718 } 7719 7720 // KMP_FORCE_REDUCTION 7721 7722 // If the team is serialized (team_size == 1), ignore the forced reduction 7723 // method and stay with the unsynchronized method (empty_reduce_block) 7724 if (__kmp_force_reduction_method != reduction_method_not_defined && 7725 team_size != 1) { 7726 7727 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 7728 7729 int atomic_available, tree_available; 7730 7731 switch ((forced_retval = __kmp_force_reduction_method)) { 7732 case critical_reduce_block: 7733 KMP_ASSERT(lck); // lck should be != 0 7734 break; 7735 7736 case atomic_reduce_block: 7737 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 7738 if (!atomic_available) { 7739 KMP_WARNING(RedMethodNotSupported, "atomic"); 7740 forced_retval = critical_reduce_block; 7741 } 7742 break; 7743 7744 case tree_reduce_block: 7745 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 7746 if (!tree_available) { 7747 KMP_WARNING(RedMethodNotSupported, "tree"); 7748 forced_retval = critical_reduce_block; 7749 } else { 7750 #if KMP_FAST_REDUCTION_BARRIER 7751 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 7752 #endif 7753 } 7754 break; 7755 7756 default: 7757 KMP_ASSERT(0); // "unsupported method specified" 7758 } 7759 7760 retval = forced_retval; 7761 } 7762 7763 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 7764 7765 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 7766 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 7767 7768 return (retval); 7769 } 7770 7771 // this function is for testing set/get/determine reduce method 7772 kmp_int32 __kmp_get_reduce_method(void) { 7773 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 7774 } 7775
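/* (Editorial aside.)  __kmp_serial_initialize(), __kmp_middle_initialize() and __kmp_parallel_initialize() above all follow the same double-checked pattern: test the "already initialized" flag, take __kmp_initz_lock, re-test, then perform the one-time initialization and release the lock.  A minimal stand-alone sketch of that pattern in portable C++ (names are illustrative; the runtime uses its TCR_4/TCW_SYNC_4 macros and bootstrap locks instead of std::atomic and std::mutex): */
#if 0 // illustrative sketch, not compiled into the runtime
#include <atomic>
#include <mutex>

static std::atomic<bool> example_init_done{false};
static std::mutex example_init_lock;

static void example_do_initialize() { /* expensive one-time setup */ }

static void example_initialize() {
  if (example_init_done.load(std::memory_order_acquire))
    return; // fast path: already initialized
  std::lock_guard<std::mutex> guard(example_init_lock);
  if (example_init_done.load(std::memory_order_relaxed))
    return; // another thread completed initialization while we waited
  example_do_initialize();
  example_init_done.store(true, std::memory_order_release);
}
#endif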