/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
#if OMP_50_ENABLED
                                                        "5.0 (201611)";
#elif OMP_45_ENABLED
                                                        "4.5 (201511)";
#elif OMP_40_ENABLED
                                                        "4.0 (201307)";
#else
                                                        "3.1 (201107)";
#endif

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_unregister_library(void); // called by __kmp_internal_end()
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid.
*/ 107 int __kmp_get_global_thread_id() { 108 int i; 109 kmp_info_t **other_threads; 110 size_t stack_data; 111 char *stack_addr; 112 size_t stack_size; 113 char *stack_base; 114 115 KA_TRACE( 116 1000, 117 ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n", 118 __kmp_nth, __kmp_all_nth)); 119 120 /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to 121 a parallel region, made it return KMP_GTID_DNE to force serial_initialize 122 by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee 123 __kmp_init_gtid for this to work. */ 124 125 if (!TCR_4(__kmp_init_gtid)) 126 return KMP_GTID_DNE; 127 128 #ifdef KMP_TDATA_GTID 129 if (TCR_4(__kmp_gtid_mode) >= 3) { 130 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n")); 131 return __kmp_gtid; 132 } 133 #endif 134 if (TCR_4(__kmp_gtid_mode) >= 2) { 135 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n")); 136 return __kmp_gtid_get_specific(); 137 } 138 KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n")); 139 140 stack_addr = (char *)&stack_data; 141 other_threads = __kmp_threads; 142 143 /* ATT: The code below is a source of potential bugs due to unsynchronized 144 access to __kmp_threads array. For example: 145 1. Current thread loads other_threads[i] to thr and checks it, it is 146 non-NULL. 147 2. Current thread is suspended by OS. 148 3. Another thread unregisters and finishes (debug versions of free() 149 may fill memory with something like 0xEF). 150 4. Current thread is resumed. 151 5. Current thread reads junk from *thr. 152 TODO: Fix it. --ln */ 153 154 for (i = 0; i < __kmp_threads_capacity; i++) { 155 156 kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]); 157 if (!thr) 158 continue; 159 160 stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize); 161 stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase); 162 163 /* stack grows down -- search through all of the active threads */ 164 165 if (stack_addr <= stack_base) { 166 size_t stack_diff = stack_base - stack_addr; 167 168 if (stack_diff <= stack_size) { 169 /* The only way we can be closer than the allocated */ 170 /* stack size is if we are running on this thread. */ 171 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i); 172 return i; 173 } 174 } 175 } 176 177 /* get specific to try and determine our gtid */ 178 KA_TRACE(1000, 179 ("*** __kmp_get_global_thread_id: internal alg. 
failed to find " 180 "thread, using TLS\n")); 181 i = __kmp_gtid_get_specific(); 182 183 /*fprintf( stderr, "=== %d\n", i ); */ /* GROO */ 184 185 /* if we havn't been assigned a gtid, then return code */ 186 if (i < 0) 187 return i; 188 189 /* dynamically updated stack window for uber threads to avoid get_specific 190 call */ 191 if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) { 192 KMP_FATAL(StackOverflow, i); 193 } 194 195 stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 196 if (stack_addr > stack_base) { 197 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr); 198 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 199 other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr - 200 stack_base); 201 } else { 202 TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize, 203 stack_base - stack_addr); 204 } 205 206 /* Reprint stack bounds for ubermaster since they have been refined */ 207 if (__kmp_storage_map) { 208 char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase; 209 char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize; 210 __kmp_print_storage_map_gtid(i, stack_beg, stack_end, 211 other_threads[i]->th.th_info.ds.ds_stacksize, 212 "th_%d stack (refinement)", i); 213 } 214 return i; 215 } 216 217 int __kmp_get_global_thread_id_reg() { 218 int gtid; 219 220 if (!__kmp_init_serial) { 221 gtid = KMP_GTID_DNE; 222 } else 223 #ifdef KMP_TDATA_GTID 224 if (TCR_4(__kmp_gtid_mode) >= 3) { 225 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n")); 226 gtid = __kmp_gtid; 227 } else 228 #endif 229 if (TCR_4(__kmp_gtid_mode) >= 2) { 230 KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n")); 231 gtid = __kmp_gtid_get_specific(); 232 } else { 233 KA_TRACE(1000, 234 ("*** __kmp_get_global_thread_id_reg: using internal alg.\n")); 235 gtid = __kmp_get_global_thread_id(); 236 } 237 238 /* we must be a new uber master sibling thread */ 239 if (gtid == KMP_GTID_DNE) { 240 KA_TRACE(10, 241 ("__kmp_get_global_thread_id_reg: Encountered new root thread. " 242 "Registering a new gtid.\n")); 243 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 244 if (!__kmp_init_serial) { 245 __kmp_do_serial_initialize(); 246 gtid = __kmp_gtid_get_specific(); 247 } else { 248 gtid = __kmp_register_root(FALSE); 249 } 250 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 251 /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */ 252 } 253 254 KMP_DEBUG_ASSERT(gtid >= 0); 255 256 return gtid; 257 } 258 259 /* caller must hold forkjoin_lock */ 260 void __kmp_check_stack_overlap(kmp_info_t *th) { 261 int f; 262 char *stack_beg = NULL; 263 char *stack_end = NULL; 264 int gtid; 265 266 KA_TRACE(10, ("__kmp_check_stack_overlap: called\n")); 267 if (__kmp_storage_map) { 268 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 269 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 270 271 gtid = __kmp_gtid_from_thread(th); 272 273 if (gtid == KMP_GTID_MONITOR) { 274 __kmp_print_storage_map_gtid( 275 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 276 "th_%s stack (%s)", "mon", 277 (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual"); 278 } else { 279 __kmp_print_storage_map_gtid( 280 gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize, 281 "th_%d stack (%s)", gtid, 282 (th->th.th_info.ds.ds_stackgrow) ? 
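/* Editor's note: __kmp_get_global_thread_id() above falls back to identifying
   the calling thread by checking whether the address of one of its locals lies
   inside a registered thread's stack (stacks grow down, so the test is
   0 <= stack_base - addr <= stack_size).  A minimal sketch of that containment
   test, with hypothetical names; it is not part of the runtime. */
#if 0 /* illustrative sketch only; not compiled */
#include <stddef.h>

typedef struct {
  char *stack_base; /* highest address of the thread's stack */
  size_t stack_size;
} example_thread_desc_t;

/* Return the index of the descriptor whose stack contains 'addr', or -1 if
   none does (the caller would then fall back to thread-local storage). */
static int example_gtid_from_stack(example_thread_desc_t *thr, int nthr,
                                   char *addr) {
  for (int i = 0; i < nthr; ++i) {
    if (addr <= thr[i].stack_base &&
        (size_t)(thr[i].stack_base - addr) <= thr[i].stack_size)
      return i;
  }
  return -1;
}

/* Usage: pass the address of any local variable of the calling thread, e.g.
   size_t probe; int gtid = example_gtid_from_stack(tbl, n, (char *)&probe); */
#endif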
"initial" : "actual"); 283 } 284 } 285 286 /* No point in checking ubermaster threads since they use refinement and 287 * cannot overlap */ 288 gtid = __kmp_gtid_from_thread(th); 289 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 290 KA_TRACE(10, 291 ("__kmp_check_stack_overlap: performing extensive checking\n")); 292 if (stack_beg == NULL) { 293 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 294 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 295 } 296 297 for (f = 0; f < __kmp_threads_capacity; f++) { 298 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 299 300 if (f_th && f_th != th) { 301 char *other_stack_end = 302 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 303 char *other_stack_beg = 304 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 305 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 306 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 307 308 /* Print the other stack values before the abort */ 309 if (__kmp_storage_map) 310 __kmp_print_storage_map_gtid( 311 -1, other_stack_beg, other_stack_end, 312 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 313 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 314 315 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 316 __kmp_msg_null); 317 } 318 } 319 } 320 } 321 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 322 } 323 324 /* ------------------------------------------------------------------------ */ 325 326 void __kmp_infinite_loop(void) { 327 static int done = FALSE; 328 329 while (!done) { 330 KMP_YIELD(1); 331 } 332 } 333 334 #define MAX_MESSAGE 512 335 336 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 337 char const *format, ...) { 338 char buffer[MAX_MESSAGE]; 339 va_list ap; 340 341 va_start(ap, format); 342 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 343 p2, (unsigned long)size, format); 344 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 345 __kmp_vprintf(kmp_err, buffer, ap); 346 #if KMP_PRINT_DATA_PLACEMENT 347 int node; 348 if (gtid >= 0) { 349 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 350 if (__kmp_storage_map_verbose) { 351 node = __kmp_get_host_node(p1); 352 if (node < 0) /* doesn't work, so don't try this next time */ 353 __kmp_storage_map_verbose = FALSE; 354 else { 355 char *last; 356 int lastNode; 357 int localProc = __kmp_get_cpu_from_gtid(gtid); 358 359 const int page_size = KMP_GET_PAGE_SIZE(); 360 361 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 362 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 363 if (localProc >= 0) 364 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 365 localProc >> 1); 366 else 367 __kmp_printf_no_lock(" GTID %d\n", gtid); 368 #if KMP_USE_PRCTL 369 /* The more elaborate format is disabled for now because of the prctl 370 * hanging bug. */ 371 do { 372 last = p1; 373 lastNode = node; 374 /* This loop collates adjacent pages with the same host node. 
*/ 375 do { 376 (char *)p1 += page_size; 377 } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode); 378 __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1, 379 lastNode); 380 } while (p1 <= p2); 381 #else 382 __kmp_printf_no_lock(" %p-%p memNode %d\n", p1, 383 (char *)p1 + (page_size - 1), 384 __kmp_get_host_node(p1)); 385 if (p1 < p2) { 386 __kmp_printf_no_lock(" %p-%p memNode %d\n", p2, 387 (char *)p2 + (page_size - 1), 388 __kmp_get_host_node(p2)); 389 } 390 #endif 391 } 392 } 393 } else 394 __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning)); 395 } 396 #endif /* KMP_PRINT_DATA_PLACEMENT */ 397 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 398 } 399 400 void __kmp_warn(char const *format, ...) { 401 char buffer[MAX_MESSAGE]; 402 va_list ap; 403 404 if (__kmp_generate_warnings == kmp_warnings_off) { 405 return; 406 } 407 408 va_start(ap, format); 409 410 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format); 411 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 412 __kmp_vprintf(kmp_err, buffer, ap); 413 __kmp_release_bootstrap_lock(&__kmp_stdio_lock); 414 415 va_end(ap); 416 } 417 418 void __kmp_abort_process() { 419 // Later threads may stall here, but that's ok because abort() will kill them. 420 __kmp_acquire_bootstrap_lock(&__kmp_exit_lock); 421 422 if (__kmp_debug_buf) { 423 __kmp_dump_debug_buffer(); 424 } 425 426 if (KMP_OS_WINDOWS) { 427 // Let other threads know of abnormal termination and prevent deadlock 428 // if abort happened during library initialization or shutdown 429 __kmp_global.g.g_abort = SIGABRT; 430 431 /* On Windows* OS by default abort() causes pop-up error box, which stalls 432 nightly testing. Unfortunately, we cannot reliably suppress pop-up error 433 boxes. _set_abort_behavior() works well, but this function is not 434 available in VS7 (this is not problem for DLL, but it is a problem for 435 static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not 436 help, at least in some versions of MS C RTL. 437 438 It seems following sequence is the only way to simulate abort() and 439 avoid pop-up error box. */ 440 raise(SIGABRT); 441 _exit(3); // Just in case, if signal ignored, exit anyway. 442 } else { 443 abort(); 444 } 445 446 __kmp_infinite_loop(); 447 __kmp_release_bootstrap_lock(&__kmp_exit_lock); 448 449 } // __kmp_abort_process 450 451 void __kmp_abort_thread(void) { 452 // TODO: Eliminate g_abort global variable and this function. 453 // In case of abort just call abort(), it will kill all the threads. 454 __kmp_infinite_loop(); 455 } // __kmp_abort_thread 456 457 /* Print out the storage map for the major kmp_info_t thread data structures 458 that are allocated together. 
*/ 459 460 static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) { 461 __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d", 462 gtid); 463 464 __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team, 465 sizeof(kmp_desc_t), "th_%d.th_info", gtid); 466 467 __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head, 468 sizeof(kmp_local_t), "th_%d.th_local", gtid); 469 470 __kmp_print_storage_map_gtid( 471 gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier], 472 sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid); 473 474 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier], 475 &thr->th.th_bar[bs_plain_barrier + 1], 476 sizeof(kmp_balign_t), "th_%d.th_bar[plain]", 477 gtid); 478 479 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier], 480 &thr->th.th_bar[bs_forkjoin_barrier + 1], 481 sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]", 482 gtid); 483 484 #if KMP_FAST_REDUCTION_BARRIER 485 __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier], 486 &thr->th.th_bar[bs_reduction_barrier + 1], 487 sizeof(kmp_balign_t), "th_%d.th_bar[reduction]", 488 gtid); 489 #endif // KMP_FAST_REDUCTION_BARRIER 490 } 491 492 /* Print out the storage map for the major kmp_team_t team data structures 493 that are allocated together. */ 494 495 static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team, 496 int team_id, int num_thr) { 497 int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2; 498 __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d", 499 header, team_id); 500 501 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0], 502 &team->t.t_bar[bs_last_barrier], 503 sizeof(kmp_balign_team_t) * bs_last_barrier, 504 "%s_%d.t_bar", header, team_id); 505 506 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier], 507 &team->t.t_bar[bs_plain_barrier + 1], 508 sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]", 509 header, team_id); 510 511 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier], 512 &team->t.t_bar[bs_forkjoin_barrier + 1], 513 sizeof(kmp_balign_team_t), 514 "%s_%d.t_bar[forkjoin]", header, team_id); 515 516 #if KMP_FAST_REDUCTION_BARRIER 517 __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier], 518 &team->t.t_bar[bs_reduction_barrier + 1], 519 sizeof(kmp_balign_team_t), 520 "%s_%d.t_bar[reduction]", header, team_id); 521 #endif // KMP_FAST_REDUCTION_BARRIER 522 523 __kmp_print_storage_map_gtid( 524 -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr], 525 sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id); 526 527 __kmp_print_storage_map_gtid( 528 -1, &team->t.t_threads[0], &team->t.t_threads[num_thr], 529 sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id); 530 531 __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0], 532 &team->t.t_disp_buffer[num_disp_buff], 533 sizeof(dispatch_shared_info_t) * num_disp_buff, 534 "%s_%d.t_disp_buffer", header, team_id); 535 536 __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data, 537 sizeof(kmp_taskq_t), "%s_%d.t_taskq", header, 538 team_id); 539 } 540 541 static void __kmp_init_allocator() { 542 #if OMP_50_ENABLED 543 __kmp_init_memkind(); 544 #endif 545 } 546 static void __kmp_fini_allocator() { 547 #if OMP_50_ENABLED 548 __kmp_fini_memkind(); 549 #endif 550 } 551 552 /* ------------------------------------------------------------------------ */ 553 554 #if 
KMP_DYNAMIC_LIB 555 #if KMP_OS_WINDOWS 556 557 static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) { 558 // TODO: Change to __kmp_break_bootstrap_lock(). 559 __kmp_init_bootstrap_lock(lck); // make the lock released 560 } 561 562 static void __kmp_reset_locks_on_process_detach(int gtid_req) { 563 int i; 564 int thread_count; 565 566 // PROCESS_DETACH is expected to be called by a thread that executes 567 // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one 568 // calling ProcessExit or FreeLibrary). So, it might be safe to access the 569 // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some 570 // threads can be still alive here, although being about to be terminated. The 571 // threads in the array with ds_thread==0 are most suspicious. Actually, it 572 // can be not safe to access the __kmp_threads[]. 573 574 // TODO: does it make sense to check __kmp_roots[] ? 575 576 // Let's check that there are no other alive threads registered with the OMP 577 // lib. 578 while (1) { 579 thread_count = 0; 580 for (i = 0; i < __kmp_threads_capacity; ++i) { 581 if (!__kmp_threads) 582 continue; 583 kmp_info_t *th = __kmp_threads[i]; 584 if (th == NULL) 585 continue; 586 int gtid = th->th.th_info.ds.ds_gtid; 587 if (gtid == gtid_req) 588 continue; 589 if (gtid < 0) 590 continue; 591 DWORD exit_val; 592 int alive = __kmp_is_thread_alive(th, &exit_val); 593 if (alive) { 594 ++thread_count; 595 } 596 } 597 if (thread_count == 0) 598 break; // success 599 } 600 601 // Assume that I'm alone. Now it might be safe to check and reset locks. 602 // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset. 603 __kmp_reset_lock(&__kmp_forkjoin_lock); 604 #ifdef KMP_DEBUG 605 __kmp_reset_lock(&__kmp_stdio_lock); 606 #endif // KMP_DEBUG 607 } 608 609 BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) { 610 //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock ); 611 612 switch (fdwReason) { 613 614 case DLL_PROCESS_ATTACH: 615 KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n")); 616 617 return TRUE; 618 619 case DLL_PROCESS_DETACH: 620 KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific())); 621 622 if (lpReserved != NULL) { 623 // lpReserved is used for telling the difference: 624 // lpReserved == NULL when FreeLibrary() was called, 625 // lpReserved != NULL when the process terminates. 626 // When FreeLibrary() is called, worker threads remain alive. So they will 627 // release the forkjoin lock by themselves. When the process terminates, 628 // worker threads disappear triggering the problem of unreleased forkjoin 629 // lock as described below. 630 631 // A worker thread can take the forkjoin lock. The problem comes up if 632 // that worker thread becomes dead before it releases the forkjoin lock. 633 // The forkjoin lock remains taken, while the thread executing 634 // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try 635 // to take the forkjoin lock and will always fail, so that the application 636 // will never finish [normally]. This scenario is possible if 637 // __kmpc_end() has not been executed. It looks like it's not a corner 638 // case, but common cases: 639 // - the main function was compiled by an alternative compiler; 640 // - the main function was compiled by icl but without /Qopenmp 641 // (application with plugins); 642 // - application terminates by calling C exit(), Fortran CALL EXIT() or 643 // Fortran STOP. 644 // - alive foreign thread prevented __kmpc_end from doing cleanup. 
645 // 646 // This is a hack to work around the problem. 647 // TODO: !!! figure out something better. 648 __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific()); 649 } 650 651 __kmp_internal_end_library(__kmp_gtid_get_specific()); 652 653 return TRUE; 654 655 case DLL_THREAD_ATTACH: 656 KA_TRACE(10, ("DllMain: THREAD_ATTACH\n")); 657 658 /* if we want to register new siblings all the time here call 659 * __kmp_get_gtid(); */ 660 return TRUE; 661 662 case DLL_THREAD_DETACH: 663 KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific())); 664 665 __kmp_internal_end_thread(__kmp_gtid_get_specific()); 666 return TRUE; 667 } 668 669 return TRUE; 670 } 671 672 #endif /* KMP_OS_WINDOWS */ 673 #endif /* KMP_DYNAMIC_LIB */ 674 675 /* Change the library type to "status" and return the old type */ 676 /* called from within initialization routines where __kmp_initz_lock is held */ 677 int __kmp_change_library(int status) { 678 int old_status; 679 680 old_status = __kmp_yield_init & 681 1; // check whether KMP_LIBRARY=throughput (even init count) 682 683 if (status) { 684 __kmp_yield_init |= 1; // throughput => turnaround (odd init count) 685 } else { 686 __kmp_yield_init &= ~1; // turnaround => throughput (even init count) 687 } 688 689 return old_status; // return previous setting of whether 690 // KMP_LIBRARY=throughput 691 } 692 693 /* __kmp_parallel_deo -- Wait until it's our turn. */ 694 void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 695 int gtid = *gtid_ref; 696 #ifdef BUILD_PARALLEL_ORDERED 697 kmp_team_t *team = __kmp_team_from_gtid(gtid); 698 #endif /* BUILD_PARALLEL_ORDERED */ 699 700 if (__kmp_env_consistency_check) { 701 if (__kmp_threads[gtid]->th.th_root->r.r_active) 702 #if KMP_USE_DYNAMIC_LOCK 703 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0); 704 #else 705 __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL); 706 #endif 707 } 708 #ifdef BUILD_PARALLEL_ORDERED 709 if (!team->t.t_serialized) { 710 KMP_MB(); 711 KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid), 712 KMP_EQ, NULL); 713 KMP_MB(); 714 } 715 #endif /* BUILD_PARALLEL_ORDERED */ 716 } 717 718 /* __kmp_parallel_dxo -- Signal the next task. */ 719 void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) { 720 int gtid = *gtid_ref; 721 #ifdef BUILD_PARALLEL_ORDERED 722 int tid = __kmp_tid_from_gtid(gtid); 723 kmp_team_t *team = __kmp_team_from_gtid(gtid); 724 #endif /* BUILD_PARALLEL_ORDERED */ 725 726 if (__kmp_env_consistency_check) { 727 if (__kmp_threads[gtid]->th.th_root->r.r_active) 728 __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref); 729 } 730 #ifdef BUILD_PARALLEL_ORDERED 731 if (!team->t.t_serialized) { 732 KMP_MB(); /* Flush all pending memory write invalidates. */ 733 734 /* use the tid of the next thread in this team */ 735 /* TODO replace with general release procedure */ 736 team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc); 737 738 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 739 } 740 #endif /* BUILD_PARALLEL_ORDERED */ 741 } 742 743 /* ------------------------------------------------------------------------ */ 744 /* The BARRIER for a SINGLE process section is always explicit */ 745 746 int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) { 747 int status; 748 kmp_info_t *th; 749 kmp_team_t *team; 750 751 if (!TCR_4(__kmp_init_parallel)) 752 __kmp_parallel_initialize(); 753 754 #if OMP_50_ENABLED 755 __kmp_resume_if_soft_paused(); 756 #endif 757 758 th = __kmp_threads[gtid]; 759 team = th->th.th_team; 760 status = 0; 761 762 th->th.th_ident = id_ref; 763 764 if (team->t.t_serialized) { 765 status = 1; 766 } else { 767 kmp_int32 old_this = th->th.th_local.this_construct; 768 769 ++th->th.th_local.this_construct; 770 /* try to set team count to thread count--success means thread got the 771 single block */ 772 /* TODO: Should this be acquire or release? */ 773 if (team->t.t_construct == old_this) { 774 status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this, 775 th->th.th_local.this_construct); 776 } 777 #if USE_ITT_BUILD 778 if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 && 779 KMP_MASTER_GTID(gtid) && 780 #if OMP_40_ENABLED 781 th->th.th_teams_microtask == NULL && 782 #endif 783 team->t.t_active_level == 784 1) { // Only report metadata by master of active team at level 1 785 __kmp_itt_metadata_single(id_ref); 786 } 787 #endif /* USE_ITT_BUILD */ 788 } 789 790 if (__kmp_env_consistency_check) { 791 if (status && push_ws) { 792 __kmp_push_workshare(gtid, ct_psingle, id_ref); 793 } else { 794 __kmp_check_workshare(gtid, ct_psingle, id_ref); 795 } 796 } 797 #if USE_ITT_BUILD 798 if (status) { 799 __kmp_itt_single_start(gtid); 800 } 801 #endif /* USE_ITT_BUILD */ 802 return status; 803 } 804 805 void __kmp_exit_single(int gtid) { 806 #if USE_ITT_BUILD 807 __kmp_itt_single_end(gtid); 808 #endif /* USE_ITT_BUILD */ 809 if (__kmp_env_consistency_check) 810 __kmp_pop_workshare(gtid, ct_psingle, NULL); 811 } 812 813 /* determine if we can go parallel or must use a serialized parallel region and 814 * how many threads we can use 815 * set_nproc is the number of threads requested for the team 816 * returns 0 if we should serialize or only use one thread, 817 * otherwise the number of threads to use 818 * The forkjoin lock is held by the caller. */ 819 static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team, 820 int master_tid, int set_nthreads 821 #if OMP_40_ENABLED 822 , 823 int enter_teams 824 #endif /* OMP_40_ENABLED */ 825 ) { 826 int capacity; 827 int new_nthreads; 828 KMP_DEBUG_ASSERT(__kmp_init_serial); 829 KMP_DEBUG_ASSERT(root && parent_team); 830 831 // If dyn-var is set, dynamically adjust the number of desired threads, 832 // according to the method specified by dynamic_mode. 
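/* Editor's note: the chain below selects new_nthreads according to
   __kmp_global.g.g_dynamic_mode.  A sketch of the two simplest heuristics in
   isolation (hypothetical names, not the runtime's API): */
#if 0 /* illustrative sketch only; not compiled */
/* "thread limit" mode: stay within the available processors, crediting back
   the threads this root already contributes. */
static int example_limit_by_avail_proc(int requested, int avail_proc,
                                       int threads_in_use, int root_contrib) {
  int n = avail_proc - threads_in_use + root_contrib;
  if (n < 1)
    n = 1; /* the master always runs */
  return n < requested ? n : requested;
}

/* "random" mode: for requests larger than 2, pick a size in [1, requested]. */
static int example_random_team_size(unsigned random_draw, int requested) {
  if (requested <= 2)
    return requested;
  return (int)(random_draw % (unsigned)requested) + 1;
}
#endif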
833 new_nthreads = set_nthreads; 834 if (!get__dynamic_2(parent_team, master_tid)) { 835 ; 836 } 837 #ifdef USE_LOAD_BALANCE 838 else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) { 839 new_nthreads = __kmp_load_balance_nproc(root, set_nthreads); 840 if (new_nthreads == 1) { 841 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 842 "reservation to 1 thread\n", 843 master_tid)); 844 return 1; 845 } 846 if (new_nthreads < set_nthreads) { 847 KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced " 848 "reservation to %d threads\n", 849 master_tid, new_nthreads)); 850 } 851 } 852 #endif /* USE_LOAD_BALANCE */ 853 else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) { 854 new_nthreads = __kmp_avail_proc - __kmp_nth + 855 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 856 if (new_nthreads <= 1) { 857 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 858 "reservation to 1 thread\n", 859 master_tid)); 860 return 1; 861 } 862 if (new_nthreads < set_nthreads) { 863 KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced " 864 "reservation to %d threads\n", 865 master_tid, new_nthreads)); 866 } else { 867 new_nthreads = set_nthreads; 868 } 869 } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) { 870 if (set_nthreads > 2) { 871 new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]); 872 new_nthreads = (new_nthreads % set_nthreads) + 1; 873 if (new_nthreads == 1) { 874 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 875 "reservation to 1 thread\n", 876 master_tid)); 877 return 1; 878 } 879 if (new_nthreads < set_nthreads) { 880 KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced " 881 "reservation to %d threads\n", 882 master_tid, new_nthreads)); 883 } 884 } 885 } else { 886 KMP_ASSERT(0); 887 } 888 889 // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT. 890 if (__kmp_nth + new_nthreads - 891 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 892 __kmp_max_nth) { 893 int tl_nthreads = __kmp_max_nth - __kmp_nth + 894 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 895 if (tl_nthreads <= 0) { 896 tl_nthreads = 1; 897 } 898 899 // If dyn-var is false, emit a 1-time warning. 900 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 901 __kmp_reserve_warn = 1; 902 __kmp_msg(kmp_ms_warning, 903 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 904 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 905 } 906 if (tl_nthreads == 1) { 907 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT " 908 "reduced reservation to 1 thread\n", 909 master_tid)); 910 return 1; 911 } 912 KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced " 913 "reservation to %d threads\n", 914 master_tid, tl_nthreads)); 915 new_nthreads = tl_nthreads; 916 } 917 918 // Respect OMP_THREAD_LIMIT 919 if (root->r.r_cg_nthreads + new_nthreads - 920 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 921 __kmp_cg_max_nth) { 922 int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads + 923 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 924 if (tl_nthreads <= 0) { 925 tl_nthreads = 1; 926 } 927 928 // If dyn-var is false, emit a 1-time warning. 
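/* Editor's note: the device-level cap above (KMP_DEVICE_THREAD_LIMIT via
   __kmp_max_nth) and the contention-group cap here (OMP_THREAD_LIMIT via
   __kmp_cg_max_nth) use the same arithmetic: how much headroom remains under
   the limit once the threads this root already accounts for are credited
   back.  A sketch with hypothetical names: */
#if 0 /* illustrative sketch only; not compiled */
static int example_cap_new_threads(int requested, int limit,
                                   int already_counted, int root_contrib) {
  int headroom = limit - already_counted + root_contrib;
  if (headroom < 1)
    headroom = 1; /* the master itself always runs */
  return requested > headroom ? headroom : requested;
}
/* e.g. limit = 8, already_counted = 6, root_contrib = 1, requested = 4 -> 3 */
#endif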
929 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 930 __kmp_reserve_warn = 1; 931 __kmp_msg(kmp_ms_warning, 932 KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads), 933 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 934 } 935 if (tl_nthreads == 1) { 936 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT " 937 "reduced reservation to 1 thread\n", 938 master_tid)); 939 return 1; 940 } 941 KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced " 942 "reservation to %d threads\n", 943 master_tid, tl_nthreads)); 944 new_nthreads = tl_nthreads; 945 } 946 947 // Check if the threads array is large enough, or needs expanding. 948 // See comment in __kmp_register_root() about the adjustment if 949 // __kmp_threads[0] == NULL. 950 capacity = __kmp_threads_capacity; 951 if (TCR_PTR(__kmp_threads[0]) == NULL) { 952 --capacity; 953 } 954 if (__kmp_nth + new_nthreads - 955 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) > 956 capacity) { 957 // Expand the threads array. 958 int slotsRequired = __kmp_nth + new_nthreads - 959 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) - 960 capacity; 961 int slotsAdded = __kmp_expand_threads(slotsRequired); 962 if (slotsAdded < slotsRequired) { 963 // The threads array was not expanded enough. 964 new_nthreads -= (slotsRequired - slotsAdded); 965 KMP_ASSERT(new_nthreads >= 1); 966 967 // If dyn-var is false, emit a 1-time warning. 968 if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) { 969 __kmp_reserve_warn = 1; 970 if (__kmp_tp_cached) { 971 __kmp_msg(kmp_ms_warning, 972 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 973 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 974 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 975 } else { 976 __kmp_msg(kmp_ms_warning, 977 KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads), 978 KMP_HNT(SystemLimitOnThreads), __kmp_msg_null); 979 } 980 } 981 } 982 } 983 984 #ifdef KMP_DEBUG 985 if (new_nthreads == 1) { 986 KC_TRACE(10, 987 ("__kmp_reserve_threads: T#%d serializing team after reclaiming " 988 "dead roots and rechecking; requested %d threads\n", 989 __kmp_get_gtid(), set_nthreads)); 990 } else { 991 KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested" 992 " %d threads\n", 993 __kmp_get_gtid(), new_nthreads, set_nthreads)); 994 } 995 #endif // KMP_DEBUG 996 return new_nthreads; 997 } 998 999 /* Allocate threads from the thread pool and assign them to the new team. 
We are 1000 assured that there are enough threads available, because we checked on that 1001 earlier within critical section forkjoin */ 1002 static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team, 1003 kmp_info_t *master_th, int master_gtid) { 1004 int i; 1005 int use_hot_team; 1006 1007 KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc)); 1008 KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid()); 1009 KMP_MB(); 1010 1011 /* first, let's setup the master thread */ 1012 master_th->th.th_info.ds.ds_tid = 0; 1013 master_th->th.th_team = team; 1014 master_th->th.th_team_nproc = team->t.t_nproc; 1015 master_th->th.th_team_master = master_th; 1016 master_th->th.th_team_serialized = FALSE; 1017 master_th->th.th_dispatch = &team->t.t_dispatch[0]; 1018 1019 /* make sure we are not the optimized hot team */ 1020 #if KMP_NESTED_HOT_TEAMS 1021 use_hot_team = 0; 1022 kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams; 1023 if (hot_teams) { // hot teams array is not allocated if 1024 // KMP_HOT_TEAMS_MAX_LEVEL=0 1025 int level = team->t.t_active_level - 1; // index in array of hot teams 1026 if (master_th->th.th_teams_microtask) { // are we inside the teams? 1027 if (master_th->th.th_teams_size.nteams > 1) { 1028 ++level; // level was not increased in teams construct for 1029 // team_of_masters 1030 } 1031 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 1032 master_th->th.th_teams_level == team->t.t_level) { 1033 ++level; // level was not increased in teams construct for 1034 // team_of_workers before the parallel 1035 } // team->t.t_level will be increased inside parallel 1036 } 1037 if (level < __kmp_hot_teams_max_level) { 1038 if (hot_teams[level].hot_team) { 1039 // hot team has already been allocated for given level 1040 KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team); 1041 use_hot_team = 1; // the team is ready to use 1042 } else { 1043 use_hot_team = 0; // AC: threads are not allocated yet 1044 hot_teams[level].hot_team = team; // remember new hot team 1045 hot_teams[level].hot_team_nth = team->t.t_nproc; 1046 } 1047 } else { 1048 use_hot_team = 0; 1049 } 1050 } 1051 #else 1052 use_hot_team = team == root->r.r_hot_team; 1053 #endif 1054 if (!use_hot_team) { 1055 1056 /* install the master thread */ 1057 team->t.t_threads[0] = master_th; 1058 __kmp_initialize_info(master_th, team, 0, master_gtid); 1059 1060 /* now, install the worker threads */ 1061 for (i = 1; i < team->t.t_nproc; i++) { 1062 1063 /* fork or reallocate a new thread and install it in team */ 1064 kmp_info_t *thr = __kmp_allocate_thread(root, team, i); 1065 team->t.t_threads[i] = thr; 1066 KMP_DEBUG_ASSERT(thr); 1067 KMP_DEBUG_ASSERT(thr->th.th_team == team); 1068 /* align team and thread arrived states */ 1069 KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived " 1070 "T#%d(%d:%d) join =%llu, plain=%llu\n", 1071 __kmp_gtid_from_tid(0, team), team->t.t_id, 0, 1072 __kmp_gtid_from_tid(i, team), team->t.t_id, i, 1073 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 1074 team->t.t_bar[bs_plain_barrier].b_arrived)); 1075 #if OMP_40_ENABLED 1076 thr->th.th_teams_microtask = master_th->th.th_teams_microtask; 1077 thr->th.th_teams_level = master_th->th.th_teams_level; 1078 thr->th.th_teams_size = master_th->th.th_teams_size; 1079 #endif 1080 { // Initialize threads' barrier data. 
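/* Editor's note: the loop below seeds each per-barrier b_arrived counter of
   the newly installed worker from the team's current counter, so the first
   barrier this thread joins starts from a consistent arrival count. */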
1081 int b; 1082 kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar; 1083 for (b = 0; b < bs_last_barrier; ++b) { 1084 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 1085 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 1086 #if USE_DEBUGGER 1087 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 1088 #endif 1089 } 1090 } 1091 } 1092 1093 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 1094 __kmp_partition_places(team); 1095 #endif 1096 } 1097 1098 #if OMP_50_ENABLED 1099 if (__kmp_display_affinity && team->t.t_display_affinity != 1) { 1100 for (i = 0; i < team->t.t_nproc; i++) { 1101 kmp_info_t *thr = team->t.t_threads[i]; 1102 if (thr->th.th_prev_num_threads != team->t.t_nproc || 1103 thr->th.th_prev_level != team->t.t_level) { 1104 team->t.t_display_affinity = 1; 1105 break; 1106 } 1107 } 1108 } 1109 #endif 1110 1111 KMP_MB(); 1112 } 1113 1114 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 1115 // Propagate any changes to the floating point control registers out to the team 1116 // We try to avoid unnecessary writes to the relevant cache line in the team 1117 // structure, so we don't make changes unless they are needed. 1118 inline static void propagateFPControl(kmp_team_t *team) { 1119 if (__kmp_inherit_fp_control) { 1120 kmp_int16 x87_fpu_control_word; 1121 kmp_uint32 mxcsr; 1122 1123 // Get master values of FPU control flags (both X87 and vector) 1124 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1125 __kmp_store_mxcsr(&mxcsr); 1126 mxcsr &= KMP_X86_MXCSR_MASK; 1127 1128 // There is no point looking at t_fp_control_saved here. 1129 // If it is TRUE, we still have to update the values if they are different 1130 // from those we now have. If it is FALSE we didn't save anything yet, but 1131 // our objective is the same. We have to ensure that the values in the team 1132 // are the same as those we have. 1133 // So, this code achieves what we need whether or not t_fp_control_saved is 1134 // true. By checking whether the value needs updating we avoid unnecessary 1135 // writes that would put the cache-line into a written state, causing all 1136 // threads in the team to have to read it again. 1137 KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word); 1138 KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr); 1139 // Although we don't use this value, other code in the runtime wants to know 1140 // whether it should restore them. So we must ensure it is correct. 1141 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE); 1142 } else { 1143 // Similarly here. Don't write to this cache-line in the team structure 1144 // unless we have to. 1145 KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE); 1146 } 1147 } 1148 1149 // Do the opposite, setting the hardware registers to the updated values from 1150 // the team. 1151 inline static void updateHWFPControl(kmp_team_t *team) { 1152 if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) { 1153 // Only reset the fp control regs if they have been changed in the team. 1154 // the parallel region that we are exiting. 
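/* Editor's note: propagateFPControl() above relies on a check-before-write
   pattern (KMP_CHECK_UPDATE) so shared team fields are stored to only when
   the value actually changes, keeping the cache line clean for the other
   threads that read it.  A generic sketch of that pattern: */
#if 0 /* illustrative sketch only; not compiled */
#define EXAMPLE_CHECK_UPDATE(dst, val)                                         \
  do {                                                                         \
    if ((dst) != (val))                                                        \
      (dst) = (val);                                                           \
  } while (0)

/* Usage: EXAMPLE_CHECK_UPDATE(team_copy_of_mxcsr, current_mxcsr);
   An unconditional store would dirty the shared line on every fork even when
   the FP state is unchanged. */
#endif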
1155 kmp_int16 x87_fpu_control_word; 1156 kmp_uint32 mxcsr; 1157 __kmp_store_x87_fpu_control_word(&x87_fpu_control_word); 1158 __kmp_store_mxcsr(&mxcsr); 1159 mxcsr &= KMP_X86_MXCSR_MASK; 1160 1161 if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) { 1162 __kmp_clear_x87_fpu_status_word(); 1163 __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word); 1164 } 1165 1166 if (team->t.t_mxcsr != mxcsr) { 1167 __kmp_load_mxcsr(&team->t.t_mxcsr); 1168 } 1169 } 1170 } 1171 #else 1172 #define propagateFPControl(x) ((void)0) 1173 #define updateHWFPControl(x) ((void)0) 1174 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1175 1176 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, 1177 int realloc); // forward declaration 1178 1179 /* Run a parallel region that has been serialized, so runs only in a team of the 1180 single master thread. */ 1181 void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) { 1182 kmp_info_t *this_thr; 1183 kmp_team_t *serial_team; 1184 1185 KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid)); 1186 1187 /* Skip all this code for autopar serialized loops since it results in 1188 unacceptable overhead */ 1189 if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR)) 1190 return; 1191 1192 if (!TCR_4(__kmp_init_parallel)) 1193 __kmp_parallel_initialize(); 1194 1195 #if OMP_50_ENABLED 1196 __kmp_resume_if_soft_paused(); 1197 #endif 1198 1199 this_thr = __kmp_threads[global_tid]; 1200 serial_team = this_thr->th.th_serial_team; 1201 1202 /* utilize the serialized team held by this thread */ 1203 KMP_DEBUG_ASSERT(serial_team); 1204 KMP_MB(); 1205 1206 if (__kmp_tasking_mode != tskm_immediate_exec) { 1207 KMP_DEBUG_ASSERT( 1208 this_thr->th.th_task_team == 1209 this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]); 1210 KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] == 1211 NULL); 1212 KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / " 1213 "team %p, new task_team = NULL\n", 1214 global_tid, this_thr->th.th_task_team, this_thr->th.th_team)); 1215 this_thr->th.th_task_team = NULL; 1216 } 1217 1218 #if OMP_40_ENABLED 1219 kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind; 1220 if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1221 proc_bind = proc_bind_false; 1222 } else if (proc_bind == proc_bind_default) { 1223 // No proc_bind clause was specified, so use the current value 1224 // of proc-bind-var for this parallel region. 
1225 proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind; 1226 } 1227 // Reset for next parallel region 1228 this_thr->th.th_set_proc_bind = proc_bind_default; 1229 #endif /* OMP_40_ENABLED */ 1230 1231 #if OMPT_SUPPORT 1232 ompt_data_t ompt_parallel_data = ompt_data_none; 1233 ompt_data_t *implicit_task_data; 1234 void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid); 1235 if (ompt_enabled.enabled && 1236 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1237 1238 ompt_task_info_t *parent_task_info; 1239 parent_task_info = OMPT_CUR_TASK_INFO(this_thr); 1240 1241 parent_task_info->frame.enter_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1242 if (ompt_enabled.ompt_callback_parallel_begin) { 1243 int team_size = 1; 1244 1245 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1246 &(parent_task_info->task_data), &(parent_task_info->frame), 1247 &ompt_parallel_data, team_size, ompt_parallel_invoker_program, 1248 codeptr); 1249 } 1250 } 1251 #endif // OMPT_SUPPORT 1252 1253 if (this_thr->th.th_team != serial_team) { 1254 // Nested level will be an index in the nested nthreads array 1255 int level = this_thr->th.th_team->t.t_level; 1256 1257 if (serial_team->t.t_serialized) { 1258 /* this serial team was already used 1259 TODO increase performance by making this locks more specific */ 1260 kmp_team_t *new_team; 1261 1262 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1263 1264 new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1, 1265 #if OMPT_SUPPORT 1266 ompt_parallel_data, 1267 #endif 1268 #if OMP_40_ENABLED 1269 proc_bind, 1270 #endif 1271 &this_thr->th.th_current_task->td_icvs, 1272 0 USE_NESTED_HOT_ARG(NULL)); 1273 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1274 KMP_ASSERT(new_team); 1275 1276 /* setup new serialized team and install it */ 1277 new_team->t.t_threads[0] = this_thr; 1278 new_team->t.t_parent = this_thr->th.th_team; 1279 serial_team = new_team; 1280 this_thr->th.th_serial_team = serial_team; 1281 1282 KF_TRACE( 1283 10, 1284 ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n", 1285 global_tid, serial_team)); 1286 1287 /* TODO the above breaks the requirement that if we run out of resources, 1288 then we can still guarantee that serialized teams are ok, since we may 1289 need to allocate a new one */ 1290 } else { 1291 KF_TRACE( 1292 10, 1293 ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n", 1294 global_tid, serial_team)); 1295 } 1296 1297 /* we have to initialize this serial team */ 1298 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1299 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1300 KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team); 1301 serial_team->t.t_ident = loc; 1302 serial_team->t.t_serialized = 1; 1303 serial_team->t.t_nproc = 1; 1304 serial_team->t.t_parent = this_thr->th.th_team; 1305 serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched; 1306 this_thr->th.th_team = serial_team; 1307 serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid; 1308 1309 KF_TRACE(10, ("__kmpc_serialized_parallel: T#d curtask=%p\n", global_tid, 1310 this_thr->th.th_current_task)); 1311 KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1); 1312 this_thr->th.th_current_task->td_flags.executing = 0; 1313 1314 __kmp_push_current_task_to_thread(this_thr, serial_team, 0); 1315 1316 /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an 1317 implicit task for each serialized task represented by 1318 team->t.t_serialized? 
*/ 1319 copy_icvs(&this_thr->th.th_current_task->td_icvs, 1320 &this_thr->th.th_current_task->td_parent->td_icvs); 1321 1322 // Thread value exists in the nested nthreads array for the next nested 1323 // level 1324 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1325 this_thr->th.th_current_task->td_icvs.nproc = 1326 __kmp_nested_nth.nth[level + 1]; 1327 } 1328 1329 #if OMP_40_ENABLED 1330 if (__kmp_nested_proc_bind.used && 1331 (level + 1 < __kmp_nested_proc_bind.used)) { 1332 this_thr->th.th_current_task->td_icvs.proc_bind = 1333 __kmp_nested_proc_bind.bind_types[level + 1]; 1334 } 1335 #endif /* OMP_40_ENABLED */ 1336 1337 #if USE_DEBUGGER 1338 serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger. 1339 #endif 1340 this_thr->th.th_info.ds.ds_tid = 0; 1341 1342 /* set thread cache values */ 1343 this_thr->th.th_team_nproc = 1; 1344 this_thr->th.th_team_master = this_thr; 1345 this_thr->th.th_team_serialized = 1; 1346 1347 serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1; 1348 serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level; 1349 #if OMP_50_ENABLED 1350 serial_team->t.t_def_allocator = this_thr->th.th_def_allocator; // save 1351 #endif 1352 1353 propagateFPControl(serial_team); 1354 1355 /* check if we need to allocate dispatch buffers stack */ 1356 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1357 if (!serial_team->t.t_dispatch->th_disp_buffer) { 1358 serial_team->t.t_dispatch->th_disp_buffer = 1359 (dispatch_private_info_t *)__kmp_allocate( 1360 sizeof(dispatch_private_info_t)); 1361 } 1362 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1363 1364 KMP_MB(); 1365 1366 } else { 1367 /* this serialized team is already being used, 1368 * that's fine, just add another nested level */ 1369 KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team); 1370 KMP_DEBUG_ASSERT(serial_team->t.t_threads); 1371 KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr); 1372 ++serial_team->t.t_serialized; 1373 this_thr->th.th_team_serialized = serial_team->t.t_serialized; 1374 1375 // Nested level will be an index in the nested nthreads array 1376 int level = this_thr->th.th_team->t.t_level; 1377 // Thread value exists in the nested nthreads array for the next nested 1378 // level 1379 if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) { 1380 this_thr->th.th_current_task->td_icvs.nproc = 1381 __kmp_nested_nth.nth[level + 1]; 1382 } 1383 serial_team->t.t_level++; 1384 KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level " 1385 "of serial team %p to %d\n", 1386 global_tid, serial_team, serial_team->t.t_level)); 1387 1388 /* allocate/push dispatch buffers stack */ 1389 KMP_DEBUG_ASSERT(serial_team->t.t_dispatch); 1390 { 1391 dispatch_private_info_t *disp_buffer = 1392 (dispatch_private_info_t *)__kmp_allocate( 1393 sizeof(dispatch_private_info_t)); 1394 disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer; 1395 serial_team->t.t_dispatch->th_disp_buffer = disp_buffer; 1396 } 1397 this_thr->th.th_dispatch = serial_team->t.t_dispatch; 1398 1399 KMP_MB(); 1400 } 1401 #if OMP_40_ENABLED 1402 KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq); 1403 #endif 1404 1405 #if OMP_50_ENABLED 1406 // Perform the display affinity functionality for 1407 // serialized parallel regions 1408 if (__kmp_display_affinity) { 1409 if (this_thr->th.th_prev_level != serial_team->t.t_level || 1410 this_thr->th.th_prev_num_threads != 1) { 1411 // NULL means use the affinity-format-var ICV 1412 
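/* Editor's note: each additional serialized nesting level (handled a bit
   earlier above) pushes a freshly allocated dispatch_private_info_t onto the
   serial team's th_disp_buffer list, a plain singly linked stack.  A sketch of
   that push with hypothetical names: */
#if 0 /* illustrative sketch only; not compiled */
typedef struct example_disp_buffer {
  struct example_disp_buffer *next; /* enclosing nesting level's buffer */
  /* ... per-level loop-dispatch state ... */
} example_disp_buffer_t;

static void example_push_disp_buffer(example_disp_buffer_t **head,
                                     example_disp_buffer_t *buf) {
  buf->next = *head; /* remember the enclosing level */
  *head = buf; /* the new level's buffer becomes current */
}
#endif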
__kmp_aux_display_affinity(global_tid, NULL); 1413 this_thr->th.th_prev_level = serial_team->t.t_level; 1414 this_thr->th.th_prev_num_threads = 1; 1415 } 1416 } 1417 #endif 1418 1419 if (__kmp_env_consistency_check) 1420 __kmp_push_parallel(global_tid, NULL); 1421 #if OMPT_SUPPORT 1422 serial_team->t.ompt_team_info.master_return_address = codeptr; 1423 if (ompt_enabled.enabled && 1424 this_thr->th.ompt_thread_info.state != ompt_state_overhead) { 1425 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1426 1427 ompt_lw_taskteam_t lw_taskteam; 1428 __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid, 1429 &ompt_parallel_data, codeptr); 1430 1431 __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1); 1432 // don't use lw_taskteam after linking. content was swaped 1433 1434 /* OMPT implicit task begin */ 1435 implicit_task_data = OMPT_CUR_TASK_DATA(this_thr); 1436 if (ompt_enabled.ompt_callback_implicit_task) { 1437 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1438 ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr), 1439 OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1440 OMPT_CUR_TASK_INFO(this_thr) 1441 ->thread_num = __kmp_tid_from_gtid(global_tid); 1442 } 1443 1444 /* OMPT state */ 1445 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 1446 OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame.ptr = OMPT_GET_FRAME_ADDRESS(0); 1447 } 1448 #endif 1449 } 1450 1451 /* most of the work for a fork */ 1452 /* return true if we really went parallel, false if serialized */ 1453 int __kmp_fork_call(ident_t *loc, int gtid, 1454 enum fork_context_e call_context, // Intel, GNU, ... 1455 kmp_int32 argc, microtask_t microtask, launch_t invoker, 1456 /* TODO: revert workaround for Intel(R) 64 tracker #96 */ 1457 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1458 va_list *ap 1459 #else 1460 va_list ap 1461 #endif 1462 ) { 1463 void **argv; 1464 int i; 1465 int master_tid; 1466 int master_this_cons; 1467 kmp_team_t *team; 1468 kmp_team_t *parent_team; 1469 kmp_info_t *master_th; 1470 kmp_root_t *root; 1471 int nthreads; 1472 int master_active; 1473 int master_set_numthreads; 1474 int level; 1475 #if OMP_40_ENABLED 1476 int active_level; 1477 int teams_level; 1478 #endif 1479 #if KMP_NESTED_HOT_TEAMS 1480 kmp_hot_team_ptr_t **p_hot_teams; 1481 #endif 1482 { // KMP_TIME_BLOCK 1483 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call); 1484 KMP_COUNT_VALUE(OMP_PARALLEL_args, argc); 1485 1486 KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid)); 1487 if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) { 1488 /* Some systems prefer the stack for the root thread(s) to start with */ 1489 /* some gap from the parent stack to prevent false sharing. 
*/ 1490 void *dummy = KMP_ALLOCA(__kmp_stkpadding); 1491 /* These 2 lines below are so this does not get optimized out */ 1492 if (__kmp_stkpadding > KMP_MAX_STKPADDING) 1493 __kmp_stkpadding += (short)((kmp_int64)dummy); 1494 } 1495 1496 /* initialize if needed */ 1497 KMP_DEBUG_ASSERT( 1498 __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown 1499 if (!TCR_4(__kmp_init_parallel)) 1500 __kmp_parallel_initialize(); 1501 1502 #if OMP_50_ENABLED 1503 __kmp_resume_if_soft_paused(); 1504 #endif 1505 1506 /* setup current data */ 1507 master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with 1508 // shutdown 1509 parent_team = master_th->th.th_team; 1510 master_tid = master_th->th.th_info.ds.ds_tid; 1511 master_this_cons = master_th->th.th_local.this_construct; 1512 root = master_th->th.th_root; 1513 master_active = root->r.r_active; 1514 master_set_numthreads = master_th->th.th_set_nproc; 1515 1516 #if OMPT_SUPPORT 1517 ompt_data_t ompt_parallel_data = ompt_data_none; 1518 ompt_data_t *parent_task_data; 1519 ompt_frame_t *ompt_frame; 1520 ompt_data_t *implicit_task_data; 1521 void *return_address = NULL; 1522 1523 if (ompt_enabled.enabled) { 1524 __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame, 1525 NULL, NULL); 1526 return_address = OMPT_LOAD_RETURN_ADDRESS(gtid); 1527 } 1528 #endif 1529 1530 // Nested level will be an index in the nested nthreads array 1531 level = parent_team->t.t_level; 1532 // used to launch non-serial teams even if nested is not allowed 1533 active_level = parent_team->t.t_active_level; 1534 #if OMP_40_ENABLED 1535 // needed to check nesting inside the teams 1536 teams_level = master_th->th.th_teams_level; 1537 #endif 1538 #if KMP_NESTED_HOT_TEAMS 1539 p_hot_teams = &master_th->th.th_hot_teams; 1540 if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) { 1541 *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate( 1542 sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level); 1543 (*p_hot_teams)[0].hot_team = root->r.r_hot_team; 1544 // it is either actual or not needed (when active_level > 0) 1545 (*p_hot_teams)[0].hot_team_nth = 1; 1546 } 1547 #endif 1548 1549 #if OMPT_SUPPORT 1550 if (ompt_enabled.enabled) { 1551 if (ompt_enabled.ompt_callback_parallel_begin) { 1552 int team_size = master_set_numthreads 1553 ? master_set_numthreads 1554 : get__nproc_2(parent_team, master_tid); 1555 ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)( 1556 parent_task_data, ompt_frame, &ompt_parallel_data, team_size, 1557 OMPT_INVOKER(call_context), return_address); 1558 } 1559 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1560 } 1561 #endif 1562 1563 master_th->th.th_ident = loc; 1564 1565 #if OMP_40_ENABLED 1566 if (master_th->th.th_teams_microtask && ap && 1567 microtask != (microtask_t)__kmp_teams_master && level == teams_level) { 1568 // AC: This is start of parallel that is nested inside teams construct. 1569 // The team is actual (hot), all workers are ready at the fork barrier. 1570 // No lock needed to initialize the team a bit, then free workers. 
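/* Editor's note: the argument copy a few lines below pulls 'argc' pointers out
   of the va_list that __kmp_fork_call received (passed by address on some
   targets; see the "tracker #96" workaround in the signature).  A
   self-contained sketch of forwarding a va_list by address, with hypothetical
   names: */
#if 0 /* illustrative sketch only; not compiled */
#include <stdarg.h>

/* Copy 'argc' pointer arguments out of a va_list owned by the caller. */
static void example_copy_args(void **argv, int argc, va_list *ap) {
  for (int i = 0; i < argc; ++i)
    argv[i] = va_arg(*ap, void *);
}

static void example_forward(int argc, ...) {
  void *argv[16];
  va_list ap;
  if (argc > 16)
    argc = 16; /* keep the sketch bounded */
  va_start(ap, argc);
  example_copy_args(argv, argc, &ap); /* pass by address so the position
                                         consumed by the callee is visible
                                         to the caller */
  va_end(ap);
  (void)argv;
}
#endif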
1571 parent_team->t.t_ident = loc; 1572 __kmp_alloc_argv_entries(argc, parent_team, TRUE); 1573 parent_team->t.t_argc = argc; 1574 argv = (void **)parent_team->t.t_argv; 1575 for (i = argc - 1; i >= 0; --i) 1576 /* TODO: revert workaround for Intel(R) 64 tracker #96 */ 1577 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1578 *argv++ = va_arg(*ap, void *); 1579 #else 1580 *argv++ = va_arg(ap, void *); 1581 #endif 1582 // Increment our nested depth levels, but not increase the serialization 1583 if (parent_team == master_th->th.th_serial_team) { 1584 // AC: we are in serialized parallel 1585 __kmpc_serialized_parallel(loc, gtid); 1586 KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1); 1587 // AC: need this in order enquiry functions work 1588 // correctly, will restore at join time 1589 parent_team->t.t_serialized--; 1590 #if OMPT_SUPPORT 1591 void *dummy; 1592 void **exit_runtime_p; 1593 1594 ompt_lw_taskteam_t lw_taskteam; 1595 1596 if (ompt_enabled.enabled) { 1597 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1598 &ompt_parallel_data, return_address); 1599 exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame.ptr); 1600 1601 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1602 // don't use lw_taskteam after linking. content was swaped 1603 1604 /* OMPT implicit task begin */ 1605 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1606 if (ompt_enabled.ompt_callback_implicit_task) { 1607 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1608 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1609 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1610 OMPT_CUR_TASK_INFO(master_th) 1611 ->thread_num = __kmp_tid_from_gtid(gtid); 1612 } 1613 1614 /* OMPT state */ 1615 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1616 } else { 1617 exit_runtime_p = &dummy; 1618 } 1619 #endif 1620 1621 { 1622 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1623 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1624 __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv 1625 #if OMPT_SUPPORT 1626 , 1627 exit_runtime_p 1628 #endif 1629 ); 1630 } 1631 1632 #if OMPT_SUPPORT 1633 *exit_runtime_p = NULL; 1634 if (ompt_enabled.enabled) { 1635 OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = ompt_data_none; 1636 if (ompt_enabled.ompt_callback_implicit_task) { 1637 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1638 ompt_scope_end, NULL, implicit_task_data, 1, 1639 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
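/* Editor's note: in this parallel-nested-in-teams path the OMPT bookkeeping
   brackets the microtask invocation above: link the lightweight task team,
   report implicit-task begin, run the microtask, report implicit-task end,
   then unlink and report parallel-end just below. */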
1640 } 1641 __ompt_lw_taskteam_unlink(master_th); 1642 1643 if (ompt_enabled.ompt_callback_parallel_end) { 1644 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1645 OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th), 1646 OMPT_INVOKER(call_context), return_address); 1647 } 1648 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1649 } 1650 #endif 1651 return TRUE; 1652 } 1653 1654 parent_team->t.t_pkfn = microtask; 1655 parent_team->t.t_invoke = invoker; 1656 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1657 parent_team->t.t_active_level++; 1658 parent_team->t.t_level++; 1659 #if OMP_50_ENABLED 1660 parent_team->t.t_def_allocator = master_th->th.th_def_allocator; // save 1661 #endif 1662 1663 /* Change number of threads in the team if requested */ 1664 if (master_set_numthreads) { // The parallel has num_threads clause 1665 if (master_set_numthreads < master_th->th.th_teams_size.nth) { 1666 // AC: only can reduce number of threads dynamically, can't increase 1667 kmp_info_t **other_threads = parent_team->t.t_threads; 1668 parent_team->t.t_nproc = master_set_numthreads; 1669 for (i = 0; i < master_set_numthreads; ++i) { 1670 other_threads[i]->th.th_team_nproc = master_set_numthreads; 1671 } 1672 // Keep extra threads hot in the team for possible next parallels 1673 } 1674 master_th->th.th_set_nproc = 0; 1675 } 1676 1677 #if USE_DEBUGGER 1678 if (__kmp_debugging) { // Let debugger override number of threads. 1679 int nth = __kmp_omp_num_threads(loc); 1680 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1681 master_set_numthreads = nth; 1682 } 1683 } 1684 #endif 1685 1686 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1687 "master_th=%p, gtid=%d\n", 1688 root, parent_team, master_th, gtid)); 1689 __kmp_internal_fork(loc, gtid, parent_team); 1690 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1691 "master_th=%p, gtid=%d\n", 1692 root, parent_team, master_th, gtid)); 1693 1694 /* Invoke microtask for MASTER thread */ 1695 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1696 parent_team->t.t_id, parent_team->t.t_pkfn)); 1697 1698 if (!parent_team->t.t_invoke(gtid)) { 1699 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 1700 } 1701 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1702 parent_team->t.t_id, parent_team->t.t_pkfn)); 1703 KMP_MB(); /* Flush all pending memory write invalidates. */ 1704 1705 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1706 1707 return TRUE; 1708 } // Parallel closely nested in teams construct 1709 #endif /* OMP_40_ENABLED */ 1710 1711 #if KMP_DEBUG 1712 if (__kmp_tasking_mode != tskm_immediate_exec) { 1713 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1714 parent_team->t.t_task_team[master_th->th.th_task_state]); 1715 } 1716 #endif 1717 1718 if (parent_team->t.t_active_level >= 1719 master_th->th.th_current_task->td_icvs.max_active_levels) { 1720 nthreads = 1; 1721 } else { 1722 #if OMP_40_ENABLED 1723 int enter_teams = ((ap == NULL && active_level == 0) || 1724 (ap && teams_level > 0 && teams_level == level)); 1725 #endif 1726 nthreads = 1727 master_set_numthreads 1728 ? master_set_numthreads 1729 : get__nproc_2( 1730 parent_team, 1731 master_tid); // TODO: get nproc directly from current task 1732 1733 // Check if we need to take forkjoin lock? (no need for serialized 1734 // parallel out of teams construct). 
This code moved here from 1735 // __kmp_reserve_threads() to speedup nested serialized parallels. 1736 if (nthreads > 1) { 1737 if ((!get__nested(master_th) && (root->r.r_in_parallel 1738 #if OMP_40_ENABLED 1739 && !enter_teams 1740 #endif /* OMP_40_ENABLED */ 1741 )) || 1742 (__kmp_library == library_serial)) { 1743 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1744 " threads\n", 1745 gtid, nthreads)); 1746 nthreads = 1; 1747 } 1748 } 1749 if (nthreads > 1) { 1750 /* determine how many new threads we can use */ 1751 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1752 nthreads = __kmp_reserve_threads( 1753 root, parent_team, master_tid, nthreads 1754 #if OMP_40_ENABLED 1755 /* AC: If we execute teams from parallel region (on host), then 1756 teams should be created but each can only have 1 thread if 1757 nesting is disabled. If teams called from serial region, then 1758 teams and their threads should be created regardless of the 1759 nesting setting. */ 1760 , 1761 enter_teams 1762 #endif /* OMP_40_ENABLED */ 1763 ); 1764 if (nthreads == 1) { 1765 // Free lock for single thread execution here; for multi-thread 1766 // execution it will be freed later after team of threads created 1767 // and initialized 1768 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1769 } 1770 } 1771 } 1772 KMP_DEBUG_ASSERT(nthreads > 0); 1773 1774 // If we temporarily changed the set number of threads then restore it now 1775 master_th->th.th_set_nproc = 0; 1776 1777 /* create a serialized parallel region? */ 1778 if (nthreads == 1) { 1779 /* josh todo: hypothetical question: what do we do for OS X*? */ 1780 #if KMP_OS_LINUX && \ 1781 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1782 void *args[argc]; 1783 #else 1784 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1785 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1786 KMP_ARCH_AARCH64) */ 1787 1788 KA_TRACE(20, 1789 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1790 1791 __kmpc_serialized_parallel(loc, gtid); 1792 1793 if (call_context == fork_context_intel) { 1794 /* TODO this sucks, use the compiler itself to pass args! :) */ 1795 master_th->th.th_serial_team->t.t_ident = loc; 1796 #if OMP_40_ENABLED 1797 if (!ap) { 1798 // revert change made in __kmpc_serialized_parallel() 1799 master_th->th.th_serial_team->t.t_level--; 1800 // Get args from parent team for teams construct 1801 1802 #if OMPT_SUPPORT 1803 void *dummy; 1804 void **exit_runtime_p; 1805 ompt_task_info_t *task_info; 1806 1807 ompt_lw_taskteam_t lw_taskteam; 1808 1809 if (ompt_enabled.enabled) { 1810 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1811 &ompt_parallel_data, return_address); 1812 1813 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1814 // don't use lw_taskteam after linking. content was swaped 1815 1816 task_info = OMPT_CUR_TASK_INFO(master_th); 1817 exit_runtime_p = &(task_info->frame.exit_frame.ptr); 1818 if (ompt_enabled.ompt_callback_implicit_task) { 1819 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1820 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1821 &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
1822 OMPT_CUR_TASK_INFO(master_th) 1823 ->thread_num = __kmp_tid_from_gtid(gtid); 1824 } 1825 1826 /* OMPT state */ 1827 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1828 } else { 1829 exit_runtime_p = &dummy; 1830 } 1831 #endif 1832 1833 { 1834 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1835 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1836 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1837 parent_team->t.t_argv 1838 #if OMPT_SUPPORT 1839 , 1840 exit_runtime_p 1841 #endif 1842 ); 1843 } 1844 1845 #if OMPT_SUPPORT 1846 if (ompt_enabled.enabled) { 1847 exit_runtime_p = NULL; 1848 if (ompt_enabled.ompt_callback_implicit_task) { 1849 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1850 ompt_scope_end, NULL, &(task_info->task_data), 1, 1851 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1852 } 1853 1854 __ompt_lw_taskteam_unlink(master_th); 1855 if (ompt_enabled.ompt_callback_parallel_end) { 1856 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1857 OMPT_CUR_TEAM_DATA(master_th), parent_task_data, 1858 OMPT_INVOKER(call_context), return_address); 1859 } 1860 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1861 } 1862 #endif 1863 } else if (microtask == (microtask_t)__kmp_teams_master) { 1864 KMP_DEBUG_ASSERT(master_th->th.th_team == 1865 master_th->th.th_serial_team); 1866 team = master_th->th.th_team; 1867 // team->t.t_pkfn = microtask; 1868 team->t.t_invoke = invoker; 1869 __kmp_alloc_argv_entries(argc, team, TRUE); 1870 team->t.t_argc = argc; 1871 argv = (void **)team->t.t_argv; 1872 if (ap) { 1873 for (i = argc - 1; i >= 0; --i) 1874 // TODO: revert workaround for Intel(R) 64 tracker #96 1875 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1876 *argv++ = va_arg(*ap, void *); 1877 #else 1878 *argv++ = va_arg(ap, void *); 1879 #endif 1880 } else { 1881 for (i = 0; i < argc; ++i) 1882 // Get args from parent team for teams construct 1883 argv[i] = parent_team->t.t_argv[i]; 1884 } 1885 // AC: revert change made in __kmpc_serialized_parallel() 1886 // because initial code in teams should have level=0 1887 team->t.t_level--; 1888 // AC: call special invoker for outer "parallel" of teams construct 1889 invoker(gtid); 1890 } else { 1891 #endif /* OMP_40_ENABLED */ 1892 argv = args; 1893 for (i = argc - 1; i >= 0; --i) 1894 // TODO: revert workaround for Intel(R) 64 tracker #96 1895 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1896 *argv++ = va_arg(*ap, void *); 1897 #else 1898 *argv++ = va_arg(ap, void *); 1899 #endif 1900 KMP_MB(); 1901 1902 #if OMPT_SUPPORT 1903 void *dummy; 1904 void **exit_runtime_p; 1905 ompt_task_info_t *task_info; 1906 1907 ompt_lw_taskteam_t lw_taskteam; 1908 1909 if (ompt_enabled.enabled) { 1910 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1911 &ompt_parallel_data, return_address); 1912 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1913 // don't use lw_taskteam after linking. content was swaped 1914 task_info = OMPT_CUR_TASK_INFO(master_th); 1915 exit_runtime_p = &(task_info->frame.exit_frame.ptr); 1916 1917 /* OMPT implicit task begin */ 1918 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1919 if (ompt_enabled.ompt_callback_implicit_task) { 1920 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1921 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1922 implicit_task_data, 1, __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 
1923 OMPT_CUR_TASK_INFO(master_th) 1924 ->thread_num = __kmp_tid_from_gtid(gtid); 1925 } 1926 1927 /* OMPT state */ 1928 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 1929 } else { 1930 exit_runtime_p = &dummy; 1931 } 1932 #endif 1933 1934 { 1935 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1936 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1937 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1938 #if OMPT_SUPPORT 1939 , 1940 exit_runtime_p 1941 #endif 1942 ); 1943 } 1944 1945 #if OMPT_SUPPORT 1946 if (ompt_enabled.enabled) { 1947 *exit_runtime_p = NULL; 1948 if (ompt_enabled.ompt_callback_implicit_task) { 1949 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1950 ompt_scope_end, NULL, &(task_info->task_data), 1, 1951 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 1952 } 1953 1954 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1955 __ompt_lw_taskteam_unlink(master_th); 1956 if (ompt_enabled.ompt_callback_parallel_end) { 1957 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1958 &ompt_parallel_data, parent_task_data, 1959 OMPT_INVOKER(call_context), return_address); 1960 } 1961 master_th->th.ompt_thread_info.state = ompt_state_overhead; 1962 } 1963 #endif 1964 #if OMP_40_ENABLED 1965 } 1966 #endif /* OMP_40_ENABLED */ 1967 } else if (call_context == fork_context_gnu) { 1968 #if OMPT_SUPPORT 1969 ompt_lw_taskteam_t lwt; 1970 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1971 return_address); 1972 1973 lwt.ompt_task_info.frame.exit_frame = ompt_data_none; 1974 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1975 // don't use lw_taskteam after linking. content was swaped 1976 #endif 1977 1978 // we were called from GNU native code 1979 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1980 return FALSE; 1981 } else { 1982 KMP_ASSERT2(call_context < fork_context_last, 1983 "__kmp_fork_call: unknown fork_context parameter"); 1984 } 1985 1986 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1987 KMP_MB(); 1988 return FALSE; 1989 } // if (nthreads == 1) 1990 1991 // GEH: only modify the executing flag in the case when not serialized 1992 // serialized case is handled in kmpc_serialized_parallel 1993 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1994 "curtask=%p, curtask_max_aclevel=%d\n", 1995 parent_team->t.t_active_level, master_th, 1996 master_th->th.th_current_task, 1997 master_th->th.th_current_task->td_icvs.max_active_levels)); 1998 // TODO: GEH - cannot do this assertion because root thread not set up as 1999 // executing 2000 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 2001 master_th->th.th_current_task->td_flags.executing = 0; 2002 2003 #if OMP_40_ENABLED 2004 if (!master_th->th.th_teams_microtask || level > teams_level) 2005 #endif /* OMP_40_ENABLED */ 2006 { 2007 /* Increment our nested depth level */ 2008 KMP_ATOMIC_INC(&root->r.r_in_parallel); 2009 } 2010 2011 // See if we need to make a copy of the ICVs. 2012 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 2013 if ((level + 1 < __kmp_nested_nth.used) && 2014 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 2015 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 2016 } else { 2017 nthreads_icv = 0; // don't update 2018 } 2019 2020 #if OMP_40_ENABLED 2021 // Figure out the proc_bind_policy for the new team. 
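// Resolution order implemented below (summarized): proc-bind-var == false
// disables binding outright; otherwise an explicit proc_bind clause on the
// parallel directive wins, and proc-bind-var is used when no clause was given.
// A new bind-var is forwarded to the children only when OMP_PROC_BIND listed a
// different value for the next nesting level; e.g. OMP_PROC_BIND=spread,close
// makes the workers of a top-level parallel run their own nested regions with
// "close" binding.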
2022 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 2023 kmp_proc_bind_t proc_bind_icv = 2024 proc_bind_default; // proc_bind_default means don't update 2025 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 2026 proc_bind = proc_bind_false; 2027 } else { 2028 if (proc_bind == proc_bind_default) { 2029 // No proc_bind clause specified; use current proc-bind-var for this 2030 // parallel region 2031 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 2032 } 2033 /* else: The proc_bind policy was specified explicitly on parallel clause. 2034 This overrides proc-bind-var for this parallel region, but does not 2035 change proc-bind-var. */ 2036 // Figure the value of proc-bind-var for the child threads. 2037 if ((level + 1 < __kmp_nested_proc_bind.used) && 2038 (__kmp_nested_proc_bind.bind_types[level + 1] != 2039 master_th->th.th_current_task->td_icvs.proc_bind)) { 2040 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 2041 } 2042 } 2043 2044 // Reset for next parallel region 2045 master_th->th.th_set_proc_bind = proc_bind_default; 2046 #endif /* OMP_40_ENABLED */ 2047 2048 if ((nthreads_icv > 0) 2049 #if OMP_40_ENABLED 2050 || (proc_bind_icv != proc_bind_default) 2051 #endif /* OMP_40_ENABLED */ 2052 ) { 2053 kmp_internal_control_t new_icvs; 2054 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2055 new_icvs.next = NULL; 2056 if (nthreads_icv > 0) { 2057 new_icvs.nproc = nthreads_icv; 2058 } 2059 2060 #if OMP_40_ENABLED 2061 if (proc_bind_icv != proc_bind_default) { 2062 new_icvs.proc_bind = proc_bind_icv; 2063 } 2064 #endif /* OMP_40_ENABLED */ 2065 2066 /* allocate a new parallel team */ 2067 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2068 team = __kmp_allocate_team(root, nthreads, nthreads, 2069 #if OMPT_SUPPORT 2070 ompt_parallel_data, 2071 #endif 2072 #if OMP_40_ENABLED 2073 proc_bind, 2074 #endif 2075 &new_icvs, argc USE_NESTED_HOT_ARG(master_th)); 2076 } else { 2077 /* allocate a new parallel team */ 2078 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2079 team = __kmp_allocate_team(root, nthreads, nthreads, 2080 #if OMPT_SUPPORT 2081 ompt_parallel_data, 2082 #endif 2083 #if OMP_40_ENABLED 2084 proc_bind, 2085 #endif 2086 &master_th->th.th_current_task->td_icvs, 2087 argc USE_NESTED_HOT_ARG(master_th)); 2088 } 2089 KF_TRACE( 2090 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2091 2092 /* setup the new team */ 2093 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2094 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2095 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2096 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2097 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2098 #if OMPT_SUPPORT 2099 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2100 return_address); 2101 #endif 2102 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2103 // TODO: parent_team->t.t_level == INT_MAX ??? 
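// Note on the two counters updated below: t_level roughly mirrors
// omp_get_level() (every enclosing parallel, serialized or not), while
// t_active_level mirrors omp_get_active_level() (only regions that actually
// forked more than one thread); the teams-construct branch leaves both
// untouched on purpose.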
2104 #if OMP_40_ENABLED 2105 if (!master_th->th.th_teams_microtask || level > teams_level) { 2106 #endif /* OMP_40_ENABLED */ 2107 int new_level = parent_team->t.t_level + 1; 2108 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2109 new_level = parent_team->t.t_active_level + 1; 2110 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2111 #if OMP_40_ENABLED 2112 } else { 2113 // AC: Do not increase parallel level at start of the teams construct 2114 int new_level = parent_team->t.t_level; 2115 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2116 new_level = parent_team->t.t_active_level; 2117 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2118 } 2119 #endif /* OMP_40_ENABLED */ 2120 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2121 // set master's schedule as new run-time schedule 2122 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2123 2124 #if OMP_40_ENABLED 2125 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2126 #endif 2127 #if OMP_50_ENABLED 2128 KMP_CHECK_UPDATE(team->t.t_def_allocator, master_th->th.th_def_allocator); 2129 #endif 2130 2131 // Update the floating point rounding in the team if required. 2132 propagateFPControl(team); 2133 2134 if (__kmp_tasking_mode != tskm_immediate_exec) { 2135 // Set master's task team to team's task team. Unless this is hot team, it 2136 // should be NULL. 2137 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2138 parent_team->t.t_task_team[master_th->th.th_task_state]); 2139 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2140 "%p, new task_team %p / team %p\n", 2141 __kmp_gtid_from_thread(master_th), 2142 master_th->th.th_task_team, parent_team, 2143 team->t.t_task_team[master_th->th.th_task_state], team)); 2144 2145 if (active_level || master_th->th.th_task_team) { 2146 // Take a memo of master's task_state 2147 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2148 if (master_th->th.th_task_state_top >= 2149 master_th->th.th_task_state_stack_sz) { // increase size 2150 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2151 kmp_uint8 *old_stack, *new_stack; 2152 kmp_uint32 i; 2153 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2154 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2155 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2156 } 2157 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2158 ++i) { // zero-init rest of stack 2159 new_stack[i] = 0; 2160 } 2161 old_stack = master_th->th.th_task_state_memo_stack; 2162 master_th->th.th_task_state_memo_stack = new_stack; 2163 master_th->th.th_task_state_stack_sz = new_size; 2164 __kmp_free(old_stack); 2165 } 2166 // Store master's task_state on stack 2167 master_th->th 2168 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2169 master_th->th.th_task_state; 2170 master_th->th.th_task_state_top++; 2171 #if KMP_NESTED_HOT_TEAMS 2172 if (master_th->th.th_hot_teams && 2173 active_level < __kmp_hot_teams_max_level && 2174 team == master_th->th.th_hot_teams[active_level].hot_team) { 2175 // Restore master's nested state if nested hot team 2176 master_th->th.th_task_state = 2177 master_th->th 2178 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2179 } else { 2180 #endif 2181 master_th->th.th_task_state = 0; 2182 #if KMP_NESTED_HOT_TEAMS 2183 } 2184 #endif 2185 } 2186 #if !KMP_NESTED_HOT_TEAMS 2187 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2188 (team == root->r.r_hot_team)); 2189 #endif 2190 } 2191 2192 KA_TRACE( 2193 20, 2194 
("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2195 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2196 team->t.t_nproc)); 2197 KMP_DEBUG_ASSERT(team != root->r.r_hot_team || 2198 (team->t.t_master_tid == 0 && 2199 (team->t.t_parent == root->r.r_root_team || 2200 team->t.t_parent->t.t_serialized))); 2201 KMP_MB(); 2202 2203 /* now, setup the arguments */ 2204 argv = (void **)team->t.t_argv; 2205 #if OMP_40_ENABLED 2206 if (ap) { 2207 #endif /* OMP_40_ENABLED */ 2208 for (i = argc - 1; i >= 0; --i) { 2209 // TODO: revert workaround for Intel(R) 64 tracker #96 2210 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 2211 void *new_argv = va_arg(*ap, void *); 2212 #else 2213 void *new_argv = va_arg(ap, void *); 2214 #endif 2215 KMP_CHECK_UPDATE(*argv, new_argv); 2216 argv++; 2217 } 2218 #if OMP_40_ENABLED 2219 } else { 2220 for (i = 0; i < argc; ++i) { 2221 // Get args from parent team for teams construct 2222 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2223 } 2224 } 2225 #endif /* OMP_40_ENABLED */ 2226 2227 /* now actually fork the threads */ 2228 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2229 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2230 root->r.r_active = TRUE; 2231 2232 __kmp_fork_team_threads(root, team, master_th, gtid); 2233 __kmp_setup_icv_copy(team, nthreads, 2234 &master_th->th.th_current_task->td_icvs, loc); 2235 2236 #if OMPT_SUPPORT 2237 master_th->th.ompt_thread_info.state = ompt_state_work_parallel; 2238 #endif 2239 2240 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2241 2242 #if USE_ITT_BUILD 2243 if (team->t.t_active_level == 1 // only report frames at level 1 2244 #if OMP_40_ENABLED 2245 && !master_th->th.th_teams_microtask // not in teams construct 2246 #endif /* OMP_40_ENABLED */ 2247 ) { 2248 #if USE_ITT_NOTIFY 2249 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2250 (__kmp_forkjoin_frames_mode == 3 || 2251 __kmp_forkjoin_frames_mode == 1)) { 2252 kmp_uint64 tmp_time = 0; 2253 if (__itt_get_timestamp_ptr) 2254 tmp_time = __itt_get_timestamp(); 2255 // Internal fork - report frame begin 2256 master_th->th.th_frame_time = tmp_time; 2257 if (__kmp_forkjoin_frames_mode == 3) 2258 team->t.t_region_time = tmp_time; 2259 } else 2260 // only one notification scheme (either "submit" or "forking/joined", not both) 2261 #endif /* USE_ITT_NOTIFY */ 2262 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2263 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2264 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 
2265 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2266 } 2267 } 2268 #endif /* USE_ITT_BUILD */ 2269 2270 /* now go on and do the work */ 2271 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2272 KMP_MB(); 2273 KF_TRACE(10, 2274 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2275 root, team, master_th, gtid)); 2276 2277 #if USE_ITT_BUILD 2278 if (__itt_stack_caller_create_ptr) { 2279 team->t.t_stack_id = 2280 __kmp_itt_stack_caller_create(); // create new stack stitching id 2281 // before entering fork barrier 2282 } 2283 #endif /* USE_ITT_BUILD */ 2284 2285 #if OMP_40_ENABLED 2286 // AC: skip __kmp_internal_fork at teams construct, let only master 2287 // threads execute 2288 if (ap) 2289 #endif /* OMP_40_ENABLED */ 2290 { 2291 __kmp_internal_fork(loc, gtid, team); 2292 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2293 "master_th=%p, gtid=%d\n", 2294 root, team, master_th, gtid)); 2295 } 2296 2297 if (call_context == fork_context_gnu) { 2298 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2299 return TRUE; 2300 } 2301 2302 /* Invoke microtask for MASTER thread */ 2303 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2304 team->t.t_id, team->t.t_pkfn)); 2305 } // END of timer KMP_fork_call block 2306 2307 if (!team->t.t_invoke(gtid)) { 2308 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2309 } 2310 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2311 team->t.t_id, team->t.t_pkfn)); 2312 KMP_MB(); /* Flush all pending memory write invalidates. */ 2313 2314 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2315 2316 #if OMPT_SUPPORT 2317 if (ompt_enabled.enabled) { 2318 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2319 } 2320 #endif 2321 2322 return TRUE; 2323 } 2324 2325 #if OMPT_SUPPORT 2326 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2327 kmp_team_t *team) { 2328 // restore state outside the region 2329 thread->th.ompt_thread_info.state = 2330 ((team->t.t_serialized) ? 
ompt_state_work_serial 2331 : ompt_state_work_parallel); 2332 } 2333 2334 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2335 kmp_team_t *team, ompt_data_t *parallel_data, 2336 fork_context_e fork_context, void *codeptr) { 2337 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2338 if (ompt_enabled.ompt_callback_parallel_end) { 2339 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2340 parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context), 2341 codeptr); 2342 } 2343 2344 task_info->frame.enter_frame = ompt_data_none; 2345 __kmp_join_restore_state(thread, team); 2346 } 2347 #endif 2348 2349 void __kmp_join_call(ident_t *loc, int gtid 2350 #if OMPT_SUPPORT 2351 , 2352 enum fork_context_e fork_context 2353 #endif 2354 #if OMP_40_ENABLED 2355 , 2356 int exit_teams 2357 #endif /* OMP_40_ENABLED */ 2358 ) { 2359 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2360 kmp_team_t *team; 2361 kmp_team_t *parent_team; 2362 kmp_info_t *master_th; 2363 kmp_root_t *root; 2364 int master_active; 2365 int i; 2366 2367 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2368 2369 /* setup current data */ 2370 master_th = __kmp_threads[gtid]; 2371 root = master_th->th.th_root; 2372 team = master_th->th.th_team; 2373 parent_team = team->t.t_parent; 2374 2375 master_th->th.th_ident = loc; 2376 2377 #if OMPT_SUPPORT 2378 if (ompt_enabled.enabled) { 2379 master_th->th.ompt_thread_info.state = ompt_state_overhead; 2380 } 2381 #endif 2382 2383 #if KMP_DEBUG 2384 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2385 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2386 "th_task_team = %p\n", 2387 __kmp_gtid_from_thread(master_th), team, 2388 team->t.t_task_team[master_th->th.th_task_state], 2389 master_th->th.th_task_team)); 2390 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2391 team->t.t_task_team[master_th->th.th_task_state]); 2392 } 2393 #endif 2394 2395 if (team->t.t_serialized) { 2396 #if OMP_40_ENABLED 2397 if (master_th->th.th_teams_microtask) { 2398 // We are in teams construct 2399 int level = team->t.t_level; 2400 int tlevel = master_th->th.th_teams_level; 2401 if (level == tlevel) { 2402 // AC: we haven't incremented it earlier at start of teams construct, 2403 // so do it here - at the end of teams construct 2404 team->t.t_level++; 2405 } else if (level == tlevel + 1) { 2406 // AC: we are exiting parallel inside teams, need to increment 2407 // serialization in order to restore it in the next call to 2408 // __kmpc_end_serialized_parallel 2409 team->t.t_serialized++; 2410 } 2411 } 2412 #endif /* OMP_40_ENABLED */ 2413 __kmpc_end_serialized_parallel(loc, gtid); 2414 2415 #if OMPT_SUPPORT 2416 if (ompt_enabled.enabled) { 2417 __kmp_join_restore_state(master_th, parent_team); 2418 } 2419 #endif 2420 2421 return; 2422 } 2423 2424 master_active = team->t.t_master_active; 2425 2426 #if OMP_40_ENABLED 2427 if (!exit_teams) 2428 #endif /* OMP_40_ENABLED */ 2429 { 2430 // AC: No barrier for internal teams at exit from teams construct. 2431 // But there is barrier for external team (league). 
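// __kmp_internal_join runs the join barrier for this team: the master waits
// until every worker has finished the microtask (and any tasks bound to the
// region) before the team below is shrunk, freed, or kept hot for reuse.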
2432 __kmp_internal_join(loc, gtid, team); 2433 } 2434 #if OMP_40_ENABLED 2435 else { 2436 master_th->th.th_task_state = 2437 0; // AC: no tasking in teams (out of any parallel) 2438 } 2439 #endif /* OMP_40_ENABLED */ 2440 2441 KMP_MB(); 2442 2443 #if OMPT_SUPPORT 2444 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2445 void *codeptr = team->t.ompt_team_info.master_return_address; 2446 #endif 2447 2448 #if USE_ITT_BUILD 2449 if (__itt_stack_caller_create_ptr) { 2450 __kmp_itt_stack_caller_destroy( 2451 (__itt_caller)team->t 2452 .t_stack_id); // destroy the stack stitching id after join barrier 2453 } 2454 2455 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2456 if (team->t.t_active_level == 1 2457 #if OMP_40_ENABLED 2458 && !master_th->th.th_teams_microtask /* not in teams construct */ 2459 #endif /* OMP_40_ENABLED */ 2460 ) { 2461 master_th->th.th_ident = loc; 2462 // only one notification scheme (either "submit" or "forking/joined", not 2463 // both) 2464 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2465 __kmp_forkjoin_frames_mode == 3) 2466 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2467 master_th->th.th_frame_time, 0, loc, 2468 master_th->th.th_team_nproc, 1); 2469 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2470 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2471 __kmp_itt_region_joined(gtid); 2472 } // active_level == 1 2473 #endif /* USE_ITT_BUILD */ 2474 2475 #if OMP_40_ENABLED 2476 if (master_th->th.th_teams_microtask && !exit_teams && 2477 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2478 team->t.t_level == master_th->th.th_teams_level + 1) { 2479 // AC: We need to leave the team structure intact at the end of parallel 2480 // inside the teams construct, so that at the next parallel same (hot) team 2481 // works, only adjust nesting levels 2482 2483 /* Decrement our nested depth level */ 2484 team->t.t_level--; 2485 team->t.t_active_level--; 2486 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2487 2488 /* Restore number of threads in the team if needed */ 2489 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2490 int old_num = master_th->th.th_team_nproc; 2491 int new_num = master_th->th.th_teams_size.nth; 2492 kmp_info_t **other_threads = team->t.t_threads; 2493 team->t.t_nproc = new_num; 2494 for (i = 0; i < old_num; ++i) { 2495 other_threads[i]->th.th_team_nproc = new_num; 2496 } 2497 // Adjust states of non-used threads of the team 2498 for (i = old_num; i < new_num; ++i) { 2499 // Re-initialize thread's barrier data. 
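// (These threads sat out the just-finished inner parallel, so their cached
// b_arrived counters are stale; resynchronize them with the team's barrier
// state before the threads are used again.)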
2500 int b; 2501 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2502 for (b = 0; b < bs_last_barrier; ++b) { 2503 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2504 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2505 #if USE_DEBUGGER 2506 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2507 #endif 2508 } 2509 if (__kmp_tasking_mode != tskm_immediate_exec) { 2510 // Synchronize thread's task state 2511 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2512 } 2513 } 2514 } 2515 2516 #if OMPT_SUPPORT 2517 if (ompt_enabled.enabled) { 2518 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2519 codeptr); 2520 } 2521 #endif 2522 2523 return; 2524 } 2525 #endif /* OMP_40_ENABLED */ 2526 2527 /* do cleanup and restore the parent team */ 2528 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2529 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2530 2531 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2532 2533 /* jc: The following lock has instructions with REL and ACQ semantics, 2534 separating the parallel user code called in this parallel region 2535 from the serial user code called after this function returns. */ 2536 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2537 2538 #if OMP_40_ENABLED 2539 if (!master_th->th.th_teams_microtask || 2540 team->t.t_level > master_th->th.th_teams_level) 2541 #endif /* OMP_40_ENABLED */ 2542 { 2543 /* Decrement our nested depth level */ 2544 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2545 } 2546 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2547 2548 #if OMPT_SUPPORT 2549 if (ompt_enabled.enabled) { 2550 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2551 if (ompt_enabled.ompt_callback_implicit_task) { 2552 int ompt_team_size = team->t.t_nproc; 2553 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2554 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2555 OMPT_CUR_TASK_INFO(master_th)->thread_num, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 2556 } 2557 2558 task_info->frame.exit_frame = ompt_data_none; 2559 task_info->task_data = ompt_data_none; 2560 } 2561 #endif 2562 2563 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2564 master_th, team)); 2565 __kmp_pop_current_task_from_thread(master_th); 2566 2567 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 2568 // Restore master thread's partition. 2569 master_th->th.th_first_place = team->t.t_first_place; 2570 master_th->th.th_last_place = team->t.t_last_place; 2571 #endif /* OMP_40_ENABLED */ 2572 #if OMP_50_ENABLED 2573 master_th->th.th_def_allocator = team->t.t_def_allocator; 2574 #endif 2575 2576 updateHWFPControl(team); 2577 2578 if (root->r.r_active != master_active) 2579 root->r.r_active = master_active; 2580 2581 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2582 master_th)); // this will free worker threads 2583 2584 /* this race was fun to find. make sure the following is in the critical 2585 region otherwise assertions may fail occasionally since the old team may be 2586 reallocated and the hierarchy appears inconsistent. it is actually safe to 2587 run and won't cause any bugs, but will cause those assertion failures. 
it's 2588 only one deref&assign so might as well put this in the critical region */ 2589 master_th->th.th_team = parent_team; 2590 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2591 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2592 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2593 2594 /* restore serialized team, if need be */ 2595 if (parent_team->t.t_serialized && 2596 parent_team != master_th->th.th_serial_team && 2597 parent_team != root->r.r_root_team) { 2598 __kmp_free_team(root, 2599 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2600 master_th->th.th_serial_team = parent_team; 2601 } 2602 2603 if (__kmp_tasking_mode != tskm_immediate_exec) { 2604 if (master_th->th.th_task_state_top > 2605 0) { // Restore task state from memo stack 2606 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2607 // Remember master's state if we re-use this nested hot team 2608 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2609 master_th->th.th_task_state; 2610 --master_th->th.th_task_state_top; // pop 2611 // Now restore state at this level 2612 master_th->th.th_task_state = 2613 master_th->th 2614 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2615 } 2616 // Copy the task team from the parent team to the master thread 2617 master_th->th.th_task_team = 2618 parent_team->t.t_task_team[master_th->th.th_task_state]; 2619 KA_TRACE(20, 2620 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2621 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2622 parent_team)); 2623 } 2624 2625 // TODO: GEH - cannot do this assertion because root thread not set up as 2626 // executing 2627 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2628 master_th->th.th_current_task->td_flags.executing = 1; 2629 2630 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2631 2632 #if OMPT_SUPPORT 2633 if (ompt_enabled.enabled) { 2634 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2635 codeptr); 2636 } 2637 #endif 2638 2639 KMP_MB(); 2640 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2641 } 2642 2643 /* Check whether we should push an internal control record onto the 2644 serial team stack. If so, do it. 
*/ 2645 void __kmp_save_internal_controls(kmp_info_t *thread) { 2646 2647 if (thread->th.th_team != thread->th.th_serial_team) { 2648 return; 2649 } 2650 if (thread->th.th_team->t.t_serialized > 1) { 2651 int push = 0; 2652 2653 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2654 push = 1; 2655 } else { 2656 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2657 thread->th.th_team->t.t_serialized) { 2658 push = 1; 2659 } 2660 } 2661 if (push) { /* push a record on the serial team's stack */ 2662 kmp_internal_control_t *control = 2663 (kmp_internal_control_t *)__kmp_allocate( 2664 sizeof(kmp_internal_control_t)); 2665 2666 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2667 2668 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2669 2670 control->next = thread->th.th_team->t.t_control_stack_top; 2671 thread->th.th_team->t.t_control_stack_top = control; 2672 } 2673 } 2674 } 2675 2676 /* Changes set_nproc */ 2677 void __kmp_set_num_threads(int new_nth, int gtid) { 2678 kmp_info_t *thread; 2679 kmp_root_t *root; 2680 2681 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2682 KMP_DEBUG_ASSERT(__kmp_init_serial); 2683 2684 if (new_nth < 1) 2685 new_nth = 1; 2686 else if (new_nth > __kmp_max_nth) 2687 new_nth = __kmp_max_nth; 2688 2689 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2690 thread = __kmp_threads[gtid]; 2691 if (thread->th.th_current_task->td_icvs.nproc == new_nth) 2692 return; // nothing to do 2693 2694 __kmp_save_internal_controls(thread); 2695 2696 set__nproc(thread, new_nth); 2697 2698 // If this omp_set_num_threads() call will cause the hot team size to be 2699 // reduced (in the absence of a num_threads clause), then reduce it now, 2700 // rather than waiting for the next parallel region. 2701 root = thread->th.th_root; 2702 if (__kmp_init_parallel && (!root->r.r_active) && 2703 (root->r.r_hot_team->t.t_nproc > new_nth) 2704 #if KMP_NESTED_HOT_TEAMS 2705 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2706 #endif 2707 ) { 2708 kmp_team_t *hot_team = root->r.r_hot_team; 2709 int f; 2710 2711 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2712 2713 // Release the extra threads we don't need any more. 2714 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2715 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2716 if (__kmp_tasking_mode != tskm_immediate_exec) { 2717 // When decreasing team size, threads no longer in the team should unref 2718 // task team. 2719 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2720 } 2721 __kmp_free_thread(hot_team->t.t_threads[f]); 2722 hot_team->t.t_threads[f] = NULL; 2723 } 2724 hot_team->t.t_nproc = new_nth; 2725 #if KMP_NESTED_HOT_TEAMS 2726 if (thread->th.th_hot_teams) { 2727 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2728 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2729 } 2730 #endif 2731 2732 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2733 2734 // Update the t_nproc field in the threads that are still active. 
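// (The surviving threads keep their team pointers; only their cached
// th_team_nproc copy has to be brought in line with the reduced size.)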
2735 for (f = 0; f < new_nth; f++) { 2736 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2737 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2738 } 2739 // Special flag in case omp_set_num_threads() call 2740 hot_team->t.t_size_changed = -1; 2741 } 2742 } 2743 2744 /* Changes max_active_levels */ 2745 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2746 kmp_info_t *thread; 2747 2748 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2749 "%d = (%d)\n", 2750 gtid, max_active_levels)); 2751 KMP_DEBUG_ASSERT(__kmp_init_serial); 2752 2753 // validate max_active_levels 2754 if (max_active_levels < 0) { 2755 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2756 // We ignore this call if the user has specified a negative value. 2757 // The current setting won't be changed. The last valid setting will be 2758 // used. A warning will be issued (if warnings are allowed as controlled by 2759 // the KMP_WARNINGS env var). 2760 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2761 "max_active_levels for thread %d = (%d)\n", 2762 gtid, max_active_levels)); 2763 return; 2764 } 2765 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2766 // it's OK, the max_active_levels is within the valid range: [ 0; 2767 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2768 // We allow a zero value. (implementation defined behavior) 2769 } else { 2770 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2771 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2772 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2773 // Current upper limit is MAX_INT. (implementation defined behavior) 2774 // If the input exceeds the upper limit, we correct the input to be the 2775 // upper limit. (implementation defined behavior) 2776 // Actually, the flow should never get here until we use MAX_INT limit. 2777 } 2778 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2779 "max_active_levels for thread %d = (%d)\n", 2780 gtid, max_active_levels)); 2781 2782 thread = __kmp_threads[gtid]; 2783 2784 __kmp_save_internal_controls(thread); 2785 2786 set__max_active_levels(thread, max_active_levels); 2787 } 2788 2789 /* Gets max_active_levels */ 2790 int __kmp_get_max_active_levels(int gtid) { 2791 kmp_info_t *thread; 2792 2793 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2794 KMP_DEBUG_ASSERT(__kmp_init_serial); 2795 2796 thread = __kmp_threads[gtid]; 2797 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2798 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2799 "curtask_maxaclevel=%d\n", 2800 gtid, thread->th.th_current_task, 2801 thread->th.th_current_task->td_icvs.max_active_levels)); 2802 return thread->th.th_current_task->td_icvs.max_active_levels; 2803 } 2804 2805 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2806 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2807 kmp_info_t *thread; 2808 // kmp_team_t *team; 2809 2810 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2811 gtid, (int)kind, chunk)); 2812 KMP_DEBUG_ASSERT(__kmp_init_serial); 2813 2814 // Check if the kind parameter is valid, correct if needed. 
2815 // Valid parameters should fit in one of two intervals - standard or extended: 2816 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2817 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2818 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2819 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2820 // TODO: Hint needs attention in case we change the default schedule. 2821 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2822 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2823 __kmp_msg_null); 2824 kind = kmp_sched_default; 2825 chunk = 0; // ignore chunk value in case of bad kind 2826 } 2827 2828 thread = __kmp_threads[gtid]; 2829 2830 __kmp_save_internal_controls(thread); 2831 2832 if (kind < kmp_sched_upper_std) { 2833 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2834 // differ static chunked vs. unchunked: chunk should be invalid to 2835 // indicate unchunked schedule (which is the default) 2836 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2837 } else { 2838 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2839 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2840 } 2841 } else { 2842 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2843 // kmp_sched_lower - 2 ]; 2844 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2845 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2846 kmp_sched_lower - 2]; 2847 } 2848 if (kind == kmp_sched_auto || chunk < 1) { 2849 // ignore parameter chunk for schedule auto 2850 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2851 } else { 2852 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2853 } 2854 } 2855 2856 /* Gets def_sched_var ICV values */ 2857 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2858 kmp_info_t *thread; 2859 enum sched_type th_type; 2860 2861 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2862 KMP_DEBUG_ASSERT(__kmp_init_serial); 2863 2864 thread = __kmp_threads[gtid]; 2865 2866 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2867 2868 switch (th_type) { 2869 case kmp_sch_static: 2870 case kmp_sch_static_greedy: 2871 case kmp_sch_static_balanced: 2872 *kind = kmp_sched_static; 2873 *chunk = 0; // chunk was not set, try to show this fact via zero value 2874 return; 2875 case kmp_sch_static_chunked: 2876 *kind = kmp_sched_static; 2877 break; 2878 case kmp_sch_dynamic_chunked: 2879 *kind = kmp_sched_dynamic; 2880 break; 2881 case kmp_sch_guided_chunked: 2882 case kmp_sch_guided_iterative_chunked: 2883 case kmp_sch_guided_analytical_chunked: 2884 *kind = kmp_sched_guided; 2885 break; 2886 case kmp_sch_auto: 2887 *kind = kmp_sched_auto; 2888 break; 2889 case kmp_sch_trapezoidal: 2890 *kind = kmp_sched_trapezoidal; 2891 break; 2892 #if KMP_STATIC_STEAL_ENABLED 2893 case kmp_sch_static_steal: 2894 *kind = kmp_sched_static_steal; 2895 break; 2896 #endif 2897 default: 2898 KMP_FATAL(UnknownSchedulingType, th_type); 2899 } 2900 2901 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2902 } 2903 2904 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2905 2906 int ii, dd; 2907 kmp_team_t *team; 2908 kmp_info_t *thr; 2909 2910 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2911 KMP_DEBUG_ASSERT(__kmp_init_serial); 2912 2913 // validate level 2914 if (level == 0) 2915 return 0; 2916 if (level < 0) 2917 return -1; 2918 thr = __kmp_threads[gtid]; 2919 team = thr->th.th_team; 2920 ii 
= team->t.t_level; 2921 if (level > ii) 2922 return -1; 2923 2924 #if OMP_40_ENABLED 2925 if (thr->th.th_teams_microtask) { 2926 // AC: we are in teams region where multiple nested teams have same level 2927 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2928 if (level <= 2929 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2930 KMP_DEBUG_ASSERT(ii >= tlevel); 2931 // AC: As we need to pass by the teams league, we need to artificially 2932 // increase ii 2933 if (ii == tlevel) { 2934 ii += 2; // three teams have same level 2935 } else { 2936 ii++; // two teams have same level 2937 } 2938 } 2939 } 2940 #endif 2941 2942 if (ii == level) 2943 return __kmp_tid_from_gtid(gtid); 2944 2945 dd = team->t.t_serialized; 2946 level++; 2947 while (ii > level) { 2948 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2949 } 2950 if ((team->t.t_serialized) && (!dd)) { 2951 team = team->t.t_parent; 2952 continue; 2953 } 2954 if (ii > level) { 2955 team = team->t.t_parent; 2956 dd = team->t.t_serialized; 2957 ii--; 2958 } 2959 } 2960 2961 return (dd > 1) ? (0) : (team->t.t_master_tid); 2962 } 2963 2964 int __kmp_get_team_size(int gtid, int level) { 2965 2966 int ii, dd; 2967 kmp_team_t *team; 2968 kmp_info_t *thr; 2969 2970 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2971 KMP_DEBUG_ASSERT(__kmp_init_serial); 2972 2973 // validate level 2974 if (level == 0) 2975 return 1; 2976 if (level < 0) 2977 return -1; 2978 thr = __kmp_threads[gtid]; 2979 team = thr->th.th_team; 2980 ii = team->t.t_level; 2981 if (level > ii) 2982 return -1; 2983 2984 #if OMP_40_ENABLED 2985 if (thr->th.th_teams_microtask) { 2986 // AC: we are in teams region where multiple nested teams have same level 2987 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2988 if (level <= 2989 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2990 KMP_DEBUG_ASSERT(ii >= tlevel); 2991 // AC: As we need to pass by the teams league, we need to artificially 2992 // increase ii 2993 if (ii == tlevel) { 2994 ii += 2; // three teams have same level 2995 } else { 2996 ii++; // two teams have same level 2997 } 2998 } 2999 } 3000 #endif 3001 3002 while (ii > level) { 3003 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 3004 } 3005 if (team->t.t_serialized && (!dd)) { 3006 team = team->t.t_parent; 3007 continue; 3008 } 3009 if (ii > level) { 3010 team = team->t.t_parent; 3011 ii--; 3012 } 3013 } 3014 3015 return team->t.t_nproc; 3016 } 3017 3018 kmp_r_sched_t __kmp_get_schedule_global() { 3019 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 3020 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 3021 // independently. So one can get the updated schedule here. 3022 3023 kmp_r_sched_t r_sched; 3024 3025 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 3026 // __kmp_guided. 
__kmp_sched should keep original value, so that user can set 3027 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 3028 // different roots (even in OMP 2.5) 3029 if (__kmp_sched == kmp_sch_static) { 3030 // replace STATIC with more detailed schedule (balanced or greedy) 3031 r_sched.r_sched_type = __kmp_static; 3032 } else if (__kmp_sched == kmp_sch_guided_chunked) { 3033 // replace GUIDED with more detailed schedule (iterative or analytical) 3034 r_sched.r_sched_type = __kmp_guided; 3035 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 3036 r_sched.r_sched_type = __kmp_sched; 3037 } 3038 3039 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 3040 // __kmp_chunk may be wrong here (if it was not ever set) 3041 r_sched.chunk = KMP_DEFAULT_CHUNK; 3042 } else { 3043 r_sched.chunk = __kmp_chunk; 3044 } 3045 3046 return r_sched; 3047 } 3048 3049 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 3050 at least argc number of *t_argv entries for the requested team. */ 3051 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 3052 3053 KMP_DEBUG_ASSERT(team); 3054 if (!realloc || argc > team->t.t_max_argc) { 3055 3056 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 3057 "current entries=%d\n", 3058 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 3059 /* if previously allocated heap space for args, free them */ 3060 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3061 __kmp_free((void *)team->t.t_argv); 3062 3063 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3064 /* use unused space in the cache line for arguments */ 3065 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3066 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3067 "argv entries\n", 3068 team->t.t_id, team->t.t_max_argc)); 3069 team->t.t_argv = &team->t.t_inline_argv[0]; 3070 if (__kmp_storage_map) { 3071 __kmp_print_storage_map_gtid( 3072 -1, &team->t.t_inline_argv[0], 3073 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3074 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3075 team->t.t_id); 3076 } 3077 } else { 3078 /* allocate space for arguments in the heap */ 3079 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3080 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3081 : 2 * argc; 3082 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3083 "argv entries\n", 3084 team->t.t_id, team->t.t_max_argc)); 3085 team->t.t_argv = 3086 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3087 if (__kmp_storage_map) { 3088 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3089 &team->t.t_argv[team->t.t_max_argc], 3090 sizeof(void *) * team->t.t_max_argc, 3091 "team_%d.t_argv", team->t.t_id); 3092 } 3093 } 3094 } 3095 } 3096 3097 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3098 int i; 3099 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3100 team->t.t_threads = 3101 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3102 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3103 sizeof(dispatch_shared_info_t) * num_disp_buff); 3104 team->t.t_dispatch = 3105 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3106 team->t.t_implicit_task_taskdata = 3107 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3108 team->t.t_max_nproc = max_nth; 3109 3110 /* setup dispatch buffers */ 3111 for (i = 0; i < num_disp_buff; ++i) { 3112 team->t.t_disp_buffer[i].buffer_index = i; 3113 #if OMP_45_ENABLED 3114 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3115 #endif 3116 } 3117 } 3118 3119 static void __kmp_free_team_arrays(kmp_team_t *team) { 3120 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3121 int i; 3122 for (i = 0; i < team->t.t_max_nproc; ++i) { 3123 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3124 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3125 team->t.t_dispatch[i].th_disp_buffer = NULL; 3126 } 3127 } 3128 #if KMP_USE_HIER_SCHED 3129 __kmp_dispatch_free_hierarchies(team); 3130 #endif 3131 __kmp_free(team->t.t_threads); 3132 __kmp_free(team->t.t_disp_buffer); 3133 __kmp_free(team->t.t_dispatch); 3134 __kmp_free(team->t.t_implicit_task_taskdata); 3135 team->t.t_threads = NULL; 3136 team->t.t_disp_buffer = NULL; 3137 team->t.t_dispatch = NULL; 3138 team->t.t_implicit_task_taskdata = 0; 3139 } 3140 3141 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3142 kmp_info_t **oldThreads = team->t.t_threads; 3143 3144 __kmp_free(team->t.t_disp_buffer); 3145 __kmp_free(team->t.t_dispatch); 3146 __kmp_free(team->t.t_implicit_task_taskdata); 3147 __kmp_allocate_team_arrays(team, max_nth); 3148 3149 KMP_MEMCPY(team->t.t_threads, oldThreads, 3150 team->t.t_nproc * sizeof(kmp_info_t *)); 3151 3152 __kmp_free(oldThreads); 3153 } 3154 3155 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3156 3157 kmp_r_sched_t r_sched = 3158 __kmp_get_schedule_global(); // get current state of scheduling globals 3159 3160 #if OMP_40_ENABLED 3161 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3162 #endif /* OMP_40_ENABLED */ 3163 3164 kmp_internal_control_t g_icvs = { 3165 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3166 (kmp_int8)__kmp_dflt_nested, // int nested; //internal control 3167 // for nested parallelism (per thread) 3168 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3169 // adjustment of threads (per thread) 3170 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3171 // whether blocktime is explicitly set 3172 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3173 #if KMP_USE_MONITOR 3174 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3175 // intervals 3176 #endif 3177 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3178 // next parallel region (per thread) 3179 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3180 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3181 // for max_active_levels 3182 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3183 // {sched,chunk} pair 3184 #if OMP_40_ENABLED 3185 __kmp_nested_proc_bind.bind_types[0], 3186 __kmp_default_device, 3187 #endif /* OMP_40_ENABLED */ 3188 NULL // struct kmp_internal_control *next; 3189 }; 3190 3191 return 
g_icvs; 3192 } 3193 3194 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3195 3196 kmp_internal_control_t gx_icvs; 3197 gx_icvs.serial_nesting_level = 3198 0; // probably =team->t.t_serial like in save_inter_controls 3199 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3200 gx_icvs.next = NULL; 3201 3202 return gx_icvs; 3203 } 3204 3205 static void __kmp_initialize_root(kmp_root_t *root) { 3206 int f; 3207 kmp_team_t *root_team; 3208 kmp_team_t *hot_team; 3209 int hot_team_max_nth; 3210 kmp_r_sched_t r_sched = 3211 __kmp_get_schedule_global(); // get current state of scheduling globals 3212 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3213 KMP_DEBUG_ASSERT(root); 3214 KMP_ASSERT(!root->r.r_begin); 3215 3216 /* setup the root state structure */ 3217 __kmp_init_lock(&root->r.r_begin_lock); 3218 root->r.r_begin = FALSE; 3219 root->r.r_active = FALSE; 3220 root->r.r_in_parallel = 0; 3221 root->r.r_blocktime = __kmp_dflt_blocktime; 3222 root->r.r_nested = __kmp_dflt_nested; 3223 root->r.r_cg_nthreads = 1; 3224 3225 /* setup the root team for this task */ 3226 /* allocate the root team structure */ 3227 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3228 3229 root_team = 3230 __kmp_allocate_team(root, 3231 1, // new_nproc 3232 1, // max_nproc 3233 #if OMPT_SUPPORT 3234 ompt_data_none, // root parallel id 3235 #endif 3236 #if OMP_40_ENABLED 3237 __kmp_nested_proc_bind.bind_types[0], 3238 #endif 3239 &r_icvs, 3240 0 // argc 3241 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3242 ); 3243 #if USE_DEBUGGER 3244 // Non-NULL value should be assigned to make the debugger display the root 3245 // team. 3246 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3247 #endif 3248 3249 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3250 3251 root->r.r_root_team = root_team; 3252 root_team->t.t_control_stack_top = NULL; 3253 3254 /* initialize root team */ 3255 root_team->t.t_threads[0] = NULL; 3256 root_team->t.t_nproc = 1; 3257 root_team->t.t_serialized = 1; 3258 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3259 root_team->t.t_sched.sched = r_sched.sched; 3260 KA_TRACE( 3261 20, 3262 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3263 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3264 3265 /* setup the hot team for this task */ 3266 /* allocate the hot team structure */ 3267 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3268 3269 hot_team = 3270 __kmp_allocate_team(root, 3271 1, // new_nproc 3272 __kmp_dflt_team_nth_ub * 2, // max_nproc 3273 #if OMPT_SUPPORT 3274 ompt_data_none, // root parallel id 3275 #endif 3276 #if OMP_40_ENABLED 3277 __kmp_nested_proc_bind.bind_types[0], 3278 #endif 3279 &r_icvs, 3280 0 // argc 3281 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3282 ); 3283 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3284 3285 root->r.r_hot_team = hot_team; 3286 root_team->t.t_control_stack_top = NULL; 3287 3288 /* first-time initialization */ 3289 hot_team->t.t_parent = root_team; 3290 3291 /* initialize hot team */ 3292 hot_team_max_nth = hot_team->t.t_max_nproc; 3293 for (f = 0; f < hot_team_max_nth; ++f) { 3294 hot_team->t.t_threads[f] = NULL; 3295 } 3296 hot_team->t.t_nproc = 1; 3297 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3298 hot_team->t.t_sched.sched = r_sched.sched; 3299 hot_team->t.t_size_changed = 0; 3300 } 
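/* Note: each root (initial) thread set up above owns two teams. The root team
   is a permanent, serialized team of one representing the initial implicit
   task; the hot team is the reusable fork/join team whose thread array (sized
   to __kmp_dflt_team_nth_ub * 2 here) stays allocated between parallel regions
   so that repeated forks on the same root do not have to rebuild the team
   structure from scratch. */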
3301 3302 #ifdef KMP_DEBUG 3303 3304 typedef struct kmp_team_list_item { 3305 kmp_team_p const *entry; 3306 struct kmp_team_list_item *next; 3307 } kmp_team_list_item_t; 3308 typedef kmp_team_list_item_t *kmp_team_list_t; 3309 3310 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3311 kmp_team_list_t list, // List of teams. 3312 kmp_team_p const *team // Team to add. 3313 ) { 3314 3315 // List must terminate with item where both entry and next are NULL. 3316 // Team is added to the list only once. 3317 // List is sorted in ascending order by team id. 3318 // Team id is *not* a key. 3319 3320 kmp_team_list_t l; 3321 3322 KMP_DEBUG_ASSERT(list != NULL); 3323 if (team == NULL) { 3324 return; 3325 } 3326 3327 __kmp_print_structure_team_accum(list, team->t.t_parent); 3328 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3329 3330 // Search list for the team. 3331 l = list; 3332 while (l->next != NULL && l->entry != team) { 3333 l = l->next; 3334 } 3335 if (l->next != NULL) { 3336 return; // Team has been added before, exit. 3337 } 3338 3339 // Team is not found. Search list again for insertion point. 3340 l = list; 3341 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3342 l = l->next; 3343 } 3344 3345 // Insert team. 3346 { 3347 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3348 sizeof(kmp_team_list_item_t)); 3349 *item = *l; 3350 l->entry = team; 3351 l->next = item; 3352 } 3353 } 3354 3355 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3356 3357 ) { 3358 __kmp_printf("%s", title); 3359 if (team != NULL) { 3360 __kmp_printf("%2x %p\n", team->t.t_id, team); 3361 } else { 3362 __kmp_printf(" - (nil)\n"); 3363 } 3364 } 3365 3366 static void __kmp_print_structure_thread(char const *title, 3367 kmp_info_p const *thread) { 3368 __kmp_printf("%s", title); 3369 if (thread != NULL) { 3370 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3371 } else { 3372 __kmp_printf(" - (nil)\n"); 3373 } 3374 } 3375 3376 void __kmp_print_structure(void) { 3377 3378 kmp_team_list_t list; 3379 3380 // Initialize list of teams. 3381 list = 3382 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3383 list->entry = NULL; 3384 list->next = NULL; 3385 3386 __kmp_printf("\n------------------------------\nGlobal Thread " 3387 "Table\n------------------------------\n"); 3388 { 3389 int gtid; 3390 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3391 __kmp_printf("%2d", gtid); 3392 if (__kmp_threads != NULL) { 3393 __kmp_printf(" %p", __kmp_threads[gtid]); 3394 } 3395 if (__kmp_root != NULL) { 3396 __kmp_printf(" %p", __kmp_root[gtid]); 3397 } 3398 __kmp_printf("\n"); 3399 } 3400 } 3401 3402 // Print out __kmp_threads array. 
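// For each registered thread in __kmp_threads, dump its root, team, serial
// team, master, pool linkage, and related flags; each thread's team and serial
// team are also accumulated into the sorted team list that the "Teams" section
// below walks.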
3403 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3404 "----------\n"); 3405 if (__kmp_threads != NULL) { 3406 int gtid; 3407 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3408 kmp_info_t const *thread = __kmp_threads[gtid]; 3409 if (thread != NULL) { 3410 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3411 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3412 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3413 __kmp_print_structure_team(" Serial Team: ", 3414 thread->th.th_serial_team); 3415 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3416 __kmp_print_structure_thread(" Master: ", 3417 thread->th.th_team_master); 3418 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3419 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3420 #if OMP_40_ENABLED 3421 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3422 #endif 3423 __kmp_print_structure_thread(" Next in pool: ", 3424 thread->th.th_next_pool); 3425 __kmp_printf("\n"); 3426 __kmp_print_structure_team_accum(list, thread->th.th_team); 3427 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3428 } 3429 } 3430 } else { 3431 __kmp_printf("Threads array is not allocated.\n"); 3432 } 3433 3434 // Print out __kmp_root array. 3435 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3436 "--------\n"); 3437 if (__kmp_root != NULL) { 3438 int gtid; 3439 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3440 kmp_root_t const *root = __kmp_root[gtid]; 3441 if (root != NULL) { 3442 __kmp_printf("GTID %2d %p:\n", gtid, root); 3443 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3444 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3445 __kmp_print_structure_thread(" Uber Thread: ", 3446 root->r.r_uber_thread); 3447 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3448 __kmp_printf(" Nested?: %2d\n", root->r.r_nested); 3449 __kmp_printf(" In Parallel: %2d\n", 3450 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3451 __kmp_printf("\n"); 3452 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3453 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3454 } 3455 } 3456 } else { 3457 __kmp_printf("Ubers array is not allocated.\n"); 3458 } 3459 3460 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3461 "--------\n"); 3462 while (list->next != NULL) { 3463 kmp_team_p const *team = list->entry; 3464 int i; 3465 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3466 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3467 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3468 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3469 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3470 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3471 for (i = 0; i < team->t.t_nproc; ++i) { 3472 __kmp_printf(" Thread %2d: ", i); 3473 __kmp_print_structure_thread("", team->t.t_threads[i]); 3474 } 3475 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3476 __kmp_printf("\n"); 3477 list = list->next; 3478 } 3479 3480 // Print out __kmp_thread_pool and __kmp_team_pool. 
3481 __kmp_printf("\n------------------------------\nPools\n----------------------" 3482 "--------\n"); 3483 __kmp_print_structure_thread("Thread pool: ", 3484 CCAST(kmp_info_t *, __kmp_thread_pool)); 3485 __kmp_print_structure_team("Team pool: ", 3486 CCAST(kmp_team_t *, __kmp_team_pool)); 3487 __kmp_printf("\n"); 3488 3489 // Free team list. 3490 while (list != NULL) { 3491 kmp_team_list_item_t *item = list; 3492 list = list->next; 3493 KMP_INTERNAL_FREE(item); 3494 } 3495 } 3496 3497 #endif 3498 3499 //--------------------------------------------------------------------------- 3500 // Stuff for per-thread fast random number generator 3501 // Table of primes 3502 static const unsigned __kmp_primes[] = { 3503 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3504 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3505 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3506 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3507 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3508 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3509 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3510 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3511 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3512 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3513 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3514 3515 //--------------------------------------------------------------------------- 3516 // __kmp_get_random: Get a random number using a linear congruential method. 3517 unsigned short __kmp_get_random(kmp_info_t *thread) { 3518 unsigned x = thread->th.th_x; 3519 unsigned short r = x >> 16; 3520 3521 thread->th.th_x = x * thread->th.th_a + 1; 3522 3523 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3524 thread->th.th_info.ds.ds_tid, r)); 3525 3526 return r; 3527 } 3528 //-------------------------------------------------------- 3529 // __kmp_init_random: Initialize a random number generator 3530 void __kmp_init_random(kmp_info_t *thread) { 3531 unsigned seed = thread->th.th_info.ds.ds_tid; 3532 3533 thread->th.th_a = 3534 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3535 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3536 KA_TRACE(30, 3537 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3538 } 3539 3540 #if KMP_OS_WINDOWS 3541 /* reclaim array entries for root threads that are already dead, returns number 3542 * reclaimed */ 3543 static int __kmp_reclaim_dead_roots(void) { 3544 int i, r = 0; 3545 3546 for (i = 0; i < __kmp_threads_capacity; ++i) { 3547 if (KMP_UBER_GTID(i) && 3548 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3549 !__kmp_root[i] 3550 ->r.r_active) { // AC: reclaim only roots died in non-active state 3551 r += __kmp_unregister_root_other_thread(i); 3552 } 3553 } 3554 return r; 3555 } 3556 #endif 3557 3558 /* This function attempts to create free entries in __kmp_threads and 3559 __kmp_root, and returns the number of free entries generated. 3560 3561 For Windows* OS static library, the first mechanism used is to reclaim array 3562 entries for root threads that are already dead. 3563 3564 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3565 __kmp_root, with appropriate update to __kmp_threads_capacity. 
Array 3566 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3567 threadprivate cache array has been created. Synchronization with 3568 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 3569 3570 After any dead root reclamation, if the clipping value allows array expansion 3571 to result in the generation of a total of nNeed free slots, the function does 3572 that expansion. If not, nothing is done beyond the possible initial root 3573 thread reclamation. 3574 3575 If any argument is negative, the behavior is undefined. */ 3576 static int __kmp_expand_threads(int nNeed) { 3577 int added = 0; 3578 int minimumRequiredCapacity; 3579 int newCapacity; 3580 kmp_info_t **newThreads; 3581 kmp_root_t **newRoot; 3582 3583 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3584 // resizing __kmp_threads does not need additional protection if foreign 3585 // threads are present 3586 3587 #if KMP_OS_WINDOWS && !KMP_DYNAMIC_LIB 3588 /* only for Windows static library */ 3589 /* reclaim array entries for root threads that are already dead */ 3590 added = __kmp_reclaim_dead_roots(); 3591 3592 if (nNeed) { 3593 nNeed -= added; 3594 if (nNeed < 0) 3595 nNeed = 0; 3596 } 3597 #endif 3598 if (nNeed <= 0) 3599 return added; 3600 3601 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3602 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3603 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3604 // > __kmp_max_nth in one of two ways: 3605 // 3606 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3607 // may not be resused by another thread, so we may need to increase 3608 // __kmp_threads_capacity to __kmp_max_nth + 1. 3609 // 3610 // 2) New foreign root(s) are encountered. We always register new foreign 3611 // roots. This may cause a smaller # of threads to be allocated at 3612 // subsequent parallel regions, but the worker threads hang around (and 3613 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3614 // 3615 // Anyway, that is the reason for moving the check to see if 3616 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3617 // instead of having it performed here. -BB 3618 3619 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3620 3621 /* compute expansion headroom to check if we can expand */ 3622 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3623 /* possible expansion too small -- give up */ 3624 return added; 3625 } 3626 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3627 3628 newCapacity = __kmp_threads_capacity; 3629 do { 3630 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3631 : __kmp_sys_max_nth; 3632 } while (newCapacity < minimumRequiredCapacity); 3633 newThreads = (kmp_info_t **)__kmp_allocate( 3634 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3635 newRoot = 3636 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3637 KMP_MEMCPY(newThreads, __kmp_threads, 3638 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3639 KMP_MEMCPY(newRoot, __kmp_root, 3640 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3641 3642 kmp_info_t **temp_threads = __kmp_threads; 3643 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3644 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3645 __kmp_free(temp_threads); 3646 added += newCapacity - __kmp_threads_capacity; 3647 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3648 3649 if (newCapacity > __kmp_tp_capacity) { 3650 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3651 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3652 __kmp_threadprivate_resize_cache(newCapacity); 3653 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3654 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3655 } 3656 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3657 } 3658 3659 return added; 3660 } 3661 3662 /* Register the current thread as a root thread and obtain our gtid. We must 3663 have the __kmp_initz_lock held at this point. Argument TRUE only if are the 3664 thread that calls from __kmp_do_serial_initialize() */ 3665 int __kmp_register_root(int initial_thread) { 3666 kmp_info_t *root_thread; 3667 kmp_root_t *root; 3668 int gtid; 3669 int capacity; 3670 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3671 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3672 KMP_MB(); 3673 3674 /* 2007-03-02: 3675 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3676 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3677 work as expected -- it may return false (that means there is at least one 3678 empty slot in __kmp_threads array), but it is possible the only free slot 3679 is #0, which is reserved for initial thread and so cannot be used for this 3680 one. Following code workarounds this bug. 3681 3682 However, right solution seems to be not reserving slot #0 for initial 3683 thread because: 3684 (1) there is no magic in slot #0, 3685 (2) we cannot detect initial thread reliably (the first thread which does 3686 serial initialization may be not a real initial thread). 3687 */ 3688 capacity = __kmp_threads_capacity; 3689 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3690 --capacity; 3691 } 3692 3693 /* see if there are too many threads */ 3694 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3695 if (__kmp_tp_cached) { 3696 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3697 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3698 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3699 } else { 3700 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3701 __kmp_msg_null); 3702 } 3703 } 3704 3705 /* find an available thread slot */ 3706 /* Don't reassign the zero slot since we need that to only be used by initial 3707 thread */ 3708 for (gtid = (initial_thread ? 
0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; 3709 gtid++) 3710 ; 3711 KA_TRACE(1, 3712 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3713 KMP_ASSERT(gtid < __kmp_threads_capacity); 3714 3715 /* update global accounting */ 3716 __kmp_all_nth++; 3717 TCW_4(__kmp_nth, __kmp_nth + 1); 3718 3719 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3720 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3721 if (__kmp_adjust_gtid_mode) { 3722 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3723 if (TCR_4(__kmp_gtid_mode) != 2) { 3724 TCW_4(__kmp_gtid_mode, 2); 3725 } 3726 } else { 3727 if (TCR_4(__kmp_gtid_mode) != 1) { 3728 TCW_4(__kmp_gtid_mode, 1); 3729 } 3730 } 3731 } 3732 3733 #ifdef KMP_ADJUST_BLOCKTIME 3734 /* Adjust blocktime to zero if necessary */ 3735 /* Middle initialization might not have occurred yet */ 3736 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3737 if (__kmp_nth > __kmp_avail_proc) { 3738 __kmp_zero_bt = TRUE; 3739 } 3740 } 3741 #endif /* KMP_ADJUST_BLOCKTIME */ 3742 3743 /* setup this new hierarchy */ 3744 if (!(root = __kmp_root[gtid])) { 3745 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3746 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3747 } 3748 3749 #if KMP_STATS_ENABLED 3750 // Initialize stats as soon as possible (right after gtid assignment). 3751 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3752 __kmp_stats_thread_ptr->startLife(); 3753 KMP_SET_THREAD_STATE(SERIAL_REGION); 3754 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3755 #endif 3756 __kmp_initialize_root(root); 3757 3758 /* setup new root thread structure */ 3759 if (root->r.r_uber_thread) { 3760 root_thread = root->r.r_uber_thread; 3761 } else { 3762 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3763 if (__kmp_storage_map) { 3764 __kmp_print_thread_storage_map(root_thread, gtid); 3765 } 3766 root_thread->th.th_info.ds.ds_gtid = gtid; 3767 #if OMPT_SUPPORT 3768 root_thread->th.ompt_thread_info.thread_data = ompt_data_none; 3769 #endif 3770 root_thread->th.th_root = root; 3771 if (__kmp_env_consistency_check) { 3772 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3773 } 3774 #if USE_FAST_MEMORY 3775 __kmp_initialize_fast_memory(root_thread); 3776 #endif /* USE_FAST_MEMORY */ 3777 3778 #if KMP_USE_BGET 3779 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3780 __kmp_initialize_bget(root_thread); 3781 #endif 3782 __kmp_init_random(root_thread); // Initialize random number generator 3783 } 3784 3785 /* setup the serial team held in reserve by the root thread */ 3786 if (!root_thread->th.th_serial_team) { 3787 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3788 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3789 root_thread->th.th_serial_team = 3790 __kmp_allocate_team(root, 1, 1, 3791 #if OMPT_SUPPORT 3792 ompt_data_none, // root parallel id 3793 #endif 3794 #if OMP_40_ENABLED 3795 proc_bind_default, 3796 #endif 3797 &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3798 } 3799 KMP_ASSERT(root_thread->th.th_serial_team); 3800 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3801 root_thread->th.th_serial_team)); 3802 3803 /* drop root_thread into place */ 3804 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3805 3806 root->r.r_root_team->t.t_threads[0] = root_thread; 3807 root->r.r_hot_team->t.t_threads[0] = root_thread; 3808 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3809 // AC: the team created in reserve, not for 
execution (it is unused for now). 3810 root_thread->th.th_serial_team->t.t_serialized = 0; 3811 root->r.r_uber_thread = root_thread; 3812 3813 /* initialize the thread, get it ready to go */ 3814 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3815 TCW_4(__kmp_init_gtid, TRUE); 3816 3817 /* prepare the master thread for get_gtid() */ 3818 __kmp_gtid_set_specific(gtid); 3819 3820 #if USE_ITT_BUILD 3821 __kmp_itt_thread_name(gtid); 3822 #endif /* USE_ITT_BUILD */ 3823 3824 #ifdef KMP_TDATA_GTID 3825 __kmp_gtid = gtid; 3826 #endif 3827 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3828 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3829 3830 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3831 "plain=%u\n", 3832 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3833 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3834 KMP_INIT_BARRIER_STATE)); 3835 { // Initialize barrier data. 3836 int b; 3837 for (b = 0; b < bs_last_barrier; ++b) { 3838 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3839 #if USE_DEBUGGER 3840 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3841 #endif 3842 } 3843 } 3844 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3845 KMP_INIT_BARRIER_STATE); 3846 3847 #if KMP_AFFINITY_SUPPORTED 3848 #if OMP_40_ENABLED 3849 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3850 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3851 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3852 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3853 #endif 3854 if (TCR_4(__kmp_init_middle)) { 3855 __kmp_affinity_set_init_mask(gtid, TRUE); 3856 } 3857 #endif /* KMP_AFFINITY_SUPPORTED */ 3858 #if OMP_50_ENABLED 3859 root_thread->th.th_def_allocator = __kmp_def_allocator; 3860 root_thread->th.th_prev_level = 0; 3861 root_thread->th.th_prev_num_threads = 1; 3862 #endif 3863 3864 __kmp_root_counter++; 3865 3866 #if OMPT_SUPPORT 3867 if (!initial_thread && ompt_enabled.enabled) { 3868 3869 kmp_info_t *root_thread = ompt_get_thread(); 3870 3871 ompt_set_thread_state(root_thread, ompt_state_overhead); 3872 3873 if (ompt_enabled.ompt_callback_thread_begin) { 3874 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3875 ompt_thread_initial, __ompt_get_thread_data_internal()); 3876 } 3877 ompt_data_t *task_data; 3878 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 3879 if (ompt_enabled.ompt_callback_task_create) { 3880 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 3881 NULL, NULL, task_data, ompt_task_initial, 0, NULL); 3882 // initial task has nothing to return to 3883 } 3884 3885 ompt_set_thread_state(root_thread, ompt_state_work_serial); 3886 } 3887 #endif 3888 3889 KMP_MB(); 3890 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3891 3892 return gtid; 3893 } 3894 3895 #if KMP_NESTED_HOT_TEAMS 3896 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3897 const int max_level) { 3898 int i, n, nth; 3899 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3900 if (!hot_teams || !hot_teams[level].hot_team) { 3901 return 0; 3902 } 3903 KMP_DEBUG_ASSERT(level < max_level); 3904 kmp_team_t *team = hot_teams[level].hot_team; 3905 nth = hot_teams[level].hot_team_nth; 3906 n = nth - 1; // master is not freed 3907 if (level < max_level - 1) { 3908 for (i = 0; i < nth; ++i) { 3909 kmp_info_t *th = team->t.t_threads[i]; 3910 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3911 if (i > 0 && 
th->th.th_hot_teams) { 3912 __kmp_free(th->th.th_hot_teams); 3913 th->th.th_hot_teams = NULL; 3914 } 3915 } 3916 } 3917 __kmp_free_team(root, team, NULL); 3918 return n; 3919 } 3920 #endif 3921 3922 // Resets a root thread and clear its root and hot teams. 3923 // Returns the number of __kmp_threads entries directly and indirectly freed. 3924 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3925 kmp_team_t *root_team = root->r.r_root_team; 3926 kmp_team_t *hot_team = root->r.r_hot_team; 3927 int n = hot_team->t.t_nproc; 3928 int i; 3929 3930 KMP_DEBUG_ASSERT(!root->r.r_active); 3931 3932 root->r.r_root_team = NULL; 3933 root->r.r_hot_team = NULL; 3934 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3935 // before call to __kmp_free_team(). 3936 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3937 #if KMP_NESTED_HOT_TEAMS 3938 if (__kmp_hot_teams_max_level > 3939 0) { // need to free nested hot teams and their threads if any 3940 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3941 kmp_info_t *th = hot_team->t.t_threads[i]; 3942 if (__kmp_hot_teams_max_level > 1) { 3943 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3944 } 3945 if (th->th.th_hot_teams) { 3946 __kmp_free(th->th.th_hot_teams); 3947 th->th.th_hot_teams = NULL; 3948 } 3949 } 3950 } 3951 #endif 3952 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3953 3954 // Before we can reap the thread, we need to make certain that all other 3955 // threads in the teams that had this root as ancestor have stopped trying to 3956 // steal tasks. 3957 if (__kmp_tasking_mode != tskm_immediate_exec) { 3958 __kmp_wait_to_unref_task_teams(); 3959 } 3960 3961 #if KMP_OS_WINDOWS 3962 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3963 KA_TRACE( 3964 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3965 "\n", 3966 (LPVOID) & (root->r.r_uber_thread->th), 3967 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3968 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3969 #endif /* KMP_OS_WINDOWS */ 3970 3971 #if OMPT_SUPPORT 3972 if (ompt_enabled.ompt_callback_thread_end) { 3973 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3974 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3975 } 3976 #endif 3977 3978 TCW_4(__kmp_nth, 3979 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3980 root->r.r_cg_nthreads--; 3981 3982 __kmp_reap_thread(root->r.r_uber_thread, 1); 3983 3984 // We canot put root thread to __kmp_thread_pool, so we have to reap it istead 3985 // of freeing. 3986 root->r.r_uber_thread = NULL; 3987 /* mark root as no longer in use */ 3988 root->r.r_begin = FALSE; 3989 3990 return n; 3991 } 3992 3993 void __kmp_unregister_root_current_thread(int gtid) { 3994 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3995 /* this lock should be ok, since unregister_root_current_thread is never 3996 called during an abort, only during a normal close. 
furthermore, if you 3997 have the forkjoin lock, you should never try to get the initz lock */ 3998 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3999 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 4000 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 4001 "exiting T#%d\n", 4002 gtid)); 4003 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4004 return; 4005 } 4006 kmp_root_t *root = __kmp_root[gtid]; 4007 4008 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4009 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4010 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4011 KMP_ASSERT(root->r.r_active == FALSE); 4012 4013 KMP_MB(); 4014 4015 #if OMP_45_ENABLED 4016 kmp_info_t *thread = __kmp_threads[gtid]; 4017 kmp_team_t *team = thread->th.th_team; 4018 kmp_task_team_t *task_team = thread->th.th_task_team; 4019 4020 // we need to wait for the proxy tasks before finishing the thread 4021 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 4022 #if OMPT_SUPPORT 4023 // the runtime is shutting down so we won't report any events 4024 thread->th.ompt_thread_info.state = ompt_state_undefined; 4025 #endif 4026 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 4027 } 4028 #endif 4029 4030 __kmp_reset_root(gtid, root); 4031 4032 /* free up this thread slot */ 4033 __kmp_gtid_set_specific(KMP_GTID_DNE); 4034 #ifdef KMP_TDATA_GTID 4035 __kmp_gtid = KMP_GTID_DNE; 4036 #endif 4037 4038 KMP_MB(); 4039 KC_TRACE(10, 4040 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 4041 4042 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 4043 } 4044 4045 #if KMP_OS_WINDOWS 4046 /* __kmp_forkjoin_lock must be already held 4047 Unregisters a root thread that is not the current thread. Returns the number 4048 of __kmp_threads entries freed as a result. */ 4049 static int __kmp_unregister_root_other_thread(int gtid) { 4050 kmp_root_t *root = __kmp_root[gtid]; 4051 int r; 4052 4053 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 4054 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 4055 KMP_ASSERT(KMP_UBER_GTID(gtid)); 4056 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 4057 KMP_ASSERT(root->r.r_active == FALSE); 4058 4059 r = __kmp_reset_root(gtid, root); 4060 KC_TRACE(10, 4061 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 4062 return r; 4063 } 4064 #endif 4065 4066 #if KMP_DEBUG 4067 void __kmp_task_info() { 4068 4069 kmp_int32 gtid = __kmp_entry_gtid(); 4070 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4071 kmp_info_t *this_thr = __kmp_threads[gtid]; 4072 kmp_team_t *steam = this_thr->th.th_serial_team; 4073 kmp_team_t *team = this_thr->th.th_team; 4074 4075 __kmp_printf( 4076 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4077 "ptask=%p\n", 4078 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4079 team->t.t_implicit_task_taskdata[tid].td_parent); 4080 } 4081 #endif // KMP_DEBUG 4082 4083 /* TODO optimize with one big memclr, take out what isn't needed, split 4084 responsibility to workers as much as possible, and delay initialization of 4085 features as much as possible */ 4086 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4087 int tid, int gtid) { 4088 /* this_thr->th.th_info.ds.ds_gtid is setup in 4089 kmp_allocate_thread/create_worker. 
4090 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4091 kmp_info_t *master = team->t.t_threads[0]; 4092 KMP_DEBUG_ASSERT(this_thr != NULL); 4093 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4094 KMP_DEBUG_ASSERT(team); 4095 KMP_DEBUG_ASSERT(team->t.t_threads); 4096 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4097 KMP_DEBUG_ASSERT(master); 4098 KMP_DEBUG_ASSERT(master->th.th_root); 4099 4100 KMP_MB(); 4101 4102 TCW_SYNC_PTR(this_thr->th.th_team, team); 4103 4104 this_thr->th.th_info.ds.ds_tid = tid; 4105 this_thr->th.th_set_nproc = 0; 4106 if (__kmp_tasking_mode != tskm_immediate_exec) 4107 // When tasking is possible, threads are not safe to reap until they are 4108 // done tasking; this will be set when tasking code is exited in wait 4109 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4110 else // no tasking --> always safe to reap 4111 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4112 #if OMP_40_ENABLED 4113 this_thr->th.th_set_proc_bind = proc_bind_default; 4114 #if KMP_AFFINITY_SUPPORTED 4115 this_thr->th.th_new_place = this_thr->th.th_current_place; 4116 #endif 4117 #endif 4118 this_thr->th.th_root = master->th.th_root; 4119 4120 /* setup the thread's cache of the team structure */ 4121 this_thr->th.th_team_nproc = team->t.t_nproc; 4122 this_thr->th.th_team_master = master; 4123 this_thr->th.th_team_serialized = team->t.t_serialized; 4124 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4125 4126 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4127 4128 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4129 tid, gtid, this_thr, this_thr->th.th_current_task)); 4130 4131 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4132 team, tid, TRUE); 4133 4134 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4135 tid, gtid, this_thr, this_thr->th.th_current_task)); 4136 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4137 // __kmp_initialize_team()? 4138 4139 /* TODO no worksharing in speculative threads */ 4140 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4141 4142 this_thr->th.th_local.this_construct = 0; 4143 4144 if (!this_thr->th.th_pri_common) { 4145 this_thr->th.th_pri_common = 4146 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4147 if (__kmp_storage_map) { 4148 __kmp_print_storage_map_gtid( 4149 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4150 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4151 } 4152 this_thr->th.th_pri_head = NULL; 4153 } 4154 4155 /* Initialize dynamic dispatch */ 4156 { 4157 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4158 // Use team max_nproc since this will never change for the team. 4159 size_t disp_size = 4160 sizeof(dispatch_private_info_t) * 4161 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4162 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4163 team->t.t_max_nproc)); 4164 KMP_ASSERT(dispatch); 4165 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4166 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4167 4168 dispatch->th_disp_index = 0; 4169 #if OMP_45_ENABLED 4170 dispatch->th_doacross_buf_idx = 0; 4171 #endif 4172 if (!dispatch->th_disp_buffer) { 4173 dispatch->th_disp_buffer = 4174 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4175 4176 if (__kmp_storage_map) { 4177 __kmp_print_storage_map_gtid( 4178 gtid, &dispatch->th_disp_buffer[0], 4179 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4180 ? 
1 4181 : __kmp_dispatch_num_buffers], 4182 disp_size, "th_%d.th_dispatch.th_disp_buffer " 4183 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4184 gtid, team->t.t_id, gtid); 4185 } 4186 } else { 4187 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4188 } 4189 4190 dispatch->th_dispatch_pr_current = 0; 4191 dispatch->th_dispatch_sh_current = 0; 4192 4193 dispatch->th_deo_fcn = 0; /* ORDERED */ 4194 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4195 } 4196 4197 this_thr->th.th_next_pool = NULL; 4198 4199 if (!this_thr->th.th_task_state_memo_stack) { 4200 size_t i; 4201 this_thr->th.th_task_state_memo_stack = 4202 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4203 this_thr->th.th_task_state_top = 0; 4204 this_thr->th.th_task_state_stack_sz = 4; 4205 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4206 ++i) // zero init the stack 4207 this_thr->th.th_task_state_memo_stack[i] = 0; 4208 } 4209 4210 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4211 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4212 4213 KMP_MB(); 4214 } 4215 4216 /* allocate a new thread for the requesting team. this is only called from 4217 within a forkjoin critical section. we will first try to get an available 4218 thread from the thread pool. if none is available, we will fork a new one 4219 assuming we are able to create a new one. this should be assured, as the 4220 caller should check on this first. */ 4221 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4222 int new_tid) { 4223 kmp_team_t *serial_team; 4224 kmp_info_t *new_thr; 4225 int new_gtid; 4226 4227 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4228 KMP_DEBUG_ASSERT(root && team); 4229 #if !KMP_NESTED_HOT_TEAMS 4230 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4231 #endif 4232 KMP_MB(); 4233 4234 /* first, try to get one from the thread pool */ 4235 if (__kmp_thread_pool) { 4236 4237 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4238 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4239 if (new_thr == __kmp_thread_pool_insert_pt) { 4240 __kmp_thread_pool_insert_pt = NULL; 4241 } 4242 TCW_4(new_thr->th.th_in_pool, FALSE); 4243 // Don't touch th_active_in_pool or th_active. 4244 // The worker thread adjusts those flags as it sleeps/awakens. 4245 __kmp_thread_pool_nth--; 4246 4247 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4248 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4249 KMP_ASSERT(!new_thr->th.th_team); 4250 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4251 KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0); 4252 4253 /* setup the thread structure */ 4254 __kmp_initialize_info(new_thr, team, new_tid, 4255 new_thr->th.th_info.ds.ds_gtid); 4256 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4257 4258 TCW_4(__kmp_nth, __kmp_nth + 1); 4259 root->r.r_cg_nthreads++; 4260 4261 new_thr->th.th_task_state = 0; 4262 new_thr->th.th_task_state_top = 0; 4263 new_thr->th.th_task_state_stack_sz = 4; 4264 4265 #ifdef KMP_ADJUST_BLOCKTIME 4266 /* Adjust blocktime back to zero if necessary */ 4267 /* Middle initialization might not have occurred yet */ 4268 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4269 if (__kmp_nth > __kmp_avail_proc) { 4270 __kmp_zero_bt = TRUE; 4271 } 4272 } 4273 #endif /* KMP_ADJUST_BLOCKTIME */ 4274 4275 #if KMP_DEBUG 4276 // If thread entered pool via __kmp_free_thread, wait_flag should != 4277 // KMP_BARRIER_PARENT_FLAG. 
4278 int b; 4279 kmp_balign_t *balign = new_thr->th.th_bar; 4280 for (b = 0; b < bs_last_barrier; ++b) 4281 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4282 #endif 4283 4284 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4285 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4286 4287 KMP_MB(); 4288 return new_thr; 4289 } 4290 4291 /* no, well fork a new one */ 4292 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4293 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4294 4295 #if KMP_USE_MONITOR 4296 // If this is the first worker thread the RTL is creating, then also 4297 // launch the monitor thread. We try to do this as early as possible. 4298 if (!TCR_4(__kmp_init_monitor)) { 4299 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4300 if (!TCR_4(__kmp_init_monitor)) { 4301 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4302 TCW_4(__kmp_init_monitor, 1); 4303 __kmp_create_monitor(&__kmp_monitor); 4304 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4305 #if KMP_OS_WINDOWS 4306 // AC: wait until monitor has started. This is a fix for CQ232808. 4307 // The reason is that if the library is loaded/unloaded in a loop with 4308 // small (parallel) work in between, then there is high probability that 4309 // monitor thread started after the library shutdown. At shutdown it is 4310 // too late to cope with the problem, because when the master is in 4311 // DllMain (process detach) the monitor has no chances to start (it is 4312 // blocked), and master has no means to inform the monitor that the 4313 // library has gone, because all the memory which the monitor can access 4314 // is going to be released/reset. 4315 while (TCR_4(__kmp_init_monitor) < 2) { 4316 KMP_YIELD(TRUE); 4317 } 4318 KF_TRACE(10, ("after monitor thread has started\n")); 4319 #endif 4320 } 4321 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4322 } 4323 #endif 4324 4325 KMP_MB(); 4326 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) { 4327 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4328 } 4329 4330 /* allocate space for it. */ 4331 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4332 4333 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4334 4335 if (__kmp_storage_map) { 4336 __kmp_print_thread_storage_map(new_thr, new_gtid); 4337 } 4338 4339 // add the reserve serialized team, initialized from the team's master thread 4340 { 4341 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4342 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4343 new_thr->th.th_serial_team = serial_team = 4344 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4345 #if OMPT_SUPPORT 4346 ompt_data_none, // root parallel id 4347 #endif 4348 #if OMP_40_ENABLED 4349 proc_bind_default, 4350 #endif 4351 &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 4352 } 4353 KMP_ASSERT(serial_team); 4354 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4355 // execution (it is unused for now). 
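// Point the reserve serial team's single thread slot back at the new thread so
// the serial-team linkage is complete before the thread is initialized below.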
4356 serial_team->t.t_threads[0] = new_thr; 4357 KF_TRACE(10, 4358 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4359 new_thr)); 4360 4361 /* setup the thread structures */ 4362 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4363 4364 #if USE_FAST_MEMORY 4365 __kmp_initialize_fast_memory(new_thr); 4366 #endif /* USE_FAST_MEMORY */ 4367 4368 #if KMP_USE_BGET 4369 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4370 __kmp_initialize_bget(new_thr); 4371 #endif 4372 4373 __kmp_init_random(new_thr); // Initialize random number generator 4374 4375 /* Initialize these only once when thread is grabbed for a team allocation */ 4376 KA_TRACE(20, 4377 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4378 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4379 4380 int b; 4381 kmp_balign_t *balign = new_thr->th.th_bar; 4382 for (b = 0; b < bs_last_barrier; ++b) { 4383 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4384 balign[b].bb.team = NULL; 4385 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4386 balign[b].bb.use_oncore_barrier = 0; 4387 } 4388 4389 new_thr->th.th_spin_here = FALSE; 4390 new_thr->th.th_next_waiting = 0; 4391 #if KMP_OS_UNIX 4392 new_thr->th.th_blocking = false; 4393 #endif 4394 4395 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 4396 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4397 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4398 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4399 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4400 #endif 4401 #if OMP_50_ENABLED 4402 new_thr->th.th_def_allocator = __kmp_def_allocator; 4403 new_thr->th.th_prev_level = 0; 4404 new_thr->th.th_prev_num_threads = 1; 4405 #endif 4406 4407 TCW_4(new_thr->th.th_in_pool, FALSE); 4408 new_thr->th.th_active_in_pool = FALSE; 4409 TCW_4(new_thr->th.th_active, TRUE); 4410 4411 /* adjust the global counters */ 4412 __kmp_all_nth++; 4413 __kmp_nth++; 4414 4415 root->r.r_cg_nthreads++; 4416 4417 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4418 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4419 if (__kmp_adjust_gtid_mode) { 4420 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4421 if (TCR_4(__kmp_gtid_mode) != 2) { 4422 TCW_4(__kmp_gtid_mode, 2); 4423 } 4424 } else { 4425 if (TCR_4(__kmp_gtid_mode) != 1) { 4426 TCW_4(__kmp_gtid_mode, 1); 4427 } 4428 } 4429 } 4430 4431 #ifdef KMP_ADJUST_BLOCKTIME 4432 /* Adjust blocktime back to zero if necessary */ 4433 /* Middle initialization might not have occurred yet */ 4434 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4435 if (__kmp_nth > __kmp_avail_proc) { 4436 __kmp_zero_bt = TRUE; 4437 } 4438 } 4439 #endif /* KMP_ADJUST_BLOCKTIME */ 4440 4441 /* actually fork it and create the new worker thread */ 4442 KF_TRACE( 4443 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4444 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4445 KF_TRACE(10, 4446 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4447 4448 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4449 new_gtid)); 4450 KMP_MB(); 4451 return new_thr; 4452 } 4453 4454 /* Reinitialize team for reuse. 4455 The hot team code calls this case at every fork barrier, so EPCC barrier 4456 test are extremely sensitive to changes in it, esp. writes to the team 4457 struct, which cause a cache invalidation in all threads. 4458 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ 4459 static void __kmp_reinitialize_team(kmp_team_t *team, 4460 kmp_internal_control_t *new_icvs, 4461 ident_t *loc) { 4462 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4463 team->t.t_threads[0], team)); 4464 KMP_DEBUG_ASSERT(team && new_icvs); 4465 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4466 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4467 4468 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4469 // Copy ICVs to the master thread's implicit taskdata 4470 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4471 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4472 4473 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4474 team->t.t_threads[0], team)); 4475 } 4476 4477 /* Initialize the team data structure. 4478 This assumes the t_threads and t_max_nproc are already set. 4479 Also, we don't touch the arguments */ 4480 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4481 kmp_internal_control_t *new_icvs, 4482 ident_t *loc) { 4483 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4484 4485 /* verify */ 4486 KMP_DEBUG_ASSERT(team); 4487 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4488 KMP_DEBUG_ASSERT(team->t.t_threads); 4489 KMP_MB(); 4490 4491 team->t.t_master_tid = 0; /* not needed */ 4492 /* team->t.t_master_bar; not needed */ 4493 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4494 team->t.t_nproc = new_nproc; 4495 4496 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4497 team->t.t_next_pool = NULL; 4498 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4499 * up hot team */ 4500 4501 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4502 team->t.t_invoke = NULL; /* not needed */ 4503 4504 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4505 team->t.t_sched.sched = new_icvs->sched.sched; 4506 4507 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4508 team->t.t_fp_control_saved = FALSE; /* not needed */ 4509 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4510 team->t.t_mxcsr = 0; /* not needed */ 4511 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4512 4513 team->t.t_construct = 0; 4514 4515 team->t.t_ordered.dt.t_value = 0; 4516 team->t.t_master_active = FALSE; 4517 4518 memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t)); 4519 4520 #ifdef KMP_DEBUG 4521 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4522 #endif 4523 #if KMP_OS_WINDOWS 4524 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4525 #endif 4526 4527 team->t.t_control_stack_top = NULL; 4528 4529 __kmp_reinitialize_team(team, new_icvs, loc); 4530 4531 KMP_MB(); 4532 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4533 } 4534 4535 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 4536 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/ 4537 static void 4538 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4539 if (KMP_AFFINITY_CAPABLE()) { 4540 int status; 4541 if (old_mask != NULL) { 4542 status = __kmp_get_system_affinity(old_mask, TRUE); 4543 int error = errno; 4544 if (status != 0) { 4545 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4546 __kmp_msg_null); 4547 } 4548 } 4549 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4550 } 4551 } 4552 #endif 4553 4554 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 4555 4556 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4557 // It calculates the worker + master thread's partition based upon the parent 4558 // thread's partition, and binds each worker to a thread in their partition. 4559 // The master thread's partition should already include its current binding. 4560 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4561 // Copy the master thread's place partition to the team struct 4562 kmp_info_t *master_th = team->t.t_threads[0]; 4563 KMP_DEBUG_ASSERT(master_th != NULL); 4564 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4565 int first_place = master_th->th.th_first_place; 4566 int last_place = master_th->th.th_last_place; 4567 int masters_place = master_th->th.th_current_place; 4568 team->t.t_first_place = first_place; 4569 team->t.t_last_place = last_place; 4570 4571 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4572 "bound to place %d partition = [%d,%d]\n", 4573 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4574 team->t.t_id, masters_place, first_place, last_place)); 4575 4576 switch (proc_bind) { 4577 4578 case proc_bind_default: 4579 // serial teams might have the proc_bind policy set to proc_bind_default.
It 4580 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4581 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4582 break; 4583 4584 case proc_bind_master: { 4585 int f; 4586 int n_th = team->t.t_nproc; 4587 for (f = 1; f < n_th; f++) { 4588 kmp_info_t *th = team->t.t_threads[f]; 4589 KMP_DEBUG_ASSERT(th != NULL); 4590 th->th.th_first_place = first_place; 4591 th->th.th_last_place = last_place; 4592 th->th.th_new_place = masters_place; 4593 #if OMP_50_ENABLED 4594 if (__kmp_display_affinity && masters_place != th->th.th_current_place && 4595 team->t.t_display_affinity != 1) { 4596 team->t.t_display_affinity = 1; 4597 } 4598 #endif 4599 4600 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4601 "partition = [%d,%d]\n", 4602 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4603 f, masters_place, first_place, last_place)); 4604 } 4605 } break; 4606 4607 case proc_bind_close: { 4608 int f; 4609 int n_th = team->t.t_nproc; 4610 int n_places; 4611 if (first_place <= last_place) { 4612 n_places = last_place - first_place + 1; 4613 } else { 4614 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4615 } 4616 if (n_th <= n_places) { 4617 int place = masters_place; 4618 for (f = 1; f < n_th; f++) { 4619 kmp_info_t *th = team->t.t_threads[f]; 4620 KMP_DEBUG_ASSERT(th != NULL); 4621 4622 if (place == last_place) { 4623 place = first_place; 4624 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4625 place = 0; 4626 } else { 4627 place++; 4628 } 4629 th->th.th_first_place = first_place; 4630 th->th.th_last_place = last_place; 4631 th->th.th_new_place = place; 4632 #if OMP_50_ENABLED 4633 if (__kmp_display_affinity && place != th->th.th_current_place && 4634 team->t.t_display_affinity != 1) { 4635 team->t.t_display_affinity = 1; 4636 } 4637 #endif 4638 4639 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4640 "partition = [%d,%d]\n", 4641 __kmp_gtid_from_thread(team->t.t_threads[f]), 4642 team->t.t_id, f, place, first_place, last_place)); 4643 } 4644 } else { 4645 int S, rem, gap, s_count; 4646 S = n_th / n_places; 4647 s_count = 0; 4648 rem = n_th - (S * n_places); 4649 gap = rem > 0 ? 
n_places / rem : n_places; 4650 int place = masters_place; 4651 int gap_ct = gap; 4652 for (f = 0; f < n_th; f++) { 4653 kmp_info_t *th = team->t.t_threads[f]; 4654 KMP_DEBUG_ASSERT(th != NULL); 4655 4656 th->th.th_first_place = first_place; 4657 th->th.th_last_place = last_place; 4658 th->th.th_new_place = place; 4659 #if OMP_50_ENABLED 4660 if (__kmp_display_affinity && place != th->th.th_current_place && 4661 team->t.t_display_affinity != 1) { 4662 team->t.t_display_affinity = 1; 4663 } 4664 #endif 4665 s_count++; 4666 4667 if ((s_count == S) && rem && (gap_ct == gap)) { 4668 // do nothing, add an extra thread to place on next iteration 4669 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4670 // we added an extra thread to this place; move to next place 4671 if (place == last_place) { 4672 place = first_place; 4673 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4674 place = 0; 4675 } else { 4676 place++; 4677 } 4678 s_count = 0; 4679 gap_ct = 1; 4680 rem--; 4681 } else if (s_count == S) { // place full; don't add extra 4682 if (place == last_place) { 4683 place = first_place; 4684 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4685 place = 0; 4686 } else { 4687 place++; 4688 } 4689 gap_ct++; 4690 s_count = 0; 4691 } 4692 4693 KA_TRACE(100, 4694 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4695 "partition = [%d,%d]\n", 4696 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4697 th->th.th_new_place, first_place, last_place)); 4698 } 4699 KMP_DEBUG_ASSERT(place == masters_place); 4700 } 4701 } break; 4702 4703 case proc_bind_spread: { 4704 int f; 4705 int n_th = team->t.t_nproc; 4706 int n_places; 4707 int thidx; 4708 if (first_place <= last_place) { 4709 n_places = last_place - first_place + 1; 4710 } else { 4711 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4712 } 4713 if (n_th <= n_places) { 4714 int place = -1; 4715 4716 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4717 int S = n_places / n_th; 4718 int s_count, rem, gap, gap_ct; 4719 4720 place = masters_place; 4721 rem = n_places - n_th * S; 4722 gap = rem ? 
n_th / rem : 1; 4723 gap_ct = gap; 4724 thidx = n_th; 4725 if (update_master_only == 1) 4726 thidx = 1; 4727 for (f = 0; f < thidx; f++) { 4728 kmp_info_t *th = team->t.t_threads[f]; 4729 KMP_DEBUG_ASSERT(th != NULL); 4730 4731 th->th.th_first_place = place; 4732 th->th.th_new_place = place; 4733 #if OMP_50_ENABLED 4734 if (__kmp_display_affinity && place != th->th.th_current_place && 4735 team->t.t_display_affinity != 1) { 4736 team->t.t_display_affinity = 1; 4737 } 4738 #endif 4739 s_count = 1; 4740 while (s_count < S) { 4741 if (place == last_place) { 4742 place = first_place; 4743 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4744 place = 0; 4745 } else { 4746 place++; 4747 } 4748 s_count++; 4749 } 4750 if (rem && (gap_ct == gap)) { 4751 if (place == last_place) { 4752 place = first_place; 4753 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4754 place = 0; 4755 } else { 4756 place++; 4757 } 4758 rem--; 4759 gap_ct = 0; 4760 } 4761 th->th.th_last_place = place; 4762 gap_ct++; 4763 4764 if (place == last_place) { 4765 place = first_place; 4766 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4767 place = 0; 4768 } else { 4769 place++; 4770 } 4771 4772 KA_TRACE(100, 4773 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4774 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4775 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4776 f, th->th.th_new_place, th->th.th_first_place, 4777 th->th.th_last_place, __kmp_affinity_num_masks)); 4778 } 4779 } else { 4780 /* Having uniform space of available computation places I can create 4781 T partitions of round(P/T) size and put threads into the first 4782 place of each partition. */ 4783 double current = static_cast<double>(masters_place); 4784 double spacing = 4785 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4786 int first, last; 4787 kmp_info_t *th; 4788 4789 thidx = n_th + 1; 4790 if (update_master_only == 1) 4791 thidx = 1; 4792 for (f = 0; f < thidx; f++) { 4793 first = static_cast<int>(current); 4794 last = static_cast<int>(current + spacing) - 1; 4795 KMP_DEBUG_ASSERT(last >= first); 4796 if (first >= n_places) { 4797 if (masters_place) { 4798 first -= n_places; 4799 last -= n_places; 4800 if (first == (masters_place + 1)) { 4801 KMP_DEBUG_ASSERT(f == n_th); 4802 first--; 4803 } 4804 if (last == masters_place) { 4805 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4806 last--; 4807 } 4808 } else { 4809 KMP_DEBUG_ASSERT(f == n_th); 4810 first = 0; 4811 last = 0; 4812 } 4813 } 4814 if (last >= n_places) { 4815 last = (n_places - 1); 4816 } 4817 place = first; 4818 current += spacing; 4819 if (f < n_th) { 4820 KMP_DEBUG_ASSERT(0 <= first); 4821 KMP_DEBUG_ASSERT(n_places > first); 4822 KMP_DEBUG_ASSERT(0 <= last); 4823 KMP_DEBUG_ASSERT(n_places > last); 4824 KMP_DEBUG_ASSERT(last_place >= first_place); 4825 th = team->t.t_threads[f]; 4826 KMP_DEBUG_ASSERT(th); 4827 th->th.th_first_place = first; 4828 th->th.th_new_place = place; 4829 th->th.th_last_place = last; 4830 #if OMP_50_ENABLED 4831 if (__kmp_display_affinity && place != th->th.th_current_place && 4832 team->t.t_display_affinity != 1) { 4833 team->t.t_display_affinity = 1; 4834 } 4835 #endif 4836 KA_TRACE(100, 4837 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4838 "partition = [%d,%d], spacing = %.4f\n", 4839 __kmp_gtid_from_thread(team->t.t_threads[f]), 4840 team->t.t_id, f, th->th.th_new_place, 4841 th->th.th_first_place, th->th.th_last_place, spacing)); 4842 } 4843 } 4844 } 4845 
KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4846 } else { 4847 int S, rem, gap, s_count; 4848 S = n_th / n_places; 4849 s_count = 0; 4850 rem = n_th - (S * n_places); 4851 gap = rem > 0 ? n_places / rem : n_places; 4852 int place = masters_place; 4853 int gap_ct = gap; 4854 thidx = n_th; 4855 if (update_master_only == 1) 4856 thidx = 1; 4857 for (f = 0; f < thidx; f++) { 4858 kmp_info_t *th = team->t.t_threads[f]; 4859 KMP_DEBUG_ASSERT(th != NULL); 4860 4861 th->th.th_first_place = place; 4862 th->th.th_last_place = place; 4863 th->th.th_new_place = place; 4864 #if OMP_50_ENABLED 4865 if (__kmp_display_affinity && place != th->th.th_current_place && 4866 team->t.t_display_affinity != 1) { 4867 team->t.t_display_affinity = 1; 4868 } 4869 #endif 4870 s_count++; 4871 4872 if ((s_count == S) && rem && (gap_ct == gap)) { 4873 // do nothing, add an extra thread to place on next iteration 4874 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4875 // we added an extra thread to this place; move on to next place 4876 if (place == last_place) { 4877 place = first_place; 4878 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4879 place = 0; 4880 } else { 4881 place++; 4882 } 4883 s_count = 0; 4884 gap_ct = 1; 4885 rem--; 4886 } else if (s_count == S) { // place is full; don't add extra thread 4887 if (place == last_place) { 4888 place = first_place; 4889 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4890 place = 0; 4891 } else { 4892 place++; 4893 } 4894 gap_ct++; 4895 s_count = 0; 4896 } 4897 4898 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4899 "partition = [%d,%d]\n", 4900 __kmp_gtid_from_thread(team->t.t_threads[f]), 4901 team->t.t_id, f, th->th.th_new_place, 4902 th->th.th_first_place, th->th.th_last_place)); 4903 } 4904 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4905 } 4906 } break; 4907 4908 default: 4909 break; 4910 } 4911 4912 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4913 } 4914 4915 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */ 4916 4917 /* allocate a new team data structure to use. take one off of the free pool if 4918 available */ 4919 kmp_team_t * 4920 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4921 #if OMPT_SUPPORT 4922 ompt_data_t ompt_parallel_data, 4923 #endif 4924 #if OMP_40_ENABLED 4925 kmp_proc_bind_t new_proc_bind, 4926 #endif 4927 kmp_internal_control_t *new_icvs, 4928 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4929 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4930 int f; 4931 kmp_team_t *team; 4932 int use_hot_team = !root->r.r_active; 4933 int level = 0; 4934 4935 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4936 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4937 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4938 KMP_MB(); 4939 4940 #if KMP_NESTED_HOT_TEAMS 4941 kmp_hot_team_ptr_t *hot_teams; 4942 if (master) { 4943 team = master->th.th_team; 4944 level = team->t.t_active_level; 4945 if (master->th.th_teams_microtask) { // in teams construct? 
4946 if (master->th.th_teams_size.nteams > 1 && 4947 ( // #teams > 1 4948 team->t.t_pkfn == 4949 (microtask_t)__kmp_teams_master || // inner fork of the teams 4950 master->th.th_teams_level < 4951 team->t.t_level)) { // or nested parallel inside the teams 4952 ++level; // not increment if #teams==1, or for outer fork of the teams; 4953 // increment otherwise 4954 } 4955 } 4956 hot_teams = master->th.th_hot_teams; 4957 if (level < __kmp_hot_teams_max_level && hot_teams && 4958 hot_teams[level] 4959 .hot_team) { // hot team has already been allocated for given level 4960 use_hot_team = 1; 4961 } else { 4962 use_hot_team = 0; 4963 } 4964 } 4965 #endif 4966 // Optimization to use a "hot" team 4967 if (use_hot_team && new_nproc > 1) { 4968 KMP_DEBUG_ASSERT(new_nproc == max_nproc); 4969 #if KMP_NESTED_HOT_TEAMS 4970 team = hot_teams[level].hot_team; 4971 #else 4972 team = root->r.r_hot_team; 4973 #endif 4974 #if KMP_DEBUG 4975 if (__kmp_tasking_mode != tskm_immediate_exec) { 4976 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4977 "task_team[1] = %p before reinit\n", 4978 team->t.t_task_team[0], team->t.t_task_team[1])); 4979 } 4980 #endif 4981 4982 // Has the number of threads changed? 4983 /* Let's assume the most common case is that the number of threads is 4984 unchanged, and put that case first. */ 4985 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4986 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4987 // This case can mean that omp_set_num_threads() was called and the hot 4988 // team size was already reduced, so we check the special flag 4989 if (team->t.t_size_changed == -1) { 4990 team->t.t_size_changed = 1; 4991 } else { 4992 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4993 } 4994 4995 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4996 kmp_r_sched_t new_sched = new_icvs->sched; 4997 // set master's schedule as new run-time schedule 4998 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 4999 5000 __kmp_reinitialize_team(team, new_icvs, 5001 root->r.r_uber_thread->th.th_ident); 5002 5003 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 5004 team->t.t_threads[0], team)); 5005 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5006 5007 #if OMP_40_ENABLED 5008 #if KMP_AFFINITY_SUPPORTED 5009 if ((team->t.t_size_changed == 0) && 5010 (team->t.t_proc_bind == new_proc_bind)) { 5011 if (new_proc_bind == proc_bind_spread) { 5012 __kmp_partition_places( 5013 team, 1); // add flag to update only master for spread 5014 } 5015 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 5016 "proc_bind = %d, partition = [%d,%d]\n", 5017 team->t.t_id, new_proc_bind, team->t.t_first_place, 5018 team->t.t_last_place)); 5019 } else { 5020 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5021 __kmp_partition_places(team); 5022 } 5023 #else 5024 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5025 #endif /* KMP_AFFINITY_SUPPORTED */ 5026 #endif /* OMP_40_ENABLED */ 5027 } else if (team->t.t_nproc > new_nproc) { 5028 KA_TRACE(20, 5029 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 5030 new_nproc)); 5031 5032 team->t.t_size_changed = 1; 5033 #if KMP_NESTED_HOT_TEAMS 5034 if (__kmp_hot_teams_mode == 0) { 5035 // AC: saved number of threads should correspond to team's value in this 5036 // mode, can be bigger in mode 1, when hot team has threads in reserve 5037 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 5038 
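        // Descriptive note (inferred from the code below): in mode 0 the saved
        // hot_team_nth mirrors t_nproc and the surplus threads are released
        // back to the thread pool, whereas in mode 1 (the else branch below)
        // the surplus threads stay attached to the hot team in reserve and are
        // merely switched to wait on their own b_go flag.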
hot_teams[level].hot_team_nth = new_nproc; 5039 #endif // KMP_NESTED_HOT_TEAMS 5040 /* release the extra threads we don't need any more */ 5041 for (f = new_nproc; f < team->t.t_nproc; f++) { 5042 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5043 if (__kmp_tasking_mode != tskm_immediate_exec) { 5044 // When decreasing team size, threads no longer in the team should 5045 // unref task team. 5046 team->t.t_threads[f]->th.th_task_team = NULL; 5047 } 5048 __kmp_free_thread(team->t.t_threads[f]); 5049 team->t.t_threads[f] = NULL; 5050 } 5051 #if KMP_NESTED_HOT_TEAMS 5052 } // (__kmp_hot_teams_mode == 0) 5053 else { 5054 // When keeping extra threads in team, switch threads to wait on own 5055 // b_go flag 5056 for (f = new_nproc; f < team->t.t_nproc; ++f) { 5057 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5058 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 5059 for (int b = 0; b < bs_last_barrier; ++b) { 5060 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 5061 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5062 } 5063 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 5064 } 5065 } 5066 } 5067 #endif // KMP_NESTED_HOT_TEAMS 5068 team->t.t_nproc = new_nproc; 5069 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 5070 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 5071 __kmp_reinitialize_team(team, new_icvs, 5072 root->r.r_uber_thread->th.th_ident); 5073 5074 /* update the remaining threads */ 5075 for (f = 0; f < new_nproc; ++f) { 5076 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 5077 } 5078 // restore the current task state of the master thread: should be the 5079 // implicit task 5080 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 5081 team->t.t_threads[0], team)); 5082 5083 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 5084 5085 #ifdef KMP_DEBUG 5086 for (f = 0; f < team->t.t_nproc; f++) { 5087 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5088 team->t.t_threads[f]->th.th_team_nproc == 5089 team->t.t_nproc); 5090 } 5091 #endif 5092 5093 #if OMP_40_ENABLED 5094 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5095 #if KMP_AFFINITY_SUPPORTED 5096 __kmp_partition_places(team); 5097 #endif 5098 #endif 5099 } else { // team->t.t_nproc < new_nproc 5100 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5101 kmp_affin_mask_t *old_mask; 5102 if (KMP_AFFINITY_CAPABLE()) { 5103 KMP_CPU_ALLOC(old_mask); 5104 } 5105 #endif 5106 5107 KA_TRACE(20, 5108 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5109 new_nproc)); 5110 5111 team->t.t_size_changed = 1; 5112 5113 #if KMP_NESTED_HOT_TEAMS 5114 int avail_threads = hot_teams[level].hot_team_nth; 5115 if (new_nproc < avail_threads) 5116 avail_threads = new_nproc; 5117 kmp_info_t **other_threads = team->t.t_threads; 5118 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5119 // Adjust barrier data of reserved threads (if any) of the team 5120 // Other data will be set in __kmp_initialize_info() below. 
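          // Descriptive note (inferred from the loop below): copying the
          // team's b_arrived counters brings a reserved thread's barrier state
          // back in step with the barriers the team completed while the thread
          // sat idle, so it neither misses nor prematurely passes the next
          // barrier it joins.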
5121 int b; 5122 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5123 for (b = 0; b < bs_last_barrier; ++b) { 5124 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5125 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5126 #if USE_DEBUGGER 5127 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5128 #endif 5129 } 5130 } 5131 if (hot_teams[level].hot_team_nth >= new_nproc) { 5132 // we have all needed threads in reserve, no need to allocate any 5133 // this only possible in mode 1, cannot have reserved threads in mode 0 5134 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5135 team->t.t_nproc = new_nproc; // just get reserved threads involved 5136 } else { 5137 // we may have some threads in reserve, but not enough 5138 team->t.t_nproc = 5139 hot_teams[level] 5140 .hot_team_nth; // get reserved threads involved if any 5141 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5142 #endif // KMP_NESTED_HOT_TEAMS 5143 if (team->t.t_max_nproc < new_nproc) { 5144 /* reallocate larger arrays */ 5145 __kmp_reallocate_team_arrays(team, new_nproc); 5146 __kmp_reinitialize_team(team, new_icvs, NULL); 5147 } 5148 5149 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5150 /* Temporarily set full mask for master thread before creation of 5151 workers. The reason is that workers inherit the affinity from master, 5152 so if a lot of workers are created on the single core quickly, they 5153 don't get a chance to set their own affinity for a long time. */ 5154 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5155 #endif 5156 5157 /* allocate new threads for the hot team */ 5158 for (f = team->t.t_nproc; f < new_nproc; f++) { 5159 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5160 KMP_DEBUG_ASSERT(new_worker); 5161 team->t.t_threads[f] = new_worker; 5162 5163 KA_TRACE(20, 5164 ("__kmp_allocate_team: team %d init T#%d arrived: " 5165 "join=%llu, plain=%llu\n", 5166 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5167 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5168 team->t.t_bar[bs_plain_barrier].b_arrived)); 5169 5170 { // Initialize barrier data for new threads. 5171 int b; 5172 kmp_balign_t *balign = new_worker->th.th_bar; 5173 for (b = 0; b < bs_last_barrier; ++b) { 5174 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5175 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5176 KMP_BARRIER_PARENT_FLAG); 5177 #if USE_DEBUGGER 5178 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5179 #endif 5180 } 5181 } 5182 } 5183 5184 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5185 if (KMP_AFFINITY_CAPABLE()) { 5186 /* Restore initial master thread's affinity mask */ 5187 __kmp_set_system_affinity(old_mask, TRUE); 5188 KMP_CPU_FREE(old_mask); 5189 } 5190 #endif 5191 #if KMP_NESTED_HOT_TEAMS 5192 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5193 #endif // KMP_NESTED_HOT_TEAMS 5194 /* make sure everyone is syncronized */ 5195 int old_nproc = team->t.t_nproc; // save old value and use to update only 5196 // new threads below 5197 __kmp_initialize_team(team, new_nproc, new_icvs, 5198 root->r.r_uber_thread->th.th_ident); 5199 5200 /* reinitialize the threads */ 5201 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5202 for (f = 0; f < team->t.t_nproc; ++f) 5203 __kmp_initialize_info(team->t.t_threads[f], team, f, 5204 __kmp_gtid_from_tid(f, team)); 5205 if (level) { // set th_task_state for new threads in nested hot team 5206 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5207 // only need to set the th_task_state for the new threads. th_task_state 5208 // for master thread will not be accurate until after this in 5209 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5210 // correct value. 5211 for (f = old_nproc; f < team->t.t_nproc; ++f) 5212 team->t.t_threads[f]->th.th_task_state = 5213 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5214 } else { // set th_task_state for new threads in non-nested hot team 5215 int old_state = 5216 team->t.t_threads[0]->th.th_task_state; // copy master's state 5217 for (f = old_nproc; f < team->t.t_nproc; ++f) 5218 team->t.t_threads[f]->th.th_task_state = old_state; 5219 } 5220 5221 #ifdef KMP_DEBUG 5222 for (f = 0; f < team->t.t_nproc; ++f) { 5223 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5224 team->t.t_threads[f]->th.th_team_nproc == 5225 team->t.t_nproc); 5226 } 5227 #endif 5228 5229 #if OMP_40_ENABLED 5230 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5231 #if KMP_AFFINITY_SUPPORTED 5232 __kmp_partition_places(team); 5233 #endif 5234 #endif 5235 } // Check changes in number of threads 5236 5237 #if OMP_40_ENABLED 5238 kmp_info_t *master = team->t.t_threads[0]; 5239 if (master->th.th_teams_microtask) { 5240 for (f = 1; f < new_nproc; ++f) { 5241 // propagate teams construct specific info to workers 5242 kmp_info_t *thr = team->t.t_threads[f]; 5243 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5244 thr->th.th_teams_level = master->th.th_teams_level; 5245 thr->th.th_teams_size = master->th.th_teams_size; 5246 } 5247 } 5248 #endif /* OMP_40_ENABLED */ 5249 #if KMP_NESTED_HOT_TEAMS 5250 if (level) { 5251 // Sync barrier state for nested hot teams, not needed for outermost hot 5252 // team. 5253 for (f = 1; f < new_nproc; ++f) { 5254 kmp_info_t *thr = team->t.t_threads[f]; 5255 int b; 5256 kmp_balign_t *balign = thr->th.th_bar; 5257 for (b = 0; b < bs_last_barrier; ++b) { 5258 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5259 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5260 #if USE_DEBUGGER 5261 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5262 #endif 5263 } 5264 } 5265 } 5266 #endif // KMP_NESTED_HOT_TEAMS 5267 5268 /* reallocate space for arguments if necessary */ 5269 __kmp_alloc_argv_entries(argc, team, TRUE); 5270 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5271 // The hot team re-uses the previous task team, 5272 // if untouched during the previous release->gather phase. 
5273 5274 KF_TRACE(10, (" hot_team = %p\n", team)); 5275 5276 #if KMP_DEBUG 5277 if (__kmp_tasking_mode != tskm_immediate_exec) { 5278 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5279 "task_team[1] = %p after reinit\n", 5280 team->t.t_task_team[0], team->t.t_task_team[1])); 5281 } 5282 #endif 5283 5284 #if OMPT_SUPPORT 5285 __ompt_team_assign_id(team, ompt_parallel_data); 5286 #endif 5287 5288 KMP_MB(); 5289 5290 return team; 5291 } 5292 5293 /* next, let's try to take one from the team pool */ 5294 KMP_MB(); 5295 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5296 /* TODO: consider resizing undersized teams instead of reaping them, now 5297 that we have a resizing mechanism */ 5298 if (team->t.t_max_nproc >= max_nproc) { 5299 /* take this team from the team pool */ 5300 __kmp_team_pool = team->t.t_next_pool; 5301 5302 /* setup the team for fresh use */ 5303 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5304 5305 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5306 "task_team[1] %p to NULL\n", 5307 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5308 team->t.t_task_team[0] = NULL; 5309 team->t.t_task_team[1] = NULL; 5310 5311 /* reallocate space for arguments if necessary */ 5312 __kmp_alloc_argv_entries(argc, team, TRUE); 5313 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5314 5315 KA_TRACE( 5316 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5317 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5318 { // Initialize barrier data. 5319 int b; 5320 for (b = 0; b < bs_last_barrier; ++b) { 5321 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5322 #if USE_DEBUGGER 5323 team->t.t_bar[b].b_master_arrived = 0; 5324 team->t.t_bar[b].b_team_arrived = 0; 5325 #endif 5326 } 5327 } 5328 5329 #if OMP_40_ENABLED 5330 team->t.t_proc_bind = new_proc_bind; 5331 #endif 5332 5333 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5334 team->t.t_id)); 5335 5336 #if OMPT_SUPPORT 5337 __ompt_team_assign_id(team, ompt_parallel_data); 5338 #endif 5339 5340 KMP_MB(); 5341 5342 return team; 5343 } 5344 5345 /* reap team if it is too small, then loop back and check the next one */ 5346 // not sure if this is wise, but, will be redone during the hot-teams 5347 // rewrite. 5348 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5349 team = __kmp_reap_team(team); 5350 __kmp_team_pool = team; 5351 } 5352 5353 /* nothing available in the pool, no matter, make a new team! 
*/ 5354 KMP_MB(); 5355 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5356 5357 /* and set it up */ 5358 team->t.t_max_nproc = max_nproc; 5359 /* NOTE well, for some reason allocating one big buffer and dividing it up 5360 seems to really hurt performance a lot on the P4, so, let's not use this */ 5361 __kmp_allocate_team_arrays(team, max_nproc); 5362 5363 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5364 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5365 5366 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5367 "%p to NULL\n", 5368 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5369 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5370 // memory, no need to duplicate 5371 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5372 // memory, no need to duplicate 5373 5374 if (__kmp_storage_map) { 5375 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5376 } 5377 5378 /* allocate space for arguments */ 5379 __kmp_alloc_argv_entries(argc, team, FALSE); 5380 team->t.t_argc = argc; 5381 5382 KA_TRACE(20, 5383 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5384 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5385 { // Initialize barrier data. 5386 int b; 5387 for (b = 0; b < bs_last_barrier; ++b) { 5388 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5389 #if USE_DEBUGGER 5390 team->t.t_bar[b].b_master_arrived = 0; 5391 team->t.t_bar[b].b_team_arrived = 0; 5392 #endif 5393 } 5394 } 5395 5396 #if OMP_40_ENABLED 5397 team->t.t_proc_bind = new_proc_bind; 5398 #endif 5399 5400 #if OMPT_SUPPORT 5401 __ompt_team_assign_id(team, ompt_parallel_data); 5402 team->t.ompt_serialized_team_info = NULL; 5403 #endif 5404 5405 KMP_MB(); 5406 5407 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5408 team->t.t_id)); 5409 5410 return team; 5411 } 5412 5413 /* TODO implement hot-teams at all levels */ 5414 /* TODO implement lazy thread release on demand (disband request) */ 5415 5416 /* free the team. return it to the team pool. release all the threads 5417 * associated with it */ 5418 void __kmp_free_team(kmp_root_t *root, 5419 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5420 int f; 5421 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5422 team->t.t_id)); 5423 5424 /* verify state */ 5425 KMP_DEBUG_ASSERT(root); 5426 KMP_DEBUG_ASSERT(team); 5427 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5428 KMP_DEBUG_ASSERT(team->t.t_threads); 5429 5430 int use_hot_team = team == root->r.r_hot_team; 5431 #if KMP_NESTED_HOT_TEAMS 5432 int level; 5433 kmp_hot_team_ptr_t *hot_teams; 5434 if (master) { 5435 level = team->t.t_active_level - 1; 5436 if (master->th.th_teams_microtask) { // in teams construct? 
5437 if (master->th.th_teams_size.nteams > 1) { 5438 ++level; // level was not increased in teams construct for 5439 // team_of_masters 5440 } 5441 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5442 master->th.th_teams_level == team->t.t_level) { 5443 ++level; // level was not increased in teams construct for 5444 // team_of_workers before the parallel 5445 } // team->t.t_level will be increased inside parallel 5446 } 5447 hot_teams = master->th.th_hot_teams; 5448 if (level < __kmp_hot_teams_max_level) { 5449 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5450 use_hot_team = 1; 5451 } 5452 } 5453 #endif // KMP_NESTED_HOT_TEAMS 5454 5455 /* team is done working */ 5456 TCW_SYNC_PTR(team->t.t_pkfn, 5457 NULL); // Important for Debugging Support Library. 5458 #if KMP_OS_WINDOWS 5459 team->t.t_copyin_counter = 0; // init counter for possible reuse 5460 #endif 5461 // Do not reset pointer to parent team to NULL for hot teams. 5462 5463 /* if we are non-hot team, release our threads */ 5464 if (!use_hot_team) { 5465 if (__kmp_tasking_mode != tskm_immediate_exec) { 5466 // Wait for threads to reach reapable state 5467 for (f = 1; f < team->t.t_nproc; ++f) { 5468 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5469 kmp_info_t *th = team->t.t_threads[f]; 5470 volatile kmp_uint32 *state = &th->th.th_reap_state; 5471 while (*state != KMP_SAFE_TO_REAP) { 5472 #if KMP_OS_WINDOWS 5473 // On Windows a thread can be killed at any time, check this 5474 DWORD ecode; 5475 if (!__kmp_is_thread_alive(th, &ecode)) { 5476 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5477 break; 5478 } 5479 #endif 5480 // first check if thread is sleeping 5481 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5482 if (fl.is_sleeping()) 5483 fl.resume(__kmp_gtid_from_thread(th)); 5484 KMP_CPU_PAUSE(); 5485 } 5486 } 5487 5488 // Delete task teams 5489 int tt_idx; 5490 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5491 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5492 if (task_team != NULL) { 5493 for (f = 0; f < team->t.t_nproc; 5494 ++f) { // Have all threads unref task teams 5495 team->t.t_threads[f]->th.th_task_team = NULL; 5496 } 5497 KA_TRACE( 5498 20, 5499 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5500 __kmp_get_gtid(), task_team, team->t.t_id)); 5501 #if KMP_NESTED_HOT_TEAMS 5502 __kmp_free_task_team(master, task_team); 5503 #endif 5504 team->t.t_task_team[tt_idx] = NULL; 5505 } 5506 } 5507 } 5508 5509 // Reset pointer to parent team only for non-hot teams. 5510 team->t.t_parent = NULL; 5511 team->t.t_level = 0; 5512 team->t.t_active_level = 0; 5513 5514 /* free the worker threads */ 5515 for (f = 1; f < team->t.t_nproc; ++f) { 5516 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5517 __kmp_free_thread(team->t.t_threads[f]); 5518 team->t.t_threads[f] = NULL; 5519 } 5520 5521 /* put the team back in the team pool */ 5522 /* TODO limit size of team pool, call reap_team if pool too large */ 5523 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5524 __kmp_team_pool = (volatile kmp_team_t *)team; 5525 } 5526 5527 KMP_MB(); 5528 } 5529 5530 /* reap the team. 
destroy it, reclaim all its resources and free its memory */ 5531 kmp_team_t *__kmp_reap_team(kmp_team_t *team) { 5532 kmp_team_t *next_pool = team->t.t_next_pool; 5533 5534 KMP_DEBUG_ASSERT(team); 5535 KMP_DEBUG_ASSERT(team->t.t_dispatch); 5536 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 5537 KMP_DEBUG_ASSERT(team->t.t_threads); 5538 KMP_DEBUG_ASSERT(team->t.t_argv); 5539 5540 /* TODO clean the threads that are a part of this? */ 5541 5542 /* free stuff */ 5543 __kmp_free_team_arrays(team); 5544 if (team->t.t_argv != &team->t.t_inline_argv[0]) 5545 __kmp_free((void *)team->t.t_argv); 5546 __kmp_free(team); 5547 5548 KMP_MB(); 5549 return next_pool; 5550 } 5551 5552 // Free the thread. Don't reap it, just place it on the pool of available 5553 // threads. 5554 // 5555 // Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid 5556 // binding for the affinity mechanism to be useful. 5557 // 5558 // Now, we always keep the free list (__kmp_thread_pool) sorted by gtid. 5559 // However, we want to avoid a potential performance problem by always 5560 // scanning through the list to find the correct point at which to insert 5561 // the thread (potential N**2 behavior). To do this we keep track of the 5562 // last place a thread struct was inserted (__kmp_thread_pool_insert_pt). 5563 // With single-level parallelism, threads will always be added to the tail 5564 // of the list, kept track of by __kmp_thread_pool_insert_pt. With nested 5565 // parallelism, all bets are off and we may need to scan through the entire 5566 // free list. 5567 // 5568 // This change also has a potentially large performance benefit, for some 5569 // applications. Previously, as threads were freed from the hot team, they 5570 // would be placed back on the free list in inverse order. If the hot team 5571 // grew back to it's original size, then the freed thread would be placed 5572 // back on the hot team in reverse order. This could cause bad cache 5573 // locality problems on programs where the size of the hot team regularly 5574 // grew and shrunk. 5575 // 5576 // Now, for single-level parallelism, the OMP tid is alway == gtid. 5577 void __kmp_free_thread(kmp_info_t *this_th) { 5578 int gtid; 5579 kmp_info_t **scan; 5580 kmp_root_t *root = this_th->th.th_root; 5581 5582 KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n", 5583 __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid)); 5584 5585 KMP_DEBUG_ASSERT(this_th); 5586 5587 // When moving thread to pool, switch thread to wait on own b_go flag, and 5588 // uninitialized (NULL team). 5589 int b; 5590 kmp_balign_t *balign = this_th->th.th_bar; 5591 for (b = 0; b < bs_last_barrier; ++b) { 5592 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) 5593 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 5594 balign[b].bb.team = NULL; 5595 balign[b].bb.leaf_kids = 0; 5596 } 5597 this_th->th.th_task_state = 0; 5598 this_th->th.th_reap_state = KMP_SAFE_TO_REAP; 5599 5600 /* put thread back on the free pool */ 5601 TCW_PTR(this_th->th.th_team, NULL); 5602 TCW_PTR(this_th->th.th_root, NULL); 5603 TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */ 5604 5605 /* If the implicit task assigned to this thread can be used by other threads 5606 * -> multiple threads can share the data and try to free the task at 5607 * __kmp_reap_thread at exit. 
This duplicate use of the task data can happen 5608 * with higher probability when hot team is disabled but can occurs even when 5609 * the hot team is enabled */ 5610 __kmp_free_implicit_task(this_th); 5611 this_th->th.th_current_task = NULL; 5612 5613 // If the __kmp_thread_pool_insert_pt is already past the new insert 5614 // point, then we need to re-scan the entire list. 5615 gtid = this_th->th.th_info.ds.ds_gtid; 5616 if (__kmp_thread_pool_insert_pt != NULL) { 5617 KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL); 5618 if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) { 5619 __kmp_thread_pool_insert_pt = NULL; 5620 } 5621 } 5622 5623 // Scan down the list to find the place to insert the thread. 5624 // scan is the address of a link in the list, possibly the address of 5625 // __kmp_thread_pool itself. 5626 // 5627 // In the absence of nested parallism, the for loop will have 0 iterations. 5628 if (__kmp_thread_pool_insert_pt != NULL) { 5629 scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool); 5630 } else { 5631 scan = CCAST(kmp_info_t **, &__kmp_thread_pool); 5632 } 5633 for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid); 5634 scan = &((*scan)->th.th_next_pool)) 5635 ; 5636 5637 // Insert the new element on the list, and set __kmp_thread_pool_insert_pt 5638 // to its address. 5639 TCW_PTR(this_th->th.th_next_pool, *scan); 5640 __kmp_thread_pool_insert_pt = *scan = this_th; 5641 KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) || 5642 (this_th->th.th_info.ds.ds_gtid < 5643 this_th->th.th_next_pool->th.th_info.ds.ds_gtid)); 5644 TCW_4(this_th->th.th_in_pool, TRUE); 5645 __kmp_thread_pool_nth++; 5646 5647 TCW_4(__kmp_nth, __kmp_nth - 1); 5648 root->r.r_cg_nthreads--; 5649 5650 #ifdef KMP_ADJUST_BLOCKTIME 5651 /* Adjust blocktime back to user setting or default if necessary */ 5652 /* Middle initialization might never have occurred */ 5653 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5654 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5655 if (__kmp_nth <= __kmp_avail_proc) { 5656 __kmp_zero_bt = FALSE; 5657 } 5658 } 5659 #endif /* KMP_ADJUST_BLOCKTIME */ 5660 5661 KMP_MB(); 5662 } 5663 5664 /* ------------------------------------------------------------------------ */ 5665 5666 void *__kmp_launch_thread(kmp_info_t *this_thr) { 5667 int gtid = this_thr->th.th_info.ds.ds_gtid; 5668 /* void *stack_data;*/ 5669 kmp_team_t *(*volatile pteam); 5670 5671 KMP_MB(); 5672 KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid)); 5673 5674 if (__kmp_env_consistency_check) { 5675 this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak? 
5676 } 5677 5678 #if OMPT_SUPPORT 5679 ompt_data_t *thread_data; 5680 if (ompt_enabled.enabled) { 5681 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5682 *thread_data = ompt_data_none; 5683 5684 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5685 this_thr->th.ompt_thread_info.wait_id = 0; 5686 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5687 if (ompt_enabled.ompt_callback_thread_begin) { 5688 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5689 ompt_thread_worker, thread_data); 5690 } 5691 } 5692 #endif 5693 5694 #if OMPT_SUPPORT 5695 if (ompt_enabled.enabled) { 5696 this_thr->th.ompt_thread_info.state = ompt_state_idle; 5697 } 5698 #endif 5699 /* This is the place where threads wait for work */ 5700 while (!TCR_4(__kmp_global.g.g_done)) { 5701 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5702 KMP_MB(); 5703 5704 /* wait for work to do */ 5705 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5706 5707 /* No tid yet since not part of a team */ 5708 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5709 5710 #if OMPT_SUPPORT 5711 if (ompt_enabled.enabled) { 5712 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5713 } 5714 #endif 5715 5716 pteam = (kmp_team_t * (*))(&this_thr->th.th_team); 5717 5718 /* have we been allocated? */ 5719 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5720 /* we were just woken up, so run our new task */ 5721 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5722 int rc; 5723 KA_TRACE(20, 5724 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5725 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5726 (*pteam)->t.t_pkfn)); 5727 5728 updateHWFPControl(*pteam); 5729 5730 #if OMPT_SUPPORT 5731 if (ompt_enabled.enabled) { 5732 this_thr->th.ompt_thread_info.state = ompt_state_work_parallel; 5733 } 5734 #endif 5735 5736 rc = (*pteam)->t.t_invoke(gtid); 5737 KMP_ASSERT(rc); 5738 5739 KMP_MB(); 5740 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5741 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5742 (*pteam)->t.t_pkfn)); 5743 } 5744 #if OMPT_SUPPORT 5745 if (ompt_enabled.enabled) { 5746 /* no frame set while outside task */ 5747 __ompt_get_task_info_object(0)->frame.exit_frame = ompt_data_none; 5748 5749 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 5750 } 5751 #endif 5752 /* join barrier after parallel region */ 5753 __kmp_join_barrier(gtid); 5754 } 5755 } 5756 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5757 5758 #if OMPT_SUPPORT 5759 if (ompt_enabled.ompt_callback_thread_end) { 5760 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5761 } 5762 #endif 5763 5764 this_thr->th.th_task_team = NULL; 5765 /* run the destructors for the threadprivate data for this thread */ 5766 __kmp_common_destroy_gtid(gtid); 5767 5768 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5769 KMP_MB(); 5770 return this_thr; 5771 } 5772 5773 /* ------------------------------------------------------------------------ */ 5774 5775 void __kmp_internal_end_dest(void *specific_gtid) { 5776 #if KMP_COMPILER_ICC 5777 #pragma warning(push) 5778 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose 5779 // significant bits 5780 #endif 5781 // Make sure no significant bits are lost 5782 int gtid = (kmp_intptr_t)specific_gtid - 1; 5783 #if KMP_COMPILER_ICC 5784 #pragma warning(pop) 5785 #endif 5786 5787 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5788 /* NOTE: the gtid is stored as 
gitd+1 in the thread-local-storage 5789 * this is because 0 is reserved for the nothing-stored case */ 5790 5791 /* josh: One reason for setting the gtid specific data even when it is being 5792 destroyed by pthread is to allow gtid lookup through thread specific data 5793 (__kmp_gtid_get_specific). Some of the code, especially stat code, 5794 that gets executed in the call to __kmp_internal_end_thread, actually 5795 gets the gtid through the thread specific data. Setting it here seems 5796 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread 5797 to run smoothly. 5798 todo: get rid of this after we remove the dependence on 5799 __kmp_gtid_get_specific */ 5800 if (gtid >= 0 && KMP_UBER_GTID(gtid)) 5801 __kmp_gtid_set_specific(gtid); 5802 #ifdef KMP_TDATA_GTID 5803 __kmp_gtid = gtid; 5804 #endif 5805 __kmp_internal_end_thread(gtid); 5806 } 5807 5808 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5809 5810 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases 5811 // destructors work perfectly, but in real libomp.so I have no evidence it is 5812 // ever called. However, -fini linker option in makefile.mk works fine. 5813 5814 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5815 __kmp_internal_end_atexit(); 5816 } 5817 5818 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); } 5819 5820 #endif 5821 5822 /* [Windows] josh: when the atexit handler is called, there may still be more 5823 than one thread alive */ 5824 void __kmp_internal_end_atexit(void) { 5825 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5826 /* [Windows] 5827 josh: ideally, we want to completely shutdown the library in this atexit 5828 handler, but stat code that depends on thread specific data for gtid fails 5829 because that data becomes unavailable at some point during the shutdown, so 5830 we call __kmp_internal_end_thread instead. We should eventually remove the 5831 dependency on __kmp_get_specific_gtid in the stat code and use 5832 __kmp_internal_end_library to cleanly shutdown the library. 5833 5834 // TODO: Can some of this comment about GVS be removed? 5835 I suspect that the offending stat code is executed when the calling thread 5836 tries to clean up a dead root thread's data structures, resulting in GVS 5837 code trying to close the GVS structures for that thread, but since the stat 5838 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5839 the calling thread is cleaning up itself instead of another thread, it get 5840 confused. This happens because allowing a thread to unregister and cleanup 5841 another thread is a recent modification for addressing an issue. 5842 Based on the current design (20050722), a thread may end up 5843 trying to unregister another thread only if thread death does not trigger 5844 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5845 thread specific data destructor function to detect thread death. For 5846 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5847 is nothing. Thus, the workaround is applicable only for Windows static 5848 stat library. */ 5849 __kmp_internal_end_library(-1); 5850 #if KMP_OS_WINDOWS 5851 __kmp_close_console(); 5852 #endif 5853 } 5854 5855 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5856 // It is assumed __kmp_forkjoin_lock is acquired. 
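  // Descriptive overview (summarizing the code below): for a non-root thread
  // we first release it from the fork barrier so it can reach a reapable
  // state, reap the underlying OS thread, and fix up the thread-pool
  // counters; then, for any thread, the per-thread resources (implicit task,
  // fast memory pool, consistency-check stack, threadprivate data, affinity
  // mask, hierarchical barrier data, serial team) are freed and the
  // kmp_info_t itself is deallocated.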
5857 5858 int gtid; 5859 5860 KMP_DEBUG_ASSERT(thread != NULL); 5861 5862 gtid = thread->th.th_info.ds.ds_gtid; 5863 5864 if (!is_root) { 5865 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5866 /* Assume the threads are at the fork barrier here */ 5867 KA_TRACE( 5868 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5869 gtid)); 5870 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5871 * (GEH) */ 5872 ANNOTATE_HAPPENS_BEFORE(thread); 5873 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); 5874 __kmp_release_64(&flag); 5875 } 5876 5877 // Terminate OS thread. 5878 __kmp_reap_worker(thread); 5879 5880 // The thread was killed asynchronously. If it was actively 5881 // spinning in the thread pool, decrement the global count. 5882 // 5883 // There is a small timing hole here - if the worker thread was just waking 5884 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5885 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5886 // the global counter might not get updated. 5887 // 5888 // Currently, this can only happen as the library is unloaded, 5889 // so there are no harmful side effects. 5890 if (thread->th.th_active_in_pool) { 5891 thread->th.th_active_in_pool = FALSE; 5892 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5893 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5894 } 5895 5896 // Decrement # of [worker] threads in the pool. 5897 KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0); 5898 --__kmp_thread_pool_nth; 5899 } 5900 5901 __kmp_free_implicit_task(thread); 5902 5903 // Free the fast memory for tasking 5904 #if USE_FAST_MEMORY 5905 __kmp_free_fast_memory(thread); 5906 #endif /* USE_FAST_MEMORY */ 5907 5908 __kmp_suspend_uninitialize_thread(thread); 5909 5910 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5911 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5912 5913 --__kmp_all_nth; 5914 // __kmp_nth was decremented when thread is added to the pool. 
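  // Descriptive note (inferred from the #ifdef block below): once the live
  // thread count drops back to __kmp_avail_proc or below, __kmp_zero_bt is
  // cleared so waiting threads return to the default blocktime instead of the
  // zero blocktime forced while the machine was oversubscribed; this only
  // applies when the user did not set a blocktime explicitly.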
5915 5916 #ifdef KMP_ADJUST_BLOCKTIME 5917 /* Adjust blocktime back to user setting or default if necessary */ 5918 /* Middle initialization might never have occurred */ 5919 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5920 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5921 if (__kmp_nth <= __kmp_avail_proc) { 5922 __kmp_zero_bt = FALSE; 5923 } 5924 } 5925 #endif /* KMP_ADJUST_BLOCKTIME */ 5926 5927 /* free the memory being used */ 5928 if (__kmp_env_consistency_check) { 5929 if (thread->th.th_cons) { 5930 __kmp_free_cons_stack(thread->th.th_cons); 5931 thread->th.th_cons = NULL; 5932 } 5933 } 5934 5935 if (thread->th.th_pri_common != NULL) { 5936 __kmp_free(thread->th.th_pri_common); 5937 thread->th.th_pri_common = NULL; 5938 } 5939 5940 if (thread->th.th_task_state_memo_stack != NULL) { 5941 __kmp_free(thread->th.th_task_state_memo_stack); 5942 thread->th.th_task_state_memo_stack = NULL; 5943 } 5944 5945 #if KMP_USE_BGET 5946 if (thread->th.th_local.bget_data != NULL) { 5947 __kmp_finalize_bget(thread); 5948 } 5949 #endif 5950 5951 #if KMP_AFFINITY_SUPPORTED 5952 if (thread->th.th_affin_mask != NULL) { 5953 KMP_CPU_FREE(thread->th.th_affin_mask); 5954 thread->th.th_affin_mask = NULL; 5955 } 5956 #endif /* KMP_AFFINITY_SUPPORTED */ 5957 5958 #if KMP_USE_HIER_SCHED 5959 if (thread->th.th_hier_bar_data != NULL) { 5960 __kmp_free(thread->th.th_hier_bar_data); 5961 thread->th.th_hier_bar_data = NULL; 5962 } 5963 #endif 5964 5965 __kmp_reap_team(thread->th.th_serial_team); 5966 thread->th.th_serial_team = NULL; 5967 __kmp_free(thread); 5968 5969 KMP_MB(); 5970 5971 } // __kmp_reap_thread 5972 5973 static void __kmp_internal_end(void) { 5974 int i; 5975 5976 /* First, unregister the library */ 5977 __kmp_unregister_library(); 5978 5979 #if KMP_OS_WINDOWS 5980 /* In Win static library, we can't tell when a root actually dies, so we 5981 reclaim the data structures for any root threads that have died but not 5982 unregistered themselves, in order to shut down cleanly. 5983 In Win dynamic library we also can't tell when a thread dies. */ 5984 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 5985 // dead roots 5986 #endif 5987 5988 for (i = 0; i < __kmp_threads_capacity; i++) 5989 if (__kmp_root[i]) 5990 if (__kmp_root[i]->r.r_active) 5991 break; 5992 KMP_MB(); /* Flush all pending memory write invalidates. */ 5993 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 5994 5995 if (i < __kmp_threads_capacity) { 5996 #if KMP_USE_MONITOR 5997 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 5998 KMP_MB(); /* Flush all pending memory write invalidates. */ 5999 6000 // Need to check that monitor was initialized before reaping it. If we are 6001 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 6002 // __kmp_monitor will appear to contain valid data, but it is only valid in 6003 // the parent process, not the child. 6004 // New behavior (201008): instead of keying off of the flag 6005 // __kmp_init_parallel, the monitor thread creation is keyed off 6006 // of the new flag __kmp_init_monitor. 
6007 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6008 if (TCR_4(__kmp_init_monitor)) { 6009 __kmp_reap_monitor(&__kmp_monitor); 6010 TCW_4(__kmp_init_monitor, 0); 6011 } 6012 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6013 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6014 #endif // KMP_USE_MONITOR 6015 } else { 6016 /* TODO move this to cleanup code */ 6017 #ifdef KMP_DEBUG 6018 /* make sure that everything has properly ended */ 6019 for (i = 0; i < __kmp_threads_capacity; i++) { 6020 if (__kmp_root[i]) { 6021 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 6022 // there can be uber threads alive here 6023 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 6024 } 6025 } 6026 #endif 6027 6028 KMP_MB(); 6029 6030 // Reap the worker threads. 6031 // This is valid for now, but be careful if threads are reaped sooner. 6032 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 6033 // Get the next thread from the pool. 6034 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 6035 __kmp_thread_pool = thread->th.th_next_pool; 6036 // Reap it. 6037 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 6038 thread->th.th_next_pool = NULL; 6039 thread->th.th_in_pool = FALSE; 6040 __kmp_reap_thread(thread, 0); 6041 } 6042 __kmp_thread_pool_insert_pt = NULL; 6043 6044 // Reap teams. 6045 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 6046 // Get the next team from the pool. 6047 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 6048 __kmp_team_pool = team->t.t_next_pool; 6049 // Reap it. 6050 team->t.t_next_pool = NULL; 6051 __kmp_reap_team(team); 6052 } 6053 6054 __kmp_reap_task_teams(); 6055 6056 #if KMP_OS_UNIX 6057 // Threads that are not reaped should not access any resources since they 6058 // are going to be deallocated soon, so the shutdown sequence should wait 6059 // until all threads either exit the final spin-waiting loop or begin 6060 // sleeping after the given blocktime. 6061 for (i = 0; i < __kmp_threads_capacity; i++) { 6062 kmp_info_t *thr = __kmp_threads[i]; 6063 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 6064 KMP_CPU_PAUSE(); 6065 } 6066 #endif 6067 6068 for (i = 0; i < __kmp_threads_capacity; ++i) { 6069 // TBD: Add some checking... 6070 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 6071 } 6072 6073 /* Make sure all threadprivate destructors get run by joining with all 6074 worker threads before resetting this flag */ 6075 TCW_SYNC_4(__kmp_init_common, FALSE); 6076 6077 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 6078 KMP_MB(); 6079 6080 #if KMP_USE_MONITOR 6081 // See note above: One of the possible fixes for CQ138434 / CQ140126 6082 // 6083 // FIXME: push both code fragments down and CSE them? 6084 // push them into __kmp_cleanup() ? 6085 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 6086 if (TCR_4(__kmp_init_monitor)) { 6087 __kmp_reap_monitor(&__kmp_monitor); 6088 TCW_4(__kmp_init_monitor, 0); 6089 } 6090 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 6091 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 6092 #endif 6093 } /* else !__kmp_global.t_active */ 6094 TCW_4(__kmp_init_gtid, FALSE); 6095 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6096 6097 __kmp_cleanup(); 6098 #if OMPT_SUPPORT 6099 ompt_fini(); 6100 #endif 6101 } 6102 6103 void __kmp_internal_end_library(int gtid_req) { 6104 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6105 /* this shouldn't be a race condition because __kmp_internal_end() is the 6106 only place to clear __kmp_serial_init */ 6107 /* we'll check this later too, after we get the lock */ 6108 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6109 // redundaant, because the next check will work in any case. 6110 if (__kmp_global.g.g_abort) { 6111 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6112 /* TODO abort? */ 6113 return; 6114 } 6115 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6116 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6117 return; 6118 } 6119 6120 KMP_MB(); /* Flush all pending memory write invalidates. */ 6121 6122 /* find out who we are and what we should do */ 6123 { 6124 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6125 KA_TRACE( 6126 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6127 if (gtid == KMP_GTID_SHUTDOWN) { 6128 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6129 "already shutdown\n")); 6130 return; 6131 } else if (gtid == KMP_GTID_MONITOR) { 6132 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6133 "registered, or system shutdown\n")); 6134 return; 6135 } else if (gtid == KMP_GTID_DNE) { 6136 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6137 "shutdown\n")); 6138 /* we don't know who we are, but we may still shutdown the library */ 6139 } else if (KMP_UBER_GTID(gtid)) { 6140 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6141 if (__kmp_root[gtid]->r.r_active) { 6142 __kmp_global.g.g_abort = -1; 6143 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6144 KA_TRACE(10, 6145 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6146 gtid)); 6147 return; 6148 } else { 6149 KA_TRACE( 6150 10, 6151 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6152 __kmp_unregister_root_current_thread(gtid); 6153 } 6154 } else { 6155 /* worker threads may call this function through the atexit handler, if they 6156 * call exit() */ 6157 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6158 TODO: do a thorough shutdown instead */ 6159 #ifdef DUMP_DEBUG_ON_EXIT 6160 if (__kmp_debug_buf) 6161 __kmp_dump_debug_buffer(); 6162 #endif 6163 return; 6164 } 6165 } 6166 /* synchronize the termination process */ 6167 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6168 6169 /* have we already finished */ 6170 if (__kmp_global.g.g_abort) { 6171 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6172 /* TODO abort? */ 6173 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6174 return; 6175 } 6176 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6177 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6178 return; 6179 } 6180 6181 /* We need this lock to enforce mutex between this reading of 6182 __kmp_threads_capacity and the writing by __kmp_register_root. 6183 Alternatively, we can use a counter of roots that is atomically updated by 6184 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6185 __kmp_internal_end_*. 
*/ 6186 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6187 6188 /* now we can safely conduct the actual termination */ 6189 __kmp_internal_end(); 6190 6191 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6192 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6193 6194 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6195 6196 #ifdef DUMP_DEBUG_ON_EXIT 6197 if (__kmp_debug_buf) 6198 __kmp_dump_debug_buffer(); 6199 #endif 6200 6201 #if KMP_OS_WINDOWS 6202 __kmp_close_console(); 6203 #endif 6204 6205 __kmp_fini_allocator(); 6206 6207 } // __kmp_internal_end_library 6208 6209 void __kmp_internal_end_thread(int gtid_req) { 6210 int i; 6211 6212 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6213 /* this shouldn't be a race condition because __kmp_internal_end() is the 6214 * only place to clear __kmp_serial_init */ 6215 /* we'll check this later too, after we get the lock */ 6216 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6217 // redundant, because the next check will work in any case. 6218 if (__kmp_global.g.g_abort) { 6219 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6220 /* TODO abort? */ 6221 return; 6222 } 6223 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6224 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6225 return; 6226 } 6227 6228 KMP_MB(); /* Flush all pending memory write invalidates. */ 6229 6230 /* find out who we are and what we should do */ 6231 { 6232 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6233 KA_TRACE(10, 6234 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6235 if (gtid == KMP_GTID_SHUTDOWN) { 6236 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6237 "already shutdown\n")); 6238 return; 6239 } else if (gtid == KMP_GTID_MONITOR) { 6240 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6241 "registered, or system shutdown\n")); 6242 return; 6243 } else if (gtid == KMP_GTID_DNE) { 6244 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6245 "shutdown\n")); 6246 return; 6247 /* we don't know who we are */ 6248 } else if (KMP_UBER_GTID(gtid)) { 6249 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6250 if (__kmp_root[gtid]->r.r_active) { 6251 __kmp_global.g.g_abort = -1; 6252 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6253 KA_TRACE(10, 6254 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6255 gtid)); 6256 return; 6257 } else { 6258 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6259 gtid)); 6260 __kmp_unregister_root_current_thread(gtid); 6261 } 6262 } else { 6263 /* just a worker thread, let's leave */ 6264 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6265 6266 if (gtid >= 0) { 6267 __kmp_threads[gtid]->th.th_task_team = NULL; 6268 } 6269 6270 KA_TRACE(10, 6271 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6272 gtid)); 6273 return; 6274 } 6275 } 6276 #if KMP_DYNAMIC_LIB 6277 // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber 6278 // thread, because we will better shutdown later in the library destructor. 6279 // The reason of this change is performance problem when non-openmp thread in 6280 // a loop forks and joins many openmp threads. We can save a lot of time 6281 // keeping worker threads alive until the program shutdown. 
6282 // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) 6283 // and Windows(DPD200287443) that occurs when using critical sections from 6284 // foreign threads. 6285 if (__kmp_pause_status != kmp_hard_paused) { 6286 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6287 return; 6288 } 6289 #endif 6290 /* synchronize the termination process */ 6291 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6292 6293 /* have we already finished */ 6294 if (__kmp_global.g.g_abort) { 6295 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6296 /* TODO abort? */ 6297 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6298 return; 6299 } 6300 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6301 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6302 return; 6303 } 6304 6305 /* We need this lock to enforce mutex between this reading of 6306 __kmp_threads_capacity and the writing by __kmp_register_root. 6307 Alternatively, we can use a counter of roots that is atomically updated by 6308 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6309 __kmp_internal_end_*. */ 6310 6311 /* should we finish the run-time? are all siblings done? */ 6312 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6313 6314 for (i = 0; i < __kmp_threads_capacity; ++i) { 6315 if (KMP_UBER_GTID(i)) { 6316 KA_TRACE( 6317 10, 6318 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6319 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6320 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6321 return; 6322 } 6323 } 6324 6325 /* now we can safely conduct the actual termination */ 6326 6327 __kmp_internal_end(); 6328 6329 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6330 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6331 6332 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6333 6334 #ifdef DUMP_DEBUG_ON_EXIT 6335 if (__kmp_debug_buf) 6336 __kmp_dump_debug_buffer(); 6337 #endif 6338 } // __kmp_internal_end_thread 6339 6340 // ----------------------------------------------------------------------------- 6341 // Library registration stuff. 6342 6343 static long __kmp_registration_flag = 0; 6344 // Random value used to indicate library initialization. 6345 static char *__kmp_registration_str = NULL; 6346 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6347 6348 static inline char *__kmp_reg_status_name() { 6349 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6350 each thread. If registration and unregistration go in different threads 6351 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6352 env var can not be found, because the name will contain different pid. */ 6353 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6354 } // __kmp_reg_status_get 6355 6356 void __kmp_register_library_startup(void) { 6357 6358 char *name = __kmp_reg_status_name(); // Name of the environment variable. 
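  // Descriptive note (values here are illustrative only): the variable is set
  // to a string of the form "<flag address>-<flag value>-<library file>",
  // e.g. "0x7f12e4c01234-cafe5678-libomp.so", built with the "%p-%lx-%s"
  // format further down. A later copy of the runtime parses this back, checks
  // whether the address is still mapped and still holds the flag value, and
  // thereby decides whether the registering copy is alive or dead.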
6359 int done = 0; 6360 union { 6361 double dtime; 6362 long ltime; 6363 } time; 6364 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6365 __kmp_initialize_system_tick(); 6366 #endif 6367 __kmp_read_system_time(&time.dtime); 6368 __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL); 6369 __kmp_registration_str = 6370 __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag, 6371 __kmp_registration_flag, KMP_LIBRARY_FILE); 6372 6373 KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name, 6374 __kmp_registration_str)); 6375 6376 while (!done) { 6377 6378 char *value = NULL; // Actual value of the environment variable. 6379 6380 // Set environment variable, but do not overwrite if it is exist. 6381 __kmp_env_set(name, __kmp_registration_str, 0); 6382 // Check the variable is written. 6383 value = __kmp_env_get(name); 6384 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6385 6386 done = 1; // Ok, environment variable set successfully, exit the loop. 6387 6388 } else { 6389 6390 // Oops. Write failed. Another copy of OpenMP RTL is in memory. 6391 // Check whether it alive or dead. 6392 int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead. 6393 char *tail = value; 6394 char *flag_addr_str = NULL; 6395 char *flag_val_str = NULL; 6396 char const *file_name = NULL; 6397 __kmp_str_split(tail, '-', &flag_addr_str, &tail); 6398 __kmp_str_split(tail, '-', &flag_val_str, &tail); 6399 file_name = tail; 6400 if (tail != NULL) { 6401 long *flag_addr = 0; 6402 long flag_val = 0; 6403 KMP_SSCANF(flag_addr_str, "%p", RCAST(void**, &flag_addr)); 6404 KMP_SSCANF(flag_val_str, "%lx", &flag_val); 6405 if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) { 6406 // First, check whether environment-encoded address is mapped into 6407 // addr space. 6408 // If so, dereference it to see if it still has the right value. 6409 if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) { 6410 neighbor = 1; 6411 } else { 6412 // If not, then we know the other copy of the library is no longer 6413 // running. 6414 neighbor = 2; 6415 } 6416 } 6417 } 6418 switch (neighbor) { 6419 case 0: // Cannot parse environment variable -- neighbor status unknown. 6420 // Assume it is the incompatible format of future version of the 6421 // library. Assume the other library is alive. 6422 // WARN( ... ); // TODO: Issue a warning. 6423 file_name = "unknown library"; 6424 KMP_FALLTHROUGH(); 6425 // Attention! Falling to the next case. That's intentional. 6426 case 1: { // Neighbor is alive. 6427 // Check it is allowed. 6428 char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK"); 6429 if (!__kmp_str_match_true(duplicate_ok)) { 6430 // That's not allowed. Issue fatal error. 6431 __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name), 6432 KMP_HNT(DuplicateLibrary), __kmp_msg_null); 6433 } 6434 KMP_INTERNAL_FREE(duplicate_ok); 6435 __kmp_duplicate_library_ok = 1; 6436 done = 1; // Exit the loop. 6437 } break; 6438 case 2: { // Neighbor is dead. 6439 // Clear the variable and try to register library again. 
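        // Descriptive note: unsetting the stale value leaves done == 0, so the
        // enclosing while (!done) loop immediately retries the registration
        // with this copy's own string.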
6440 __kmp_env_unset(name); 6441 } break; 6442 default: { KMP_DEBUG_ASSERT(0); } break; 6443 } 6444 } 6445 KMP_INTERNAL_FREE((void *)value); 6446 } 6447 KMP_INTERNAL_FREE((void *)name); 6448 6449 } // func __kmp_register_library_startup 6450 6451 void __kmp_unregister_library(void) { 6452 6453 char *name = __kmp_reg_status_name(); 6454 char *value = __kmp_env_get(name); 6455 6456 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6457 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6458 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6459 // Ok, this is our variable. Delete it. 6460 __kmp_env_unset(name); 6461 } 6462 6463 KMP_INTERNAL_FREE(__kmp_registration_str); 6464 KMP_INTERNAL_FREE(value); 6465 KMP_INTERNAL_FREE(name); 6466 6467 __kmp_registration_flag = 0; 6468 __kmp_registration_str = NULL; 6469 6470 } // __kmp_unregister_library 6471 6472 // End of Library registration stuff. 6473 // ----------------------------------------------------------------------------- 6474 6475 #if KMP_MIC_SUPPORTED 6476 6477 static void __kmp_check_mic_type() { 6478 kmp_cpuid_t cpuid_state = {0}; 6479 kmp_cpuid_t *cs_p = &cpuid_state; 6480 __kmp_x86_cpuid(1, 0, cs_p); 6481 // We don't support mic1 at the moment 6482 if ((cs_p->eax & 0xff0) == 0xB10) { 6483 __kmp_mic_type = mic2; 6484 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6485 __kmp_mic_type = mic3; 6486 } else { 6487 __kmp_mic_type = non_mic; 6488 } 6489 } 6490 6491 #endif /* KMP_MIC_SUPPORTED */ 6492 6493 static void __kmp_do_serial_initialize(void) { 6494 int i, gtid; 6495 int size; 6496 6497 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6498 6499 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6500 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6501 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6502 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6503 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6504 6505 #if OMPT_SUPPORT 6506 ompt_pre_init(); 6507 #endif 6508 6509 __kmp_validate_locks(); 6510 6511 /* Initialize internal memory allocator */ 6512 __kmp_init_allocator(); 6513 6514 /* Register the library startup via an environment variable and check to see 6515 whether another copy of the library is already registered. 
*/ 6516 6517 __kmp_register_library_startup(); 6518 6519 /* TODO reinitialization of library */ 6520 if (TCR_4(__kmp_global.g.g_done)) { 6521 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6522 } 6523 6524 __kmp_global.g.g_abort = 0; 6525 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6526 6527 /* initialize the locks */ 6528 #if KMP_USE_ADAPTIVE_LOCKS 6529 #if KMP_DEBUG_ADAPTIVE_LOCKS 6530 __kmp_init_speculative_stats(); 6531 #endif 6532 #endif 6533 #if KMP_STATS_ENABLED 6534 __kmp_stats_init(); 6535 #endif 6536 __kmp_init_lock(&__kmp_global_lock); 6537 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6538 __kmp_init_lock(&__kmp_debug_lock); 6539 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6540 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6541 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6542 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6543 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6544 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6545 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6546 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6547 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6548 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6549 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6550 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6551 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6552 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6553 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6554 #if KMP_USE_MONITOR 6555 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6556 #endif 6557 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6558 6559 /* conduct initialization and initial setup of configuration */ 6560 6561 __kmp_runtime_initialize(); 6562 6563 #if KMP_MIC_SUPPORTED 6564 __kmp_check_mic_type(); 6565 #endif 6566 6567 // Some global variable initialization moved here from kmp_env_initialize() 6568 #ifdef KMP_DEBUG 6569 kmp_diag = 0; 6570 #endif 6571 __kmp_abort_delay = 0; 6572 6573 // From __kmp_init_dflt_team_nth() 6574 /* assume the entire machine will be used */ 6575 __kmp_dflt_team_nth_ub = __kmp_xproc; 6576 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6577 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6578 } 6579 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6580 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6581 } 6582 __kmp_max_nth = __kmp_sys_max_nth; 6583 __kmp_cg_max_nth = __kmp_sys_max_nth; 6584 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6585 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6586 __kmp_teams_max_nth = __kmp_sys_max_nth; 6587 } 6588 6589 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6590 // part 6591 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6592 #if KMP_USE_MONITOR 6593 __kmp_monitor_wakeups = 6594 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6595 __kmp_bt_intervals = 6596 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6597 #endif 6598 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6599 __kmp_library = library_throughput; 6600 // From KMP_SCHEDULE initialization 6601 __kmp_static = kmp_sch_static_balanced; 6602 // AC: do not use analytical here, because it is non-monotonous 6603 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6604 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6605 // need to repeat assignment 6606 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6607 // bit control and barrier method control parts 6608 #if KMP_FAST_REDUCTION_BARRIER 6609 #define kmp_reduction_barrier_gather_bb ((int)1) 6610 #define kmp_reduction_barrier_release_bb ((int)1) 6611 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6612 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6613 #endif // KMP_FAST_REDUCTION_BARRIER 6614 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6615 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6616 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6617 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6618 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6619 #if KMP_FAST_REDUCTION_BARRIER 6620 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6621 // lin_64 ): hyper,1 6622 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6623 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6624 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6625 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6626 } 6627 #endif // KMP_FAST_REDUCTION_BARRIER 6628 } 6629 #if KMP_FAST_REDUCTION_BARRIER 6630 #undef kmp_reduction_barrier_release_pat 6631 #undef kmp_reduction_barrier_gather_pat 6632 #undef kmp_reduction_barrier_release_bb 6633 #undef kmp_reduction_barrier_gather_bb 6634 #endif // KMP_FAST_REDUCTION_BARRIER 6635 #if KMP_MIC_SUPPORTED 6636 if (__kmp_mic_type == mic2) { // KNC 6637 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6638 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6639 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6640 1; // forkjoin release 6641 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6642 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6643 } 6644 #if KMP_FAST_REDUCTION_BARRIER 6645 if (__kmp_mic_type == mic2) { // KNC 6646 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6647 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6648 } 6649 #endif // KMP_FAST_REDUCTION_BARRIER 6650 #endif // KMP_MIC_SUPPORTED 6651 6652 // From KMP_CHECKS initialization 6653 #ifdef KMP_DEBUG 6654 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6655 #else 6656 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6657 #endif 6658 6659 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6660 __kmp_foreign_tp = TRUE; 6661 6662 __kmp_global.g.g_dynamic = FALSE; 6663 __kmp_global.g.g_dynamic_mode = dynamic_default; 6664 6665 __kmp_env_initialize(NULL); 6666 6667 // Print all messages in message catalog for testing purposes. 
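// (In debug builds the dump below is triggered by setting the
// KMP_DUMP_CATALOG environment variable to a true value, e.g. "1".)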
6668 #ifdef KMP_DEBUG 6669 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6670 if (__kmp_str_match_true(val)) { 6671 kmp_str_buf_t buffer; 6672 __kmp_str_buf_init(&buffer); 6673 __kmp_i18n_dump_catalog(&buffer); 6674 __kmp_printf("%s", buffer.str); 6675 __kmp_str_buf_free(&buffer); 6676 } 6677 __kmp_env_free(&val); 6678 #endif 6679 6680 __kmp_threads_capacity = 6681 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6682 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6683 __kmp_tp_capacity = __kmp_default_tp_capacity( 6684 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6685 6686 // If the library is shut down properly, both pools must be NULL. Just in 6687 // case, set them to NULL -- some memory may leak, but subsequent code will 6688 // work even if pools are not freed. 6689 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6690 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6691 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6692 __kmp_thread_pool = NULL; 6693 __kmp_thread_pool_insert_pt = NULL; 6694 __kmp_team_pool = NULL; 6695 6696 /* Allocate all of the variable sized records */ 6697 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6698 * expandable */ 6699 /* Since allocation is cache-aligned, just add extra padding at the end */ 6700 size = 6701 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6702 CACHE_LINE; 6703 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6704 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6705 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6706 6707 /* init thread counts */ 6708 KMP_DEBUG_ASSERT(__kmp_all_nth == 6709 0); // Asserts fail if the library is reinitializing and 6710 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6711 __kmp_all_nth = 0; 6712 __kmp_nth = 0; 6713 6714 /* setup the uber master thread and hierarchy */ 6715 gtid = __kmp_register_root(TRUE); 6716 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6717 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6718 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6719 6720 KMP_MB(); /* Flush all pending memory write invalidates. */ 6721 6722 __kmp_common_initialize(); 6723 6724 #if KMP_OS_UNIX 6725 /* invoke the child fork handler */ 6726 __kmp_register_atfork(); 6727 #endif 6728 6729 #if !KMP_DYNAMIC_LIB 6730 { 6731 /* Invoke the exit handler when the program finishes, only for static 6732 library. For dynamic library, we already have _fini and DllMain. */ 6733 int rc = atexit(__kmp_internal_end_atexit); 6734 if (rc != 0) { 6735 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6736 __kmp_msg_null); 6737 } 6738 } 6739 #endif 6740 6741 #if KMP_HANDLE_SIGNALS 6742 #if KMP_OS_UNIX 6743 /* NOTE: make sure that this is called before the user installs their own 6744 signal handlers so that the user handlers are called first. this way they 6745 can return false, not call our handler, avoid terminating the library, and 6746 continue execution where they left off. 
*/ 6747 __kmp_install_signals(FALSE); 6748 #endif /* KMP_OS_UNIX */ 6749 #if KMP_OS_WINDOWS 6750 __kmp_install_signals(TRUE); 6751 #endif /* KMP_OS_WINDOWS */ 6752 #endif 6753 6754 /* we have finished the serial initialization */ 6755 __kmp_init_counter++; 6756 6757 __kmp_init_serial = TRUE; 6758 6759 if (__kmp_settings) { 6760 __kmp_env_print(); 6761 } 6762 6763 #if OMP_40_ENABLED 6764 if (__kmp_display_env || __kmp_display_env_verbose) { 6765 __kmp_env_print_2(); 6766 } 6767 #endif // OMP_40_ENABLED 6768 6769 #if OMPT_SUPPORT 6770 ompt_post_init(); 6771 #endif 6772 6773 KMP_MB(); 6774 6775 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6776 } 6777 6778 void __kmp_serial_initialize(void) { 6779 if (__kmp_init_serial) { 6780 return; 6781 } 6782 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6783 if (__kmp_init_serial) { 6784 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6785 return; 6786 } 6787 __kmp_do_serial_initialize(); 6788 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6789 } 6790 6791 static void __kmp_do_middle_initialize(void) { 6792 int i, j; 6793 int prev_dflt_team_nth; 6794 6795 if (!__kmp_init_serial) { 6796 __kmp_do_serial_initialize(); 6797 } 6798 6799 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6800 6801 // Save the previous value for the __kmp_dflt_team_nth so that 6802 // we can avoid some reinitialization if it hasn't changed. 6803 prev_dflt_team_nth = __kmp_dflt_team_nth; 6804 6805 #if KMP_AFFINITY_SUPPORTED 6806 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6807 // number of cores on the machine. 6808 __kmp_affinity_initialize(); 6809 6810 // Run through the __kmp_threads array and set the affinity mask 6811 // for each root thread that is currently registered with the RTL. 6812 for (i = 0; i < __kmp_threads_capacity; i++) { 6813 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6814 __kmp_affinity_set_init_mask(i, TRUE); 6815 } 6816 } 6817 #endif /* KMP_AFFINITY_SUPPORTED */ 6818 6819 KMP_ASSERT(__kmp_xproc > 0); 6820 if (__kmp_avail_proc == 0) { 6821 __kmp_avail_proc = __kmp_xproc; 6822 } 6823 6824 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6825 // correct them now 6826 j = 0; 6827 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6828 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6829 __kmp_avail_proc; 6830 j++; 6831 } 6832 6833 if (__kmp_dflt_team_nth == 0) { 6834 #ifdef KMP_DFLT_NTH_CORES 6835 // Default #threads = #cores 6836 __kmp_dflt_team_nth = __kmp_ncores; 6837 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6838 "__kmp_ncores (%d)\n", 6839 __kmp_dflt_team_nth)); 6840 #else 6841 // Default #threads = #available OS procs 6842 __kmp_dflt_team_nth = __kmp_avail_proc; 6843 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6844 "__kmp_avail_proc(%d)\n", 6845 __kmp_dflt_team_nth)); 6846 #endif /* KMP_DFLT_NTH_CORES */ 6847 } 6848 6849 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 6850 __kmp_dflt_team_nth = KMP_MIN_NTH; 6851 } 6852 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 6853 __kmp_dflt_team_nth = __kmp_sys_max_nth; 6854 } 6855 6856 // There's no harm in continuing if the following check fails, 6857 // but it indicates an error in the previous logic. 
6858 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 6859 6860 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 6861 // Run through the __kmp_threads array and set the num threads icv for each 6862 // root thread that is currently registered with the RTL (which has not 6863 // already explicitly set its nthreads-var with a call to 6864 // omp_set_num_threads()). 6865 for (i = 0; i < __kmp_threads_capacity; i++) { 6866 kmp_info_t *thread = __kmp_threads[i]; 6867 if (thread == NULL) 6868 continue; 6869 if (thread->th.th_current_task->td_icvs.nproc != 0) 6870 continue; 6871 6872 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 6873 } 6874 } 6875 KA_TRACE( 6876 20, 6877 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 6878 __kmp_dflt_team_nth)); 6879 6880 #ifdef KMP_ADJUST_BLOCKTIME 6881 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 6882 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6883 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6884 if (__kmp_nth > __kmp_avail_proc) { 6885 __kmp_zero_bt = TRUE; 6886 } 6887 } 6888 #endif /* KMP_ADJUST_BLOCKTIME */ 6889 6890 /* we have finished middle initialization */ 6891 TCW_SYNC_4(__kmp_init_middle, TRUE); 6892 6893 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 6894 } 6895 6896 void __kmp_middle_initialize(void) { 6897 if (__kmp_init_middle) { 6898 return; 6899 } 6900 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6901 if (__kmp_init_middle) { 6902 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6903 return; 6904 } 6905 __kmp_do_middle_initialize(); 6906 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6907 } 6908 6909 void __kmp_parallel_initialize(void) { 6910 int gtid = __kmp_entry_gtid(); // this might be a new root 6911 6912 /* synchronize parallel initialization (for sibling) */ 6913 if (TCR_4(__kmp_init_parallel)) 6914 return; 6915 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6916 if (TCR_4(__kmp_init_parallel)) { 6917 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6918 return; 6919 } 6920 6921 /* TODO reinitialization after we have already shut down */ 6922 if (TCR_4(__kmp_global.g.g_done)) { 6923 KA_TRACE( 6924 10, 6925 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 6926 __kmp_infinite_loop(); 6927 } 6928 6929 /* jc: The lock __kmp_initz_lock is already held, so calling 6930 __kmp_serial_initialize would cause a deadlock. So we call 6931 __kmp_do_serial_initialize directly. */ 6932 if (!__kmp_init_middle) { 6933 __kmp_do_middle_initialize(); 6934 } 6935 6936 #if OMP_50_ENABLED 6937 __kmp_resume_if_hard_paused(); 6938 #endif 6939 6940 /* begin initialization */ 6941 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 6942 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6943 6944 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6945 // Save the FP control regs. 6946 // Worker threads will set theirs to these values at thread startup. 
6947 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 6948 __kmp_store_mxcsr(&__kmp_init_mxcsr); 6949 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 6950 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 6951 6952 #if KMP_OS_UNIX 6953 #if KMP_HANDLE_SIGNALS 6954 /* must be after __kmp_serial_initialize */ 6955 __kmp_install_signals(TRUE); 6956 #endif 6957 #endif 6958 6959 __kmp_suspend_initialize(); 6960 6961 #if defined(USE_LOAD_BALANCE) 6962 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6963 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 6964 } 6965 #else 6966 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6967 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 6968 } 6969 #endif 6970 6971 if (__kmp_version) { 6972 __kmp_print_version_2(); 6973 } 6974 6975 /* we have finished parallel initialization */ 6976 TCW_SYNC_4(__kmp_init_parallel, TRUE); 6977 6978 KMP_MB(); 6979 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 6980 6981 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6982 } 6983 6984 /* ------------------------------------------------------------------------ */ 6985 6986 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6987 kmp_team_t *team) { 6988 kmp_disp_t *dispatch; 6989 6990 KMP_MB(); 6991 6992 /* none of the threads have encountered any constructs, yet. */ 6993 this_thr->th.th_local.this_construct = 0; 6994 #if KMP_CACHE_MANAGE 6995 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 6996 #endif /* KMP_CACHE_MANAGE */ 6997 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 6998 KMP_DEBUG_ASSERT(dispatch); 6999 KMP_DEBUG_ASSERT(team->t.t_dispatch); 7000 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 7001 // this_thr->th.th_info.ds.ds_tid ] ); 7002 7003 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 7004 #if OMP_45_ENABLED 7005 dispatch->th_doacross_buf_idx = 7006 0; /* reset the doacross dispatch buffer counter */ 7007 #endif 7008 if (__kmp_env_consistency_check) 7009 __kmp_push_parallel(gtid, team->t.t_ident); 7010 7011 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7012 } 7013 7014 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 7015 kmp_team_t *team) { 7016 if (__kmp_env_consistency_check) 7017 __kmp_pop_parallel(gtid, team->t.t_ident); 7018 7019 __kmp_finish_implicit_task(this_thr); 7020 } 7021 7022 int __kmp_invoke_task_func(int gtid) { 7023 int rc; 7024 int tid = __kmp_tid_from_gtid(gtid); 7025 kmp_info_t *this_thr = __kmp_threads[gtid]; 7026 kmp_team_t *team = this_thr->th.th_team; 7027 7028 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 7029 #if USE_ITT_BUILD 7030 if (__itt_stack_caller_create_ptr) { 7031 __kmp_itt_stack_callee_enter( 7032 (__itt_caller) 7033 team->t.t_stack_id); // inform ittnotify about entering user's code 7034 } 7035 #endif /* USE_ITT_BUILD */ 7036 #if INCLUDE_SSC_MARKS 7037 SSC_MARK_INVOKING(); 7038 #endif 7039 7040 #if OMPT_SUPPORT 7041 void *dummy; 7042 void **exit_runtime_p; 7043 ompt_data_t *my_task_data; 7044 ompt_data_t *my_parallel_data; 7045 int ompt_team_size; 7046 7047 if (ompt_enabled.enabled) { 7048 exit_runtime_p = &( 7049 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame.ptr); 7050 } else { 7051 exit_runtime_p = &dummy; 7052 } 7053 7054 my_task_data = 7055 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 7056 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 7057 if (ompt_enabled.ompt_callback_implicit_task) { 7058 ompt_team_size = team->t.t_nproc; 7059 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7060 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 7061 __kmp_tid_from_gtid(gtid), ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7062 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 7063 } 7064 #endif 7065 7066 { 7067 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 7068 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 7069 rc = 7070 __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 7071 tid, (int)team->t.t_argc, (void **)team->t.t_argv 7072 #if OMPT_SUPPORT 7073 , 7074 exit_runtime_p 7075 #endif 7076 ); 7077 #if OMPT_SUPPORT 7078 *exit_runtime_p = NULL; 7079 #endif 7080 } 7081 7082 #if USE_ITT_BUILD 7083 if (__itt_stack_caller_create_ptr) { 7084 __kmp_itt_stack_callee_leave( 7085 (__itt_caller) 7086 team->t.t_stack_id); // inform ittnotify about leaving user's code 7087 } 7088 #endif /* USE_ITT_BUILD */ 7089 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 7090 7091 return rc; 7092 } 7093 7094 #if OMP_40_ENABLED 7095 void __kmp_teams_master(int gtid) { 7096 // This routine is called by all master threads in teams construct 7097 kmp_info_t *thr = __kmp_threads[gtid]; 7098 kmp_team_t *team = thr->th.th_team; 7099 ident_t *loc = team->t.t_ident; 7100 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 7101 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 7102 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 7103 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 7104 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 7105 // Launch league of teams now, but not let workers execute 7106 // (they hang on fork barrier until next parallel) 7107 #if INCLUDE_SSC_MARKS 7108 SSC_MARK_FORKING(); 7109 #endif 7110 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7111 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7112 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7113 #if INCLUDE_SSC_MARKS 7114 SSC_MARK_JOINING(); 7115 #endif 7116 7117 // AC: last parameter "1" eliminates join barrier which won't work because 
7118 // worker threads are in a fork barrier waiting for more parallel regions 7119 __kmp_join_call(loc, gtid 7120 #if OMPT_SUPPORT 7121 , 7122 fork_context_intel 7123 #endif 7124 , 7125 1); 7126 } 7127 7128 int __kmp_invoke_teams_master(int gtid) { 7129 kmp_info_t *this_thr = __kmp_threads[gtid]; 7130 kmp_team_t *team = this_thr->th.th_team; 7131 #if KMP_DEBUG 7132 if (!__kmp_threads[gtid]->th.th_team->t.t_serialized) 7133 KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn == 7134 (void *)__kmp_teams_master); 7135 #endif 7136 __kmp_run_before_invoked_task(gtid, 0, this_thr, team); 7137 __kmp_teams_master(gtid); 7138 __kmp_run_after_invoked_task(gtid, 0, this_thr, team); 7139 return 1; 7140 } 7141 #endif /* OMP_40_ENABLED */ 7142 7143 /* this sets the requested number of threads for the next parallel region 7144 encountered by this team. since this should be enclosed in the forkjoin 7145 critical section it should avoid race conditions with assymmetrical nested 7146 parallelism */ 7147 7148 void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) { 7149 kmp_info_t *thr = __kmp_threads[gtid]; 7150 7151 if (num_threads > 0) 7152 thr->th.th_set_nproc = num_threads; 7153 } 7154 7155 #if OMP_40_ENABLED 7156 7157 /* this sets the requested number of teams for the teams region and/or 7158 the number of threads for the next parallel region encountered */ 7159 void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams, 7160 int num_threads) { 7161 kmp_info_t *thr = __kmp_threads[gtid]; 7162 KMP_DEBUG_ASSERT(num_teams >= 0); 7163 KMP_DEBUG_ASSERT(num_threads >= 0); 7164 7165 if (num_teams == 0) 7166 num_teams = 1; // default number of teams is 1. 7167 if (num_teams > __kmp_teams_max_nth) { // if too many teams requested? 7168 if (!__kmp_reserve_warn) { 7169 __kmp_reserve_warn = 1; 7170 __kmp_msg(kmp_ms_warning, 7171 KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth), 7172 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7173 } 7174 num_teams = __kmp_teams_max_nth; 7175 } 7176 // Set number of teams (number of threads in the outer "parallel" of the 7177 // teams) 7178 thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams; 7179 7180 // Remember the number of threads for inner parallel regions 7181 if (num_threads == 0) { 7182 if (!TCR_4(__kmp_init_middle)) 7183 __kmp_middle_initialize(); // get __kmp_avail_proc calculated 7184 num_threads = __kmp_avail_proc / num_teams; 7185 if (num_teams * num_threads > __kmp_teams_max_nth) { 7186 // adjust num_threads w/o warning as it is not user setting 7187 num_threads = __kmp_teams_max_nth / num_teams; 7188 } 7189 } else { 7190 if (num_teams * num_threads > __kmp_teams_max_nth) { 7191 int new_threads = __kmp_teams_max_nth / num_teams; 7192 if (!__kmp_reserve_warn) { // user asked for too many threads 7193 __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT 7194 __kmp_msg(kmp_ms_warning, 7195 KMP_MSG(CantFormThrTeam, num_threads, new_threads), 7196 KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null); 7197 } 7198 num_threads = new_threads; 7199 } 7200 } 7201 thr->th.th_teams_size.nth = num_threads; 7202 } 7203 7204 // Set the proc_bind var to use in the following parallel region. 7205 void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) { 7206 kmp_info_t *thr = __kmp_threads[gtid]; 7207 thr->th.th_set_proc_bind = proc_bind; 7208 } 7209 7210 #endif /* OMP_40_ENABLED */ 7211 7212 /* Launch the worker threads into the microtask. 
*/ 7213 7214 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7215 kmp_info_t *this_thr = __kmp_threads[gtid]; 7216 7217 #ifdef KMP_DEBUG 7218 int f; 7219 #endif /* KMP_DEBUG */ 7220 7221 KMP_DEBUG_ASSERT(team); 7222 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7223 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7224 KMP_MB(); /* Flush all pending memory write invalidates. */ 7225 7226 team->t.t_construct = 0; /* no single directives seen yet */ 7227 team->t.t_ordered.dt.t_value = 7228 0; /* thread 0 enters the ordered section first */ 7229 7230 /* Reset the identifiers on the dispatch buffer */ 7231 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7232 if (team->t.t_max_nproc > 1) { 7233 int i; 7234 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7235 team->t.t_disp_buffer[i].buffer_index = i; 7236 #if OMP_45_ENABLED 7237 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7238 #endif 7239 } 7240 } else { 7241 team->t.t_disp_buffer[0].buffer_index = 0; 7242 #if OMP_45_ENABLED 7243 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7244 #endif 7245 } 7246 7247 KMP_MB(); /* Flush all pending memory write invalidates. */ 7248 KMP_ASSERT(this_thr->th.th_team == team); 7249 7250 #ifdef KMP_DEBUG 7251 for (f = 0; f < team->t.t_nproc; f++) { 7252 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7253 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7254 } 7255 #endif /* KMP_DEBUG */ 7256 7257 /* release the worker threads so they may begin working */ 7258 __kmp_fork_barrier(gtid, 0); 7259 } 7260 7261 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7262 kmp_info_t *this_thr = __kmp_threads[gtid]; 7263 7264 KMP_DEBUG_ASSERT(team); 7265 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7266 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7267 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 7268 7269 /* Join barrier after fork */ 7270 7271 #ifdef KMP_DEBUG 7272 if (__kmp_threads[gtid] && 7273 __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) { 7274 __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid, 7275 __kmp_threads[gtid]); 7276 __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, " 7277 "team->t.t_nproc=%d\n", 7278 gtid, __kmp_threads[gtid]->th.th_team_nproc, team, 7279 team->t.t_nproc); 7280 __kmp_print_structure(); 7281 } 7282 KMP_DEBUG_ASSERT(__kmp_threads[gtid] && 7283 __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc); 7284 #endif /* KMP_DEBUG */ 7285 7286 __kmp_join_barrier(gtid); /* wait for everyone */ 7287 #if OMPT_SUPPORT 7288 if (ompt_enabled.enabled && 7289 this_thr->th.ompt_thread_info.state == ompt_state_wait_barrier_implicit) { 7290 int ds_tid = this_thr->th.th_info.ds.ds_tid; 7291 ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr); 7292 this_thr->th.ompt_thread_info.state = ompt_state_overhead; 7293 #if OMPT_OPTIONAL 7294 void *codeptr = NULL; 7295 if (KMP_MASTER_TID(ds_tid) && 7296 (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) || 7297 ompt_callbacks.ompt_callback(ompt_callback_sync_region))) 7298 codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address; 7299 7300 if (ompt_enabled.ompt_callback_sync_region_wait) { 7301 ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)( 7302 ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr); 7303 } 7304 if (ompt_enabled.ompt_callback_sync_region) { 7305 ompt_callbacks.ompt_callback(ompt_callback_sync_region)( 7306 ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr); 7307 } 7308 #endif 7309 if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) { 7310 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 7311 ompt_scope_end, NULL, task_data, 0, ds_tid, ompt_task_implicit); // TODO: Can this be ompt_task_initial? 7312 } 7313 } 7314 #endif 7315 7316 KMP_MB(); /* Flush all pending memory write invalidates. */ 7317 KMP_ASSERT(this_thr->th.th_team == team); 7318 } 7319 7320 /* ------------------------------------------------------------------------ */ 7321 7322 #ifdef USE_LOAD_BALANCE 7323 7324 // Return the worker threads actively spinning in the hot team, if we 7325 // are at the outermost level of parallelism. Otherwise, return 0. 7326 static int __kmp_active_hot_team_nproc(kmp_root_t *root) { 7327 int i; 7328 int retval; 7329 kmp_team_t *hot_team; 7330 7331 if (root->r.r_active) { 7332 return 0; 7333 } 7334 hot_team = root->r.r_hot_team; 7335 if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) { 7336 return hot_team->t.t_nproc - 1; // Don't count master thread 7337 } 7338 7339 // Skip the master thread - it is accounted for elsewhere. 7340 retval = 0; 7341 for (i = 1; i < hot_team->t.t_nproc; i++) { 7342 if (hot_team->t.t_threads[i]->th.th_active) { 7343 retval++; 7344 } 7345 } 7346 return retval; 7347 } 7348 7349 // Perform an automatic adjustment to the number of 7350 // threads used by the next parallel region. 
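// Illustrative arithmetic (made-up numbers, not taken from any measurement):
// with __kmp_avail_proc = 8, two threads idling in the thread pool, three
// workers active in the hot team plus the calling thread itself,
// team_curr_active is 2 + 3 + 1 = 6. If __kmp_get_load_balance() reports 7
// runnable threads system-wide, the routine proposes 8 - 7 + 6 = 7 threads,
// and the result is then clamped to the range [KMP_MIN_NTH, set_nproc].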
7351 static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) { 7352 int retval; 7353 int pool_active; 7354 int hot_team_active; 7355 int team_curr_active; 7356 int system_active; 7357 7358 KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root, 7359 set_nproc)); 7360 KMP_DEBUG_ASSERT(root); 7361 KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0] 7362 ->th.th_current_task->td_icvs.dynamic == TRUE); 7363 KMP_DEBUG_ASSERT(set_nproc > 1); 7364 7365 if (set_nproc == 1) { 7366 KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n")); 7367 return 1; 7368 } 7369 7370 // Threads that are active in the thread pool, active in the hot team for this 7371 // particular root (if we are at the outer par level), and the currently 7372 // executing thread (to become the master) are available to add to the new 7373 // team, but are currently contributing to the system load, and must be 7374 // accounted for. 7375 pool_active = __kmp_thread_pool_active_nth; 7376 hot_team_active = __kmp_active_hot_team_nproc(root); 7377 team_curr_active = pool_active + hot_team_active + 1; 7378 7379 // Check the system load. 7380 system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active); 7381 KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d " 7382 "hot team active = %d\n", 7383 system_active, pool_active, hot_team_active)); 7384 7385 if (system_active < 0) { 7386 // There was an error reading the necessary info from /proc, so use the 7387 // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode 7388 // = dynamic_thread_limit, we shouldn't wind up getting back here. 7389 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 7390 KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit"); 7391 7392 // Make this call behave like the thread limit algorithm. 7393 retval = __kmp_avail_proc - __kmp_nth + 7394 (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc); 7395 if (retval > set_nproc) { 7396 retval = set_nproc; 7397 } 7398 if (retval < KMP_MIN_NTH) { 7399 retval = KMP_MIN_NTH; 7400 } 7401 7402 KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n", 7403 retval)); 7404 return retval; 7405 } 7406 7407 // There is a slight delay in the load balance algorithm in detecting new 7408 // running procs. The real system load at this instant should be at least as 7409 // large as the #active omp thread that are available to add to the team. 7410 if (system_active < team_curr_active) { 7411 system_active = team_curr_active; 7412 } 7413 retval = __kmp_avail_proc - system_active + team_curr_active; 7414 if (retval > set_nproc) { 7415 retval = set_nproc; 7416 } 7417 if (retval < KMP_MIN_NTH) { 7418 retval = KMP_MIN_NTH; 7419 } 7420 7421 KB_TRACE(20, ("__kmp_load_balance_nproc: exit. 
retval:%d\n", retval)); 7422 return retval; 7423 } // __kmp_load_balance_nproc() 7424 7425 #endif /* USE_LOAD_BALANCE */ 7426 7427 /* ------------------------------------------------------------------------ */ 7428 7429 /* NOTE: this is called with the __kmp_init_lock held */ 7430 void __kmp_cleanup(void) { 7431 int f; 7432 7433 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7434 7435 if (TCR_4(__kmp_init_parallel)) { 7436 #if KMP_HANDLE_SIGNALS 7437 __kmp_remove_signals(); 7438 #endif 7439 TCW_4(__kmp_init_parallel, FALSE); 7440 } 7441 7442 if (TCR_4(__kmp_init_middle)) { 7443 #if KMP_AFFINITY_SUPPORTED 7444 __kmp_affinity_uninitialize(); 7445 #endif /* KMP_AFFINITY_SUPPORTED */ 7446 __kmp_cleanup_hierarchy(); 7447 TCW_4(__kmp_init_middle, FALSE); 7448 } 7449 7450 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7451 7452 if (__kmp_init_serial) { 7453 __kmp_runtime_destroy(); 7454 __kmp_init_serial = FALSE; 7455 } 7456 7457 __kmp_cleanup_threadprivate_caches(); 7458 7459 for (f = 0; f < __kmp_threads_capacity; f++) { 7460 if (__kmp_root[f] != NULL) { 7461 __kmp_free(__kmp_root[f]); 7462 __kmp_root[f] = NULL; 7463 } 7464 } 7465 __kmp_free(__kmp_threads); 7466 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7467 // there is no need in freeing __kmp_root. 7468 __kmp_threads = NULL; 7469 __kmp_root = NULL; 7470 __kmp_threads_capacity = 0; 7471 7472 #if KMP_USE_DYNAMIC_LOCK 7473 __kmp_cleanup_indirect_user_locks(); 7474 #else 7475 __kmp_cleanup_user_locks(); 7476 #endif 7477 7478 #if KMP_AFFINITY_SUPPORTED 7479 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7480 __kmp_cpuinfo_file = NULL; 7481 #endif /* KMP_AFFINITY_SUPPORTED */ 7482 7483 #if KMP_USE_ADAPTIVE_LOCKS 7484 #if KMP_DEBUG_ADAPTIVE_LOCKS 7485 __kmp_print_speculative_stats(); 7486 #endif 7487 #endif 7488 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7489 __kmp_nested_nth.nth = NULL; 7490 __kmp_nested_nth.size = 0; 7491 __kmp_nested_nth.used = 0; 7492 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7493 __kmp_nested_proc_bind.bind_types = NULL; 7494 __kmp_nested_proc_bind.size = 0; 7495 __kmp_nested_proc_bind.used = 0; 7496 #if OMP_50_ENABLED 7497 if (__kmp_affinity_format) { 7498 KMP_INTERNAL_FREE(__kmp_affinity_format); 7499 __kmp_affinity_format = NULL; 7500 } 7501 #endif 7502 7503 __kmp_i18n_catclose(); 7504 7505 #if KMP_USE_HIER_SCHED 7506 __kmp_hier_scheds.deallocate(); 7507 #endif 7508 7509 #if KMP_STATS_ENABLED 7510 __kmp_stats_fini(); 7511 #endif 7512 7513 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7514 } 7515 7516 /* ------------------------------------------------------------------------ */ 7517 7518 int __kmp_ignore_mppbeg(void) { 7519 char *env; 7520 7521 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7522 if (__kmp_str_match_false(env)) 7523 return FALSE; 7524 } 7525 // By default __kmpc_begin() is no-op. 7526 return TRUE; 7527 } 7528 7529 int __kmp_ignore_mppend(void) { 7530 char *env; 7531 7532 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7533 if (__kmp_str_match_false(env)) 7534 return FALSE; 7535 } 7536 // By default __kmpc_end() is no-op. 
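// (In other words, __kmpc_end() only does real work when KMP_IGNORE_MPPEND is
// explicitly set to a false value, mirroring the KMP_IGNORE_MPPBEG handling
// above.)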
7537 return TRUE; 7538 } 7539 7540 void __kmp_internal_begin(void) { 7541 int gtid; 7542 kmp_root_t *root; 7543 7544 /* this is a very important step as it will register new sibling threads 7545 and assign these new uber threads a new gtid */ 7546 gtid = __kmp_entry_gtid(); 7547 root = __kmp_threads[gtid]->th.th_root; 7548 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7549 7550 if (root->r.r_begin) 7551 return; 7552 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7553 if (root->r.r_begin) { 7554 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7555 return; 7556 } 7557 7558 root->r.r_begin = TRUE; 7559 7560 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7561 } 7562 7563 /* ------------------------------------------------------------------------ */ 7564 7565 void __kmp_user_set_library(enum library_type arg) { 7566 int gtid; 7567 kmp_root_t *root; 7568 kmp_info_t *thread; 7569 7570 /* first, make sure we are initialized so we can get our gtid */ 7571 7572 gtid = __kmp_entry_gtid(); 7573 thread = __kmp_threads[gtid]; 7574 7575 root = thread->th.th_root; 7576 7577 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7578 library_serial)); 7579 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7580 thread */ 7581 KMP_WARNING(SetLibraryIncorrectCall); 7582 return; 7583 } 7584 7585 switch (arg) { 7586 case library_serial: 7587 thread->th.th_set_nproc = 0; 7588 set__nproc(thread, 1); 7589 break; 7590 case library_turnaround: 7591 thread->th.th_set_nproc = 0; 7592 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7593 : __kmp_dflt_team_nth_ub); 7594 break; 7595 case library_throughput: 7596 thread->th.th_set_nproc = 0; 7597 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7598 : __kmp_dflt_team_nth_ub); 7599 break; 7600 default: 7601 KMP_FATAL(UnknownLibraryType, arg); 7602 } 7603 7604 __kmp_aux_set_library(arg); 7605 } 7606 7607 void __kmp_aux_set_stacksize(size_t arg) { 7608 if (!__kmp_init_serial) 7609 __kmp_serial_initialize(); 7610 7611 #if KMP_OS_DARWIN 7612 if (arg & (0x1000 - 1)) { 7613 arg &= ~(0x1000 - 1); 7614 if (arg + 0x1000) /* check for overflow if we round up */ 7615 arg += 0x1000; 7616 } 7617 #endif 7618 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7619 7620 /* only change the default stacksize before the first parallel region */ 7621 if (!TCR_4(__kmp_init_parallel)) { 7622 size_t value = arg; /* argument is in bytes */ 7623 7624 if (value < __kmp_sys_min_stksize) 7625 value = __kmp_sys_min_stksize; 7626 else if (value > KMP_MAX_STKSIZE) 7627 value = KMP_MAX_STKSIZE; 7628 7629 __kmp_stksize = value; 7630 7631 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7632 } 7633 7634 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7635 } 7636 7637 /* set the behaviour of the runtime library */ 7638 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7639 void __kmp_aux_set_library(enum library_type arg) { 7640 __kmp_library = arg; 7641 7642 switch (__kmp_library) { 7643 case library_serial: { 7644 KMP_INFORM(LibraryIsSerial); 7645 (void)__kmp_change_library(TRUE); 7646 } break; 7647 case library_turnaround: 7648 (void)__kmp_change_library(TRUE); 7649 break; 7650 case library_throughput: 7651 (void)__kmp_change_library(FALSE); 7652 break; 7653 default: 7654 KMP_FATAL(UnknownLibraryType, arg); 7655 } 7656 } 7657 7658 /* Getting team information common for all team API */ 7659 // Returns NULL if not in teams construct 7660 static kmp_team_t *__kmp_aux_get_team_info(int &teams_serialized) { 7661 kmp_info_t *thr = __kmp_entry_thread(); 7662 teams_serialized = 0; 7663 if (thr->th.th_teams_microtask) { 7664 kmp_team_t *team = thr->th.th_team; 7665 int tlevel = thr->th.th_teams_level; // the level of the teams construct 7666 int ii = team->t.t_level; 7667 teams_serialized = team->t.t_serialized; 7668 int level = tlevel + 1; 7669 KMP_DEBUG_ASSERT(ii >= tlevel); 7670 while (ii > level) { 7671 for (teams_serialized = team->t.t_serialized; 7672 (teams_serialized > 0) && (ii > level); teams_serialized--, ii--) { 7673 } 7674 if (team->t.t_serialized && (!teams_serialized)) { 7675 team = team->t.t_parent; 7676 continue; 7677 } 7678 if (ii > level) { 7679 team = team->t.t_parent; 7680 ii--; 7681 } 7682 } 7683 return team; 7684 } 7685 return NULL; 7686 } 7687 7688 int __kmp_aux_get_team_num() { 7689 int serialized; 7690 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7691 if (team) { 7692 if (serialized > 1) { 7693 return 0; // teams region is serialized ( 1 team of 1 thread ). 7694 } else { 7695 return team->t.t_master_tid; 7696 } 7697 } 7698 return 0; 7699 } 7700 7701 int __kmp_aux_get_num_teams() { 7702 int serialized; 7703 kmp_team_t *team = __kmp_aux_get_team_info(serialized); 7704 if (team) { 7705 if (serialized > 1) { 7706 return 1; 7707 } else { 7708 return team->t.t_parent->t.t_nproc; 7709 } 7710 } 7711 return 1; 7712 } 7713 7714 /* ------------------------------------------------------------------------ */ 7715 7716 #if OMP_50_ENABLED 7717 /* 7718 * Affinity Format Parser 7719 * 7720 * Field is in form of: %[[[0].]size]type 7721 * % and type are required (%% means print a literal '%') 7722 * type is either single char or long name surrounded by {}, 7723 * e.g., N or {num_threads} 7724 * 0 => leading zeros 7725 * . => right justified when size is specified 7726 * by default output is left justified 7727 * size is the *minimum* field length 7728 * All other characters are printed as is 7729 * 7730 * Available field types: 7731 * L {thread_level} - omp_get_level() 7732 * n {thread_num} - omp_get_thread_num() 7733 * h {host} - name of host machine 7734 * P {process_id} - process id (integer) 7735 * T {thread_identifier} - native thread identifier (integer) 7736 * N {num_threads} - omp_get_num_threads() 7737 * A {ancestor_tnum} - omp_get_ancestor_thread_num(omp_get_level()-1) 7738 * a {thread_affinity} - comma separated list of integers or integer ranges 7739 * (values of affinity mask) 7740 * 7741 * Implementation-specific field types can be added 7742 * If a type is unknown, print "undefined" 7743 */ 7744 7745 // Structure holding the short name, long name, and corresponding data type 7746 // for snprintf. A table of these will represent the entire valid keyword 7747 // field types. 
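// For example (illustrative): the entry {'n', "thread_num", 'd'} below means
// that both "%n" and "%{thread_num}" are canonicalized to the short name 'n'
// and printed through a "%d"-style snprintf format, so a format string such
// as "thread %0.3n of %N" would expand to something like "thread 002 of 8".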
7748 typedef struct kmp_affinity_format_field_t { 7749 char short_name; // from spec e.g., L -> thread level 7750 const char *long_name; // from spec thread_level -> thread level 7751 char field_format; // data type for snprintf (typically 'd' or 's' 7752 // for integer or string) 7753 } kmp_affinity_format_field_t; 7754 7755 static const kmp_affinity_format_field_t __kmp_affinity_format_table[] = { 7756 #if KMP_AFFINITY_SUPPORTED 7757 {'A', "thread_affinity", 's'}, 7758 #endif 7759 {'t', "team_num", 'd'}, 7760 {'T', "num_teams", 'd'}, 7761 {'L', "nesting_level", 'd'}, 7762 {'n', "thread_num", 'd'}, 7763 {'N', "num_threads", 'd'}, 7764 {'a', "ancestor_tnum", 'd'}, 7765 {'H', "host", 's'}, 7766 {'P', "process_id", 'd'}, 7767 {'i', "native_thread_id", 'd'}}; 7768 7769 // Return the number of characters it takes to hold field 7770 static int __kmp_aux_capture_affinity_field(int gtid, const kmp_info_t *th, 7771 const char **ptr, 7772 kmp_str_buf_t *field_buffer) { 7773 int rc, format_index, field_value; 7774 const char *width_left, *width_right; 7775 bool pad_zeros, right_justify, parse_long_name, found_valid_name; 7776 static const int FORMAT_SIZE = 20; 7777 char format[FORMAT_SIZE] = {0}; 7778 char absolute_short_name = 0; 7779 7780 KMP_DEBUG_ASSERT(gtid >= 0); 7781 KMP_DEBUG_ASSERT(th); 7782 KMP_DEBUG_ASSERT(**ptr == '%'); 7783 KMP_DEBUG_ASSERT(field_buffer); 7784 7785 __kmp_str_buf_clear(field_buffer); 7786 7787 // Skip the initial % 7788 (*ptr)++; 7789 7790 // Check for %% first 7791 if (**ptr == '%') { 7792 __kmp_str_buf_cat(field_buffer, "%", 1); 7793 (*ptr)++; // skip over the second % 7794 return 1; 7795 } 7796 7797 // Parse field modifiers if they are present 7798 pad_zeros = false; 7799 if (**ptr == '0') { 7800 pad_zeros = true; 7801 (*ptr)++; // skip over 0 7802 } 7803 right_justify = false; 7804 if (**ptr == '.') { 7805 right_justify = true; 7806 (*ptr)++; // skip over . 7807 } 7808 // Parse width of field: [width_left, width_right) 7809 width_left = width_right = NULL; 7810 if (**ptr >= '0' && **ptr <= '9') { 7811 width_left = *ptr; 7812 SKIP_DIGITS(*ptr); 7813 width_right = *ptr; 7814 } 7815 7816 // Create the format for KMP_SNPRINTF based on flags parsed above 7817 format_index = 0; 7818 format[format_index++] = '%'; 7819 if (!right_justify) 7820 format[format_index++] = '-'; 7821 if (pad_zeros) 7822 format[format_index++] = '0'; 7823 if (width_left && width_right) { 7824 int i = 0; 7825 // Only allow 8 digit number widths. 
7826 // This also prevents overflowing format variable 7827 while (i < 8 && width_left < width_right) { 7828 format[format_index++] = *width_left; 7829 width_left++; 7830 i++; 7831 } 7832 } 7833 7834 // Parse a name (long or short) 7835 // Canonicalize the name into absolute_short_name 7836 found_valid_name = false; 7837 parse_long_name = (**ptr == '{'); 7838 if (parse_long_name) 7839 (*ptr)++; // skip initial left brace 7840 for (size_t i = 0; i < sizeof(__kmp_affinity_format_table) / 7841 sizeof(__kmp_affinity_format_table[0]); 7842 ++i) { 7843 char short_name = __kmp_affinity_format_table[i].short_name; 7844 const char *long_name = __kmp_affinity_format_table[i].long_name; 7845 char field_format = __kmp_affinity_format_table[i].field_format; 7846 if (parse_long_name) { 7847 int length = KMP_STRLEN(long_name); 7848 if (strncmp(*ptr, long_name, length) == 0) { 7849 found_valid_name = true; 7850 (*ptr) += length; // skip the long name 7851 } 7852 } else if (**ptr == short_name) { 7853 found_valid_name = true; 7854 (*ptr)++; // skip the short name 7855 } 7856 if (found_valid_name) { 7857 format[format_index++] = field_format; 7858 format[format_index++] = '\0'; 7859 absolute_short_name = short_name; 7860 break; 7861 } 7862 } 7863 if (parse_long_name) { 7864 if (**ptr != '}') { 7865 absolute_short_name = 0; 7866 } else { 7867 (*ptr)++; // skip over the right brace 7868 } 7869 } 7870 7871 // Attempt to fill the buffer with the requested 7872 // value using snprintf within __kmp_str_buf_print() 7873 switch (absolute_short_name) { 7874 case 't': 7875 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_team_num()); 7876 break; 7877 case 'T': 7878 rc = __kmp_str_buf_print(field_buffer, format, __kmp_aux_get_num_teams()); 7879 break; 7880 case 'L': 7881 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_level); 7882 break; 7883 case 'n': 7884 rc = __kmp_str_buf_print(field_buffer, format, __kmp_tid_from_gtid(gtid)); 7885 break; 7886 case 'H': { 7887 static const int BUFFER_SIZE = 256; 7888 char buf[BUFFER_SIZE]; 7889 __kmp_expand_host_name(buf, BUFFER_SIZE); 7890 rc = __kmp_str_buf_print(field_buffer, format, buf); 7891 } break; 7892 case 'P': 7893 rc = __kmp_str_buf_print(field_buffer, format, getpid()); 7894 break; 7895 case 'i': 7896 rc = __kmp_str_buf_print(field_buffer, format, __kmp_gettid()); 7897 break; 7898 case 'N': 7899 rc = __kmp_str_buf_print(field_buffer, format, th->th.th_team->t.t_nproc); 7900 break; 7901 case 'a': 7902 field_value = 7903 __kmp_get_ancestor_thread_num(gtid, th->th.th_team->t.t_level - 1); 7904 rc = __kmp_str_buf_print(field_buffer, format, field_value); 7905 break; 7906 #if KMP_AFFINITY_SUPPORTED 7907 case 'A': { 7908 kmp_str_buf_t buf; 7909 __kmp_str_buf_init(&buf); 7910 __kmp_affinity_str_buf_mask(&buf, th->th.th_affin_mask); 7911 rc = __kmp_str_buf_print(field_buffer, format, buf.str); 7912 __kmp_str_buf_free(&buf); 7913 } break; 7914 #endif 7915 default: 7916 // According to spec, If an implementation does not have info for field 7917 // type, then "undefined" is printed 7918 rc = __kmp_str_buf_print(field_buffer, "%s", "undefined"); 7919 // Skip the field 7920 if (parse_long_name) { 7921 SKIP_TOKEN(*ptr); 7922 if (**ptr == '}') 7923 (*ptr)++; 7924 } else { 7925 (*ptr)++; 7926 } 7927 } 7928 7929 KMP_ASSERT(format_index <= FORMAT_SIZE); 7930 return rc; 7931 } 7932 7933 /* 7934 * Return number of characters needed to hold the affinity string 7935 * (not including null byte character) 7936 * The resultant string is printed to buffer, which 
the caller can then 7937 * handle afterwards 7938 */ 7939 size_t __kmp_aux_capture_affinity(int gtid, const char *format, 7940 kmp_str_buf_t *buffer) { 7941 const char *parse_ptr; 7942 size_t retval; 7943 const kmp_info_t *th; 7944 kmp_str_buf_t field; 7945 7946 KMP_DEBUG_ASSERT(buffer); 7947 KMP_DEBUG_ASSERT(gtid >= 0); 7948 7949 __kmp_str_buf_init(&field); 7950 __kmp_str_buf_clear(buffer); 7951 7952 th = __kmp_threads[gtid]; 7953 retval = 0; 7954 7955 // If format is NULL or zero-length string, then we use 7956 // affinity-format-var ICV 7957 parse_ptr = format; 7958 if (parse_ptr == NULL || *parse_ptr == '\0') { 7959 parse_ptr = __kmp_affinity_format; 7960 } 7961 KMP_DEBUG_ASSERT(parse_ptr); 7962 7963 while (*parse_ptr != '\0') { 7964 // Parse a field 7965 if (*parse_ptr == '%') { 7966 // Put field in the buffer 7967 int rc = __kmp_aux_capture_affinity_field(gtid, th, &parse_ptr, &field); 7968 __kmp_str_buf_catbuf(buffer, &field); 7969 retval += rc; 7970 } else { 7971 // Put literal character in buffer 7972 __kmp_str_buf_cat(buffer, parse_ptr, 1); 7973 retval++; 7974 parse_ptr++; 7975 } 7976 } 7977 __kmp_str_buf_free(&field); 7978 return retval; 7979 } 7980 7981 // Displays the affinity string to stdout 7982 void __kmp_aux_display_affinity(int gtid, const char *format) { 7983 kmp_str_buf_t buf; 7984 __kmp_str_buf_init(&buf); 7985 __kmp_aux_capture_affinity(gtid, format, &buf); 7986 __kmp_fprintf(kmp_out, "%s" KMP_END_OF_LINE, buf.str); 7987 __kmp_str_buf_free(&buf); 7988 } 7989 #endif // OMP_50_ENABLED 7990 7991 /* ------------------------------------------------------------------------ */ 7992 7993 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 7994 int blocktime = arg; /* argument is in milliseconds */ 7995 #if KMP_USE_MONITOR 7996 int bt_intervals; 7997 #endif 7998 int bt_set; 7999 8000 __kmp_save_internal_controls(thread); 8001 8002 /* Normalize and set blocktime for the teams */ 8003 if (blocktime < KMP_MIN_BLOCKTIME) 8004 blocktime = KMP_MIN_BLOCKTIME; 8005 else if (blocktime > KMP_MAX_BLOCKTIME) 8006 blocktime = KMP_MAX_BLOCKTIME; 8007 8008 set__blocktime_team(thread->th.th_team, tid, blocktime); 8009 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 8010 8011 #if KMP_USE_MONITOR 8012 /* Calculate and set blocktime intervals for the teams */ 8013 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 8014 8015 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 8016 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 8017 #endif 8018 8019 /* Set whether blocktime has been set to "TRUE" */ 8020 bt_set = TRUE; 8021 8022 set__bt_set_team(thread->th.th_team, tid, bt_set); 8023 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 8024 #if KMP_USE_MONITOR 8025 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 8026 "bt_intervals=%d, monitor_updates=%d\n", 8027 __kmp_gtid_from_tid(tid, thread->th.th_team), 8028 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 8029 __kmp_monitor_wakeups)); 8030 #else 8031 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 8032 __kmp_gtid_from_tid(tid, thread->th.th_team), 8033 thread->th.th_team->t.t_id, tid, blocktime)); 8034 #endif 8035 } 8036 8037 void __kmp_aux_set_defaults(char const *str, int len) { 8038 if (!__kmp_init_serial) { 8039 __kmp_serial_initialize(); 8040 } 8041 __kmp_env_initialize(str); 8042 8043 if (__kmp_settings 8044 #if OMP_40_ENABLED 8045 || __kmp_display_env || __kmp_display_env_verbose 8046 #endif // 
OMP_40_ENABLED 8047 ) { 8048 __kmp_env_print(); 8049 } 8050 } // __kmp_aux_set_defaults 8051 8052 /* ------------------------------------------------------------------------ */ 8053 /* internal fast reduction routines */ 8054 8055 PACKED_REDUCTION_METHOD_T 8056 __kmp_determine_reduction_method( 8057 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 8058 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 8059 kmp_critical_name *lck) { 8060 8061 // Default reduction method: critical construct ( lck != NULL, like in current 8062 // PAROPT ) 8063 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 8064 // can be selected by RTL 8065 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 8066 // can be selected by RTL 8067 // Finally, it's up to OpenMP RTL to make a decision on which method to select 8068 // among generated by PAROPT. 8069 8070 PACKED_REDUCTION_METHOD_T retval; 8071 8072 int team_size; 8073 8074 KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 ) 8075 KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 ) 8076 8077 #define FAST_REDUCTION_ATOMIC_METHOD_GENERATED \ 8078 ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE)) 8079 #define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func)) 8080 8081 retval = critical_reduce_block; 8082 8083 // another choice of getting a team size (with 1 dynamic deference) is slower 8084 team_size = __kmp_get_team_num_threads(global_tid); 8085 if (team_size == 1) { 8086 8087 retval = empty_reduce_block; 8088 8089 } else { 8090 8091 int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8092 8093 #if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64 8094 8095 #if KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || \ 8096 KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8097 8098 int teamsize_cutoff = 4; 8099 8100 #if KMP_MIC_SUPPORTED 8101 if (__kmp_mic_type != non_mic) { 8102 teamsize_cutoff = 8; 8103 } 8104 #endif 8105 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8106 if (tree_available) { 8107 if (team_size <= teamsize_cutoff) { 8108 if (atomic_available) { 8109 retval = atomic_reduce_block; 8110 } 8111 } else { 8112 retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8113 } 8114 } else if (atomic_available) { 8115 retval = atomic_reduce_block; 8116 } 8117 #else 8118 #error "Unknown or unsupported OS" 8119 #endif // KMP_OS_LINUX || KMP_OS_DRAGONFLY || KMP_OS_FREEBSD || KMP_OS_NETBSD || 8120 // KMP_OS_OPENBSD || KMP_OS_WINDOWS || KMP_OS_DARWIN || KMP_OS_HURD 8121 8122 #elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS 8123 8124 #if KMP_OS_LINUX || KMP_OS_WINDOWS || KMP_OS_HURD 8125 8126 // basic tuning 8127 8128 if (atomic_available) { 8129 if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ??? 
8130 retval = atomic_reduce_block; 8131 } 8132 } // otherwise: use critical section 8133 8134 #elif KMP_OS_DARWIN 8135 8136 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8137 if (atomic_available && (num_vars <= 3)) { 8138 retval = atomic_reduce_block; 8139 } else if (tree_available) { 8140 if ((reduce_size > (9 * sizeof(kmp_real64))) && 8141 (reduce_size < (2000 * sizeof(kmp_real64)))) { 8142 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 8143 } 8144 } // otherwise: use critical section 8145 8146 #else 8147 #error "Unknown or unsupported OS" 8148 #endif 8149 8150 #else 8151 #error "Unknown or unsupported architecture" 8152 #endif 8153 } 8154 8155 // KMP_FORCE_REDUCTION 8156 8157 // If the team is serialized (team_size == 1), ignore the forced reduction 8158 // method and stay with the unsynchronized method (empty_reduce_block) 8159 if (__kmp_force_reduction_method != reduction_method_not_defined && 8160 team_size != 1) { 8161 8162 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 8163 8164 int atomic_available, tree_available; 8165 8166 switch ((forced_retval = __kmp_force_reduction_method)) { 8167 case critical_reduce_block: 8168 KMP_ASSERT(lck); // lck should be != 0 8169 break; 8170 8171 case atomic_reduce_block: 8172 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 8173 if (!atomic_available) { 8174 KMP_WARNING(RedMethodNotSupported, "atomic"); 8175 forced_retval = critical_reduce_block; 8176 } 8177 break; 8178 8179 case tree_reduce_block: 8180 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 8181 if (!tree_available) { 8182 KMP_WARNING(RedMethodNotSupported, "tree"); 8183 forced_retval = critical_reduce_block; 8184 } else { 8185 #if KMP_FAST_REDUCTION_BARRIER 8186 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 8187 #endif 8188 } 8189 break; 8190 8191 default: 8192 KMP_ASSERT(0); // "unsupported method specified" 8193 } 8194 8195 retval = forced_retval; 8196 } 8197 8198 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 8199 8200 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 8201 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 8202 8203 return (retval); 8204 } 8205 8206 // this function is for testing set/get/determine reduce method 8207 kmp_int32 __kmp_get_reduce_method(void) { 8208 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 8209 } 8210 8211 #if OMP_50_ENABLED 8212 8213 // Soft pause sets up threads to ignore blocktime and just go to sleep. 8214 // Spin-wait code checks __kmp_pause_status and reacts accordingly. 8215 void __kmp_soft_pause() { __kmp_pause_status = kmp_soft_paused; } 8216 8217 // Hard pause shuts down the runtime completely. Resume happens naturally when 8218 // OpenMP is used subsequently. 8219 void __kmp_hard_pause() { 8220 __kmp_pause_status = kmp_hard_paused; 8221 __kmp_internal_end_thread(-1); 8222 } 8223 8224 // Soft resume sets __kmp_pause_status, and wakes up all threads. 
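// For each worker, the loop below has to handle three situations: the thread
// is already sleeping on its go flag (resume it); the thread is awake and its
// suspend mutex can be taken (then it is not in the middle of suspending and
// will not go to sleep); or the thread currently holds its own suspend mutex
// and may be about to sleep (keep retrying until one of the first two
// situations applies).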
8225 void __kmp_resume_if_soft_paused() { 8226 if (__kmp_pause_status == kmp_soft_paused) { 8227 __kmp_pause_status = kmp_not_paused; 8228 8229 for (int gtid = 1; gtid < __kmp_threads_capacity; ++gtid) { 8230 kmp_info_t *thread = __kmp_threads[gtid]; 8231 if (thread) { // Wake it if sleeping 8232 kmp_flag_64 fl(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); 8233 if (fl.is_sleeping()) 8234 fl.resume(gtid); 8235 else if (__kmp_try_suspend_mx(thread)) { // got suspend lock 8236 __kmp_unlock_suspend_mx(thread); // unlock it; it won't sleep 8237 } else { // thread holds the lock and may sleep soon 8238 do { // until either the thread sleeps, or we can get the lock 8239 if (fl.is_sleeping()) { 8240 fl.resume(gtid); 8241 break; 8242 } else if (__kmp_try_suspend_mx(thread)) { 8243 __kmp_unlock_suspend_mx(thread); 8244 break; 8245 } 8246 } while (1); 8247 } 8248 } 8249 } 8250 } 8251 } 8252 8253 // This function is called via __kmpc_pause_resource. Returns 0 if successful. 8254 // TODO: add warning messages 8255 int __kmp_pause_resource(kmp_pause_status_t level) { 8256 if (level == kmp_not_paused) { // requesting resume 8257 if (__kmp_pause_status == kmp_not_paused) { 8258 // error message about runtime not being paused, so can't resume 8259 return 1; 8260 } else { 8261 KMP_DEBUG_ASSERT(__kmp_pause_status == kmp_soft_paused || 8262 __kmp_pause_status == kmp_hard_paused); 8263 __kmp_pause_status = kmp_not_paused; 8264 return 0; 8265 } 8266 } else if (level == kmp_soft_paused) { // requesting soft pause 8267 if (__kmp_pause_status != kmp_not_paused) { 8268 // error message about already being paused 8269 return 1; 8270 } else { 8271 __kmp_soft_pause(); 8272 return 0; 8273 } 8274 } else if (level == kmp_hard_paused) { // requesting hard pause 8275 if (__kmp_pause_status != kmp_not_paused) { 8276 // error message about already being paused 8277 return 1; 8278 } else { 8279 __kmp_hard_pause(); 8280 return 0; 8281 } 8282 } else { 8283 // error message about invalid level 8284 return 1; 8285 } 8286 } 8287 8288 #endif // OMP_50_ENABLED 8289
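// A minimal user-level sketch (illustrative only, excluded from the build):
// the soft/hard pause machinery above ultimately services the OpenMP 5.0
// pause API reached through __kmpc_pause_resource(). The example assumes the
// standard omp_pause_resource_all() entry point and omp_pause_soft kind from
// <omp.h>; it is a sketch of intended usage, not part of the runtime.
#if 0
#include <omp.h>
#include <cstdio>

int main() {
  // Warm up the runtime so there is something to pause.
#pragma omp parallel
  { (void)omp_get_thread_num(); }

  // Ask the runtime to release resources: a soft pause lets worker threads go
  // to sleep right away (__kmp_soft_pause() above), a hard pause tears the
  // runtime down via __kmp_internal_end_thread().
  if (omp_pause_resource_all(omp_pause_soft) != 0)
    std::printf("pause request was rejected\n");

  // The runtime resumes transparently on the next parallel region (see
  // __kmp_resume_if_soft_paused() above and __kmp_resume_if_hard_paused()
  // called from __kmp_parallel_initialize()).
#pragma omp parallel
  { (void)omp_get_thread_num(); }
  return 0;
}
#endif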