/*
 * kmp_runtime.cpp -- KPTS runtime support library
 */

//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//

#include "kmp.h"
#include "kmp_affinity.h"
#include "kmp_atomic.h"
#include "kmp_environment.h"
#include "kmp_error.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_itt.h"
#include "kmp_settings.h"
#include "kmp_stats.h"
#include "kmp_str.h"
#include "kmp_wait_release.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_dispatch.h"
#if KMP_USE_HIER_SCHED
#include "kmp_dispatch_hier.h"
#endif

#if OMPT_SUPPORT
#include "ompt-specific.h"
#endif

/* these are temporary issues to be dealt with */
#define KMP_USE_PRCTL 0

#if KMP_OS_WINDOWS
#include <process.h>
#endif

#include "tsan_annotations.h"

#if defined(KMP_GOMP_COMPAT)
char const __kmp_version_alt_comp[] =
    KMP_VERSION_PREFIX "alternative compiler support: yes";
#endif /* defined(KMP_GOMP_COMPAT) */

char const __kmp_version_omp_api[] = KMP_VERSION_PREFIX "API version: "
#if OMP_50_ENABLED
                                                        "5.0 (201611)";
#elif OMP_45_ENABLED
                                                        "4.5 (201511)";
#elif OMP_40_ENABLED
                                                        "4.0 (201307)";
#else
                                                        "3.1 (201107)";
#endif

#ifdef KMP_DEBUG
char const __kmp_version_lock[] =
    KMP_VERSION_PREFIX "lock type: run time selectable";
#endif /* KMP_DEBUG */

#define KMP_MIN(x, y) ((x) < (y) ? (x) : (y))

/* ------------------------------------------------------------------------ */

#if KMP_USE_MONITOR
kmp_info_t __kmp_monitor;
#endif

/* Forward declarations */

void __kmp_cleanup(void);

static void __kmp_initialize_info(kmp_info_t *, kmp_team_t *, int tid,
                                  int gtid);
static void __kmp_initialize_team(kmp_team_t *team, int new_nproc,
                                  kmp_internal_control_t *new_icvs,
                                  ident_t *loc);
#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
static void __kmp_partition_places(kmp_team_t *team,
                                   int update_master_only = 0);
#endif
static void __kmp_do_serial_initialize(void);
void __kmp_fork_barrier(int gtid, int tid);
void __kmp_join_barrier(int gtid);
void __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc,
                          kmp_internal_control_t *new_icvs, ident_t *loc);

#ifdef USE_LOAD_BALANCE
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc);
#endif

static int __kmp_expand_threads(int nNeed);
#if KMP_OS_WINDOWS
static int __kmp_unregister_root_other_thread(int gtid);
#endif
static void __kmp_unregister_library(void); // called by __kmp_internal_end()
static void __kmp_reap_thread(kmp_info_t *thread, int is_root);
kmp_info_t *__kmp_thread_pool_insert_pt = NULL;

/* Calculate the identifier of the current thread */
/* fast (and somewhat portable) way to get unique identifier of executing
   thread. Returns KMP_GTID_DNE if we haven't been assigned a gtid. */
int __kmp_get_global_thread_id() {
  int i;
  kmp_info_t **other_threads;
  size_t stack_data;
  char *stack_addr;
  size_t stack_size;
  char *stack_base;

  KA_TRACE(
      1000,
      ("*** __kmp_get_global_thread_id: entering, nproc=%d all_nproc=%d\n",
       __kmp_nth, __kmp_all_nth));

  /* JPH - to handle the case where __kmpc_end(0) is called immediately prior to
     a parallel region, made it return KMP_GTID_DNE to force serial_initialize
     by caller. Had to handle KMP_GTID_DNE at all call-sites, or else guarantee
     __kmp_init_gtid for this to work. */

  if (!TCR_4(__kmp_init_gtid))
    return KMP_GTID_DNE;

#ifdef KMP_TDATA_GTID
  if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using TDATA\n"));
    return __kmp_gtid;
  }
#endif
  if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using keyed TLS\n"));
    return __kmp_gtid_get_specific();
  }
  KA_TRACE(1000, ("*** __kmp_get_global_thread_id: using internal alg.\n"));

  stack_addr = (char *)&stack_data;
  other_threads = __kmp_threads;

  /* ATT: The code below is a source of potential bugs due to unsynchronized
     access to __kmp_threads array. For example:
     1. Current thread loads other_threads[i] to thr and checks it, it is
        non-NULL.
     2. Current thread is suspended by OS.
     3. Another thread unregisters and finishes (debug versions of free()
        may fill memory with something like 0xEF).
     4. Current thread is resumed.
     5. Current thread reads junk from *thr.
     TODO: Fix it. --ln */

  for (i = 0; i < __kmp_threads_capacity; i++) {

    kmp_info_t *thr = (kmp_info_t *)TCR_SYNC_PTR(other_threads[i]);
    if (!thr)
      continue;

    stack_size = (size_t)TCR_PTR(thr->th.th_info.ds.ds_stacksize);
    stack_base = (char *)TCR_PTR(thr->th.th_info.ds.ds_stackbase);

    /* stack grows down -- search through all of the active threads */

    if (stack_addr <= stack_base) {
      size_t stack_diff = stack_base - stack_addr;

      if (stack_diff <= stack_size) {
        /* The only way we can be closer than the allocated */
        /* stack size is if we are running on this thread. */
        KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == i);
        return i;
      }
    }
  }
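  /* If the stack search above found no match, fall back to keyed TLS below.
     For root ("uber") threads the recorded stack bounds are only an initial
     estimate, so once the gtid is known the stack window is widened to cover
     the current stack address; subsequent lookups can then succeed through the
     search above without a TLS call. */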
  /* get specific to try and determine our gtid */
  KA_TRACE(1000,
           ("*** __kmp_get_global_thread_id: internal alg. failed to find "
            "thread, using TLS\n"));
  i = __kmp_gtid_get_specific();

  /*fprintf( stderr, "=== %d\n", i );  */ /* GROO */

  /* if we haven't been assigned a gtid, then return the code */
  if (i < 0)
    return i;

  /* dynamically updated stack window for uber threads to avoid get_specific
     call */
  if (!TCR_4(other_threads[i]->th.th_info.ds.ds_stackgrow)) {
    KMP_FATAL(StackOverflow, i);
  }

  stack_base = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
  if (stack_addr > stack_base) {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stackbase, stack_addr);
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            other_threads[i]->th.th_info.ds.ds_stacksize + stack_addr -
                stack_base);
  } else {
    TCW_PTR(other_threads[i]->th.th_info.ds.ds_stacksize,
            stack_base - stack_addr);
  }

  /* Reprint stack bounds for ubermaster since they have been refined */
  if (__kmp_storage_map) {
    char *stack_end = (char *)other_threads[i]->th.th_info.ds.ds_stackbase;
    char *stack_beg = stack_end - other_threads[i]->th.th_info.ds.ds_stacksize;
    __kmp_print_storage_map_gtid(i, stack_beg, stack_end,
                                 other_threads[i]->th.th_info.ds.ds_stacksize,
                                 "th_%d stack (refinement)", i);
  }
  return i;
}

int __kmp_get_global_thread_id_reg() {
  int gtid;

  if (!__kmp_init_serial) {
    gtid = KMP_GTID_DNE;
  } else
#ifdef KMP_TDATA_GTID
      if (TCR_4(__kmp_gtid_mode) >= 3) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using TDATA\n"));
    gtid = __kmp_gtid;
  } else
#endif
      if (TCR_4(__kmp_gtid_mode) >= 2) {
    KA_TRACE(1000, ("*** __kmp_get_global_thread_id_reg: using keyed TLS\n"));
    gtid = __kmp_gtid_get_specific();
  } else {
    KA_TRACE(1000,
             ("*** __kmp_get_global_thread_id_reg: using internal alg.\n"));
    gtid = __kmp_get_global_thread_id();
  }

  /* we must be a new uber master sibling thread */
  if (gtid == KMP_GTID_DNE) {
    KA_TRACE(10,
             ("__kmp_get_global_thread_id_reg: Encountered new root thread. "
              "Registering a new gtid.\n"));
    __kmp_acquire_bootstrap_lock(&__kmp_initz_lock);
    if (!__kmp_init_serial) {
      __kmp_do_serial_initialize();
      gtid = __kmp_gtid_get_specific();
    } else {
      gtid = __kmp_register_root(FALSE);
    }
    __kmp_release_bootstrap_lock(&__kmp_initz_lock);
    /*__kmp_printf( "+++ %d\n", gtid ); */ /* GROO */
  }

  KMP_DEBUG_ASSERT(gtid >= 0);

  return gtid;
}

/* caller must hold forkjoin_lock */
void __kmp_check_stack_overlap(kmp_info_t *th) {
  int f;
  char *stack_beg = NULL;
  char *stack_end = NULL;
  int gtid;

  KA_TRACE(10, ("__kmp_check_stack_overlap: called\n"));
  if (__kmp_storage_map) {
    stack_end = (char *)th->th.th_info.ds.ds_stackbase;
    stack_beg = stack_end - th->th.th_info.ds.ds_stacksize;

    gtid = __kmp_gtid_from_thread(th);

    if (gtid == KMP_GTID_MONITOR) {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%s stack (%s)", "mon",
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    } else {
      __kmp_print_storage_map_gtid(
          gtid, stack_beg, stack_end, th->th.th_info.ds.ds_stacksize,
          "th_%d stack (%s)", gtid,
          (th->th.th_info.ds.ds_stackgrow) ? "initial" : "actual");
    }
  }
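  /* The extensive check below treats each stack as the address range
     [stack_beg, stack_end) and reports an overlap whenever either endpoint of
     this thread's range falls strictly inside another registered thread's
     range. */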
"initial" : "actual"); 284 } 285 } 286 287 /* No point in checking ubermaster threads since they use refinement and 288 * cannot overlap */ 289 gtid = __kmp_gtid_from_thread(th); 290 if (__kmp_env_checks == TRUE && !KMP_UBER_GTID(gtid)) { 291 KA_TRACE(10, 292 ("__kmp_check_stack_overlap: performing extensive checking\n")); 293 if (stack_beg == NULL) { 294 stack_end = (char *)th->th.th_info.ds.ds_stackbase; 295 stack_beg = stack_end - th->th.th_info.ds.ds_stacksize; 296 } 297 298 for (f = 0; f < __kmp_threads_capacity; f++) { 299 kmp_info_t *f_th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[f]); 300 301 if (f_th && f_th != th) { 302 char *other_stack_end = 303 (char *)TCR_PTR(f_th->th.th_info.ds.ds_stackbase); 304 char *other_stack_beg = 305 other_stack_end - (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize); 306 if ((stack_beg > other_stack_beg && stack_beg < other_stack_end) || 307 (stack_end > other_stack_beg && stack_end < other_stack_end)) { 308 309 /* Print the other stack values before the abort */ 310 if (__kmp_storage_map) 311 __kmp_print_storage_map_gtid( 312 -1, other_stack_beg, other_stack_end, 313 (size_t)TCR_PTR(f_th->th.th_info.ds.ds_stacksize), 314 "th_%d stack (overlapped)", __kmp_gtid_from_thread(f_th)); 315 316 __kmp_fatal(KMP_MSG(StackOverlap), KMP_HNT(ChangeStackLimit), 317 __kmp_msg_null); 318 } 319 } 320 } 321 } 322 KA_TRACE(10, ("__kmp_check_stack_overlap: returning\n")); 323 } 324 325 /* ------------------------------------------------------------------------ */ 326 327 void __kmp_infinite_loop(void) { 328 static int done = FALSE; 329 330 while (!done) { 331 KMP_YIELD(1); 332 } 333 } 334 335 #define MAX_MESSAGE 512 336 337 void __kmp_print_storage_map_gtid(int gtid, void *p1, void *p2, size_t size, 338 char const *format, ...) { 339 char buffer[MAX_MESSAGE]; 340 va_list ap; 341 342 va_start(ap, format); 343 KMP_SNPRINTF(buffer, sizeof(buffer), "OMP storage map: %p %p%8lu %s\n", p1, 344 p2, (unsigned long)size, format); 345 __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock); 346 __kmp_vprintf(kmp_err, buffer, ap); 347 #if KMP_PRINT_DATA_PLACEMENT 348 int node; 349 if (gtid >= 0) { 350 if (p1 <= p2 && (char *)p2 - (char *)p1 == size) { 351 if (__kmp_storage_map_verbose) { 352 node = __kmp_get_host_node(p1); 353 if (node < 0) /* doesn't work, so don't try this next time */ 354 __kmp_storage_map_verbose = FALSE; 355 else { 356 char *last; 357 int lastNode; 358 int localProc = __kmp_get_cpu_from_gtid(gtid); 359 360 const int page_size = KMP_GET_PAGE_SIZE(); 361 362 p1 = (void *)((size_t)p1 & ~((size_t)page_size - 1)); 363 p2 = (void *)(((size_t)p2 - 1) & ~((size_t)page_size - 1)); 364 if (localProc >= 0) 365 __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid, 366 localProc >> 1); 367 else 368 __kmp_printf_no_lock(" GTID %d\n", gtid); 369 #if KMP_USE_PRCTL 370 /* The more elaborate format is disabled for now because of the prctl 371 * hanging bug. */ 372 do { 373 last = p1; 374 lastNode = node; 375 /* This loop collates adjacent pages with the same host node. 
          if (localProc >= 0)
            __kmp_printf_no_lock(" GTID %d localNode %d\n", gtid,
                                 localProc >> 1);
          else
            __kmp_printf_no_lock(" GTID %d\n", gtid);
#if KMP_USE_PRCTL
          /* The more elaborate format is disabled for now because of the prctl
           * hanging bug. */
          do {
            last = p1;
            lastNode = node;
            /* This loop collates adjacent pages with the same host node. */
            do {
              p1 = (char *)p1 + page_size;
            } while (p1 <= p2 && (node = __kmp_get_host_node(p1)) == lastNode);
            __kmp_printf_no_lock(" %p-%p memNode %d\n", last, (char *)p1 - 1,
                                 lastNode);
          } while (p1 <= p2);
#else
          __kmp_printf_no_lock(" %p-%p memNode %d\n", p1,
                               (char *)p1 + (page_size - 1),
                               __kmp_get_host_node(p1));
          if (p1 < p2) {
            __kmp_printf_no_lock(" %p-%p memNode %d\n", p2,
                                 (char *)p2 + (page_size - 1),
                                 __kmp_get_host_node(p2));
          }
#endif
        }
      }
    } else
      __kmp_printf_no_lock(" %s\n", KMP_I18N_STR(StorageMapWarning));
  }
#endif /* KMP_PRINT_DATA_PLACEMENT */
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);
}

void __kmp_warn(char const *format, ...) {
  char buffer[MAX_MESSAGE];
  va_list ap;

  if (__kmp_generate_warnings == kmp_warnings_off) {
    return;
  }

  va_start(ap, format);

  KMP_SNPRINTF(buffer, sizeof(buffer), "OMP warning: %s\n", format);
  __kmp_acquire_bootstrap_lock(&__kmp_stdio_lock);
  __kmp_vprintf(kmp_err, buffer, ap);
  __kmp_release_bootstrap_lock(&__kmp_stdio_lock);

  va_end(ap);
}

void __kmp_abort_process() {
  // Later threads may stall here, but that's ok because abort() will kill them.
  __kmp_acquire_bootstrap_lock(&__kmp_exit_lock);

  if (__kmp_debug_buf) {
    __kmp_dump_debug_buffer();
  }

  if (KMP_OS_WINDOWS) {
    // Let other threads know of abnormal termination and prevent deadlock
    // if abort happened during library initialization or shutdown
    __kmp_global.g.g_abort = SIGABRT;

    /* On Windows* OS by default abort() causes pop-up error box, which stalls
       nightly testing. Unfortunately, we cannot reliably suppress pop-up error
       boxes. _set_abort_behavior() works well, but this function is not
       available in VS7 (this is not a problem for DLL, but it is a problem for
       static OpenMP RTL). SetErrorMode (and so, timelimit utility) does not
       help, at least in some versions of MS C RTL.

       It seems the following sequence is the only way to simulate abort() and
       avoid pop-up error box. */
    raise(SIGABRT);
    _exit(3); // Just in case, if signal ignored, exit anyway.
  } else {
    abort();
  }

  __kmp_infinite_loop();
  __kmp_release_bootstrap_lock(&__kmp_exit_lock);

} // __kmp_abort_process

void __kmp_abort_thread(void) {
  // TODO: Eliminate g_abort global variable and this function.
  // In case of abort just call abort(), it will kill all the threads.
  __kmp_infinite_loop();
} // __kmp_abort_thread
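/* The storage-map printers below (enabled when __kmp_storage_map is set,
   normally via the KMP_STORAGE_MAP environment variable) emit one
   "OMP storage map:" line per structure so the address layout of the runtime's
   per-thread and per-team data can be inspected. */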
/* Print out the storage map for the major kmp_info_t thread data structures
   that are allocated together. */

static void __kmp_print_thread_storage_map(kmp_info_t *thr, int gtid) {
  __kmp_print_storage_map_gtid(gtid, thr, thr + 1, sizeof(kmp_info_t), "th_%d",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_info, &thr->th.th_team,
                               sizeof(kmp_desc_t), "th_%d.th_info", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_local, &thr->th.th_pri_head,
                               sizeof(kmp_local_t), "th_%d.th_local", gtid);

  __kmp_print_storage_map_gtid(
      gtid, &thr->th.th_bar[0], &thr->th.th_bar[bs_last_barrier],
      sizeof(kmp_balign_t) * bs_last_barrier, "th_%d.th_bar", gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_plain_barrier],
                               &thr->th.th_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[plain]",
                               gtid);

  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_forkjoin_barrier],
                               &thr->th.th_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[forkjoin]",
                               gtid);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(gtid, &thr->th.th_bar[bs_reduction_barrier],
                               &thr->th.th_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_t), "th_%d.th_bar[reduction]",
                               gtid);
#endif // KMP_FAST_REDUCTION_BARRIER
}

/* Print out the storage map for the major kmp_team_t team data structures
   that are allocated together. */

static void __kmp_print_team_storage_map(const char *header, kmp_team_t *team,
                                         int team_id, int num_thr) {
  int num_disp_buff = team->t.t_max_nproc > 1 ? __kmp_dispatch_num_buffers : 2;
  __kmp_print_storage_map_gtid(-1, team, team + 1, sizeof(kmp_team_t), "%s_%d",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[0],
                               &team->t.t_bar[bs_last_barrier],
                               sizeof(kmp_balign_team_t) * bs_last_barrier,
                               "%s_%d.t_bar", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_plain_barrier],
                               &team->t.t_bar[bs_plain_barrier + 1],
                               sizeof(kmp_balign_team_t), "%s_%d.t_bar[plain]",
                               header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_forkjoin_barrier],
                               &team->t.t_bar[bs_forkjoin_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[forkjoin]", header, team_id);

#if KMP_FAST_REDUCTION_BARRIER
  __kmp_print_storage_map_gtid(-1, &team->t.t_bar[bs_reduction_barrier],
                               &team->t.t_bar[bs_reduction_barrier + 1],
                               sizeof(kmp_balign_team_t),
                               "%s_%d.t_bar[reduction]", header, team_id);
#endif // KMP_FAST_REDUCTION_BARRIER

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_dispatch[0], &team->t.t_dispatch[num_thr],
      sizeof(kmp_disp_t) * num_thr, "%s_%d.t_dispatch", header, team_id);

  __kmp_print_storage_map_gtid(
      -1, &team->t.t_threads[0], &team->t.t_threads[num_thr],
      sizeof(kmp_info_t *) * num_thr, "%s_%d.t_threads", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_disp_buffer[0],
                               &team->t.t_disp_buffer[num_disp_buff],
                               sizeof(dispatch_shared_info_t) * num_disp_buff,
                               "%s_%d.t_disp_buffer", header, team_id);

  __kmp_print_storage_map_gtid(-1, &team->t.t_taskq, &team->t.t_copypriv_data,
                               sizeof(kmp_taskq_t), "%s_%d.t_taskq", header,
                               team_id);
}

static void __kmp_init_allocator() {}
static void __kmp_fini_allocator() {}

/* ------------------------------------------------------------------------ */

#ifdef KMP_DYNAMIC_LIB
#if KMP_OS_WINDOWS
static void __kmp_reset_lock(kmp_bootstrap_lock_t *lck) {
  // TODO: Change to __kmp_break_bootstrap_lock().
  __kmp_init_bootstrap_lock(lck); // make the lock released
}

static void __kmp_reset_locks_on_process_detach(int gtid_req) {
  int i;
  int thread_count;

  // PROCESS_DETACH is expected to be called by a thread that executes
  // ProcessExit() or FreeLibrary(). OS terminates other threads (except the one
  // calling ProcessExit or FreeLibrary). So, it might be safe to access the
  // __kmp_threads[] without taking the forkjoin_lock. However, in fact, some
  // threads can be still alive here, although being about to be terminated. The
  // threads in the array with ds_thread==0 are most suspicious. Actually, it
  // may not be safe to access the __kmp_threads[].

  // TODO: does it make sense to check __kmp_roots[] ?

  // Let's check that there are no other alive threads registered with the OMP
  // lib.
  while (1) {
    thread_count = 0;
    for (i = 0; i < __kmp_threads_capacity; ++i) {
      if (!__kmp_threads)
        continue;
      kmp_info_t *th = __kmp_threads[i];
      if (th == NULL)
        continue;
      int gtid = th->th.th_info.ds.ds_gtid;
      if (gtid == gtid_req)
        continue;
      if (gtid < 0)
        continue;
      DWORD exit_val;
      int alive = __kmp_is_thread_alive(th, &exit_val);
      if (alive) {
        ++thread_count;
      }
    }
    if (thread_count == 0)
      break; // success
  }

  // Assume that I'm alone. Now it might be safe to check and reset locks.
  // __kmp_forkjoin_lock and __kmp_stdio_lock are expected to be reset.
  __kmp_reset_lock(&__kmp_forkjoin_lock);
#ifdef KMP_DEBUG
  __kmp_reset_lock(&__kmp_stdio_lock);
#endif // KMP_DEBUG
}

BOOL WINAPI DllMain(HINSTANCE hInstDLL, DWORD fdwReason, LPVOID lpReserved) {
  //__kmp_acquire_bootstrap_lock( &__kmp_initz_lock );

  switch (fdwReason) {

  case DLL_PROCESS_ATTACH:
    KA_TRACE(10, ("DllMain: PROCESS_ATTACH\n"));

    return TRUE;

  case DLL_PROCESS_DETACH:
    KA_TRACE(10, ("DllMain: PROCESS_DETACH T#%d\n", __kmp_gtid_get_specific()));

    if (lpReserved != NULL) {
      // lpReserved is used for telling the difference:
      //   lpReserved == NULL when FreeLibrary() was called,
      //   lpReserved != NULL when the process terminates.
      // When FreeLibrary() is called, worker threads remain alive. So they will
      // release the forkjoin lock by themselves. When the process terminates,
      // worker threads disappear triggering the problem of unreleased forkjoin
      // lock as described below.

      // A worker thread can take the forkjoin lock. The problem comes up if
      // that worker thread becomes dead before it releases the forkjoin lock.
      // The forkjoin lock remains taken, while the thread executing
      // DllMain()->PROCESS_DETACH->__kmp_internal_end_library() below will try
      // to take the forkjoin lock and will always fail, so that the application
      // will never finish [normally]. This scenario is possible if
      // __kmpc_end() has not been executed. It looks like it is not a corner
      // case, but a set of common cases:
      //   - the main function was compiled by an alternative compiler;
      //   - the main function was compiled by icl but without /Qopenmp
      //     (application with plugins);
      //   - application terminates by calling C exit(), Fortran CALL EXIT() or
      //     Fortran STOP.
      //   - alive foreign thread prevented __kmpc_end from doing cleanup.
      //
      // This is a hack to work around the problem.
      // TODO: !!! figure out something better.
      __kmp_reset_locks_on_process_detach(__kmp_gtid_get_specific());
    }

    __kmp_internal_end_library(__kmp_gtid_get_specific());

    return TRUE;

  case DLL_THREAD_ATTACH:
    KA_TRACE(10, ("DllMain: THREAD_ATTACH\n"));

    /* if we want to register new siblings all the time here call
     * __kmp_get_gtid(); */
    return TRUE;

  case DLL_THREAD_DETACH:
    KA_TRACE(10, ("DllMain: THREAD_DETACH T#%d\n", __kmp_gtid_get_specific()));

    __kmp_internal_end_thread(__kmp_gtid_get_specific());
    return TRUE;
  }

  return TRUE;
}

#endif /* KMP_OS_WINDOWS */
#endif /* KMP_DYNAMIC_LIB */

/* Change the library type to "status" and return the old type */
/* called from within initialization routines where __kmp_initz_lock is held */
int __kmp_change_library(int status) {
  int old_status;

  old_status = __kmp_yield_init &
               1; // check whether KMP_LIBRARY=throughput (even init count)

  if (status) {
    __kmp_yield_init |= 1; // throughput => turnaround (odd init count)
  } else {
    __kmp_yield_init &= ~1; // turnaround => throughput (even init count)
  }

  return old_status; // return previous setting of whether
  // KMP_LIBRARY=throughput
}

/* __kmp_parallel_deo -- Wait until it's our turn. */
void __kmp_parallel_deo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
#if KMP_USE_DYNAMIC_LOCK
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL, 0);
#else
      __kmp_push_sync(gtid, ct_ordered_in_parallel, loc_ref, NULL);
#endif
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB();
    KMP_WAIT_YIELD(&team->t.t_ordered.dt.t_value, __kmp_tid_from_gtid(gtid),
                   KMP_EQ, NULL);
    KMP_MB();
  }
#endif /* BUILD_PARALLEL_ORDERED */
}
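/* Together, __kmp_parallel_deo/__kmp_parallel_dxo implement a simple turnstile
   for the "ordered" construct (when BUILD_PARALLEL_ORDERED is defined and the
   team is not serialized): deo waits until t_ordered.dt.t_value equals this
   thread's tid, and dxo passes the turn on by storing (tid + 1) % t_nproc, so
   threads are released in tid order. */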
/* __kmp_parallel_dxo -- Signal the next task. */
void __kmp_parallel_dxo(int *gtid_ref, int *cid_ref, ident_t *loc_ref) {
  int gtid = *gtid_ref;
#ifdef BUILD_PARALLEL_ORDERED
  int tid = __kmp_tid_from_gtid(gtid);
  kmp_team_t *team = __kmp_team_from_gtid(gtid);
#endif /* BUILD_PARALLEL_ORDERED */

  if (__kmp_env_consistency_check) {
    if (__kmp_threads[gtid]->th.th_root->r.r_active)
      __kmp_pop_sync(gtid, ct_ordered_in_parallel, loc_ref);
  }
#ifdef BUILD_PARALLEL_ORDERED
  if (!team->t.t_serialized) {
    KMP_MB(); /* Flush all pending memory write invalidates. */

    /* use the tid of the next thread in this team */
    /* TODO replace with general release procedure */
    team->t.t_ordered.dt.t_value = ((tid + 1) % team->t.t_nproc);

    KMP_MB(); /* Flush all pending memory write invalidates. */
  }
#endif /* BUILD_PARALLEL_ORDERED */
}

/* ------------------------------------------------------------------------ */
/* The BARRIER for a SINGLE process section is always explicit */

int __kmp_enter_single(int gtid, ident_t *id_ref, int push_ws) {
  int status;
  kmp_info_t *th;
  kmp_team_t *team;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  th = __kmp_threads[gtid];
  team = th->th.th_team;
  status = 0;

  th->th.th_ident = id_ref;

  if (team->t.t_serialized) {
    status = 1;
  } else {
    kmp_int32 old_this = th->th.th_local.this_construct;

    ++th->th.th_local.this_construct;
    /* try to set team count to thread count--success means thread got the
       single block */
    /* TODO: Should this be acquire or release? */
    if (team->t.t_construct == old_this) {
      status = __kmp_atomic_compare_store_acq(&team->t.t_construct, old_this,
                                              th->th.th_local.this_construct);
    }
#if USE_ITT_BUILD
    if (__itt_metadata_add_ptr && __kmp_forkjoin_frames_mode == 3 &&
        KMP_MASTER_GTID(gtid) &&
#if OMP_40_ENABLED
        th->th.th_teams_microtask == NULL &&
#endif
        team->t.t_active_level ==
            1) { // Only report metadata by master of active team at level 1
      __kmp_itt_metadata_single(id_ref);
    }
#endif /* USE_ITT_BUILD */
  }

  if (__kmp_env_consistency_check) {
    if (status && push_ws) {
      __kmp_push_workshare(gtid, ct_psingle, id_ref);
    } else {
      __kmp_check_workshare(gtid, ct_psingle, id_ref);
    }
  }
#if USE_ITT_BUILD
  if (status) {
    __kmp_itt_single_start(gtid);
  }
#endif /* USE_ITT_BUILD */
  return status;
}

void __kmp_exit_single(int gtid) {
#if USE_ITT_BUILD
  __kmp_itt_single_end(gtid);
#endif /* USE_ITT_BUILD */
  if (__kmp_env_consistency_check)
    __kmp_pop_workshare(gtid, ct_psingle, NULL);
}

/* determine if we can go parallel or must use a serialized parallel region and
 * how many threads we can use
 * set_nproc is the number of threads requested for the team
 * returns 0 if we should serialize or only use one thread,
 * otherwise the number of threads to use
 * The forkjoin lock is held by the caller. */
static int __kmp_reserve_threads(kmp_root_t *root, kmp_team_t *parent_team,
                                 int master_tid, int set_nthreads
#if OMP_40_ENABLED
                                 ,
                                 int enter_teams
#endif /* OMP_40_ENABLED */
                                 ) {
  int capacity;
  int new_nthreads;
  KMP_DEBUG_ASSERT(__kmp_init_serial);
  KMP_DEBUG_ASSERT(root && parent_team);

  // If dyn-var is set, dynamically adjust the number of desired threads,
  // according to the method specified by dynamic_mode.
  new_nthreads = set_nthreads;
  if (!get__dynamic_2(parent_team, master_tid)) {
    ;
  }
#ifdef USE_LOAD_BALANCE
  else if (__kmp_global.g.g_dynamic_mode == dynamic_load_balance) {
    new_nthreads = __kmp_load_balance_nproc(root, set_nthreads);
    if (new_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d load balance reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    }
  }
#endif /* USE_LOAD_BALANCE */
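  /* In the branches and cap checks below, the recurring term
     (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) adds back threads
     of this root that are already counted in the running totals but will be
     reused by the new team rather than created. For example, with
     __kmp_avail_proc = 8, __kmp_nth = 3 and an active root, the
     dynamic_thread_limit case computes 8 - 3 + 1 = 6 threads. */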
  else if (__kmp_global.g.g_dynamic_mode == dynamic_thread_limit) {
    new_nthreads = __kmp_avail_proc - __kmp_nth +
                   (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (new_nthreads <= 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    if (new_nthreads < set_nthreads) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d thread limit reduced "
                    "reservation to %d threads\n",
                    master_tid, new_nthreads));
    } else {
      new_nthreads = set_nthreads;
    }
  } else if (__kmp_global.g.g_dynamic_mode == dynamic_random) {
    if (set_nthreads > 2) {
      new_nthreads = __kmp_get_random(parent_team->t.t_threads[master_tid]);
      new_nthreads = (new_nthreads % set_nthreads) + 1;
      if (new_nthreads == 1) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to 1 thread\n",
                      master_tid));
        return 1;
      }
      if (new_nthreads < set_nthreads) {
        KC_TRACE(10, ("__kmp_reserve_threads: T#%d dynamic random reduced "
                      "reservation to %d threads\n",
                      master_tid, new_nthreads));
      }
    }
  } else {
    KMP_ASSERT(0);
  }

  // Respect KMP_ALL_THREADS/KMP_DEVICE_THREAD_LIMIT.
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_max_nth) {
    int tl_nthreads = __kmp_max_nth - __kmp_nth +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d KMP_DEVICE_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }

  // Respect OMP_THREAD_LIMIT
  if (root->r.r_cg_nthreads + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      __kmp_cg_max_nth) {
    int tl_nthreads = __kmp_cg_max_nth - root->r.r_cg_nthreads +
                      (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (tl_nthreads <= 0) {
      tl_nthreads = 1;
    }

    // If dyn-var is false, emit a 1-time warning.
    if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, set_nthreads, tl_nthreads),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    if (tl_nthreads == 1) {
      KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT "
                    "reduced reservation to 1 thread\n",
                    master_tid));
      return 1;
    }
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d OMP_THREAD_LIMIT reduced "
                  "reservation to %d threads\n",
                  master_tid, tl_nthreads));
    new_nthreads = tl_nthreads;
  }
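  /* Last constraint: the global __kmp_threads[] array itself. If it cannot be
     grown by enough slots, new_nthreads is trimmed to whatever fits and a
     one-time warning is issued. */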
  // Check if the threads array is large enough, or needs expanding.
  // See comment in __kmp_register_root() about the adjustment if
  // __kmp_threads[0] == NULL.
  capacity = __kmp_threads_capacity;
  if (TCR_PTR(__kmp_threads[0]) == NULL) {
    --capacity;
  }
  if (__kmp_nth + new_nthreads -
          (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) >
      capacity) {
    // Expand the threads array.
    int slotsRequired = __kmp_nth + new_nthreads -
                        (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc) -
                        capacity;
    int slotsAdded = __kmp_expand_threads(slotsRequired);
    if (slotsAdded < slotsRequired) {
      // The threads array was not expanded enough.
      new_nthreads -= (slotsRequired - slotsAdded);
      KMP_ASSERT(new_nthreads >= 1);

      // If dyn-var is false, emit a 1-time warning.
      if (!get__dynamic_2(parent_team, master_tid) && (!__kmp_reserve_warn)) {
        __kmp_reserve_warn = 1;
        if (__kmp_tp_cached) {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity),
                    KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null);
        } else {
          __kmp_msg(kmp_ms_warning,
                    KMP_MSG(CantFormThrTeam, set_nthreads, new_nthreads),
                    KMP_HNT(SystemLimitOnThreads), __kmp_msg_null);
        }
      }
    }
  }

#ifdef KMP_DEBUG
  if (new_nthreads == 1) {
    KC_TRACE(10,
             ("__kmp_reserve_threads: T#%d serializing team after reclaiming "
              "dead roots and rechecking; requested %d threads\n",
              __kmp_get_gtid(), set_nthreads));
  } else {
    KC_TRACE(10, ("__kmp_reserve_threads: T#%d allocating %d threads; requested"
                  " %d threads\n",
                  __kmp_get_gtid(), new_nthreads, set_nthreads));
  }
#endif // KMP_DEBUG
  return new_nthreads;
}

/* Allocate threads from the thread pool and assign them to the new team. We are
   assured that there are enough threads available, because we checked on that
   earlier within the forkjoin critical section. */
static void __kmp_fork_team_threads(kmp_root_t *root, kmp_team_t *team,
                                    kmp_info_t *master_th, int master_gtid) {
  int i;
  int use_hot_team;

  KA_TRACE(10, ("__kmp_fork_team_threads: new_nprocs = %d\n", team->t.t_nproc));
  KMP_DEBUG_ASSERT(master_gtid == __kmp_get_gtid());
  KMP_MB();

  /* first, let's setup the master thread */
  master_th->th.th_info.ds.ds_tid = 0;
  master_th->th.th_team = team;
  master_th->th.th_team_nproc = team->t.t_nproc;
  master_th->th.th_team_master = master_th;
  master_th->th.th_team_serialized = FALSE;
  master_th->th.th_dispatch = &team->t.t_dispatch[0];

/* make sure we are not the optimized hot team */
#if KMP_NESTED_HOT_TEAMS
  use_hot_team = 0;
  kmp_hot_team_ptr_t *hot_teams = master_th->th.th_hot_teams;
  if (hot_teams) { // hot teams array is not allocated if
    // KMP_HOT_TEAMS_MAX_LEVEL=0
    int level = team->t.t_active_level - 1; // index in array of hot teams
    if (master_th->th.th_teams_microtask) { // are we inside the teams?
      if (master_th->th.th_teams_size.nteams > 1) {
        ++level; // level was not increased in teams construct for
        // team_of_masters
      }
      if (team->t.t_pkfn != (microtask_t)__kmp_teams_master &&
          master_th->th.th_teams_level == team->t.t_level) {
        ++level; // level was not increased in teams construct for
        // team_of_workers before the parallel
      } // team->t.t_level will be increased inside parallel
    }
    if (level < __kmp_hot_teams_max_level) {
      if (hot_teams[level].hot_team) {
        // hot team has already been allocated for given level
        KMP_DEBUG_ASSERT(hot_teams[level].hot_team == team);
        use_hot_team = 1; // the team is ready to use
      } else {
        use_hot_team = 0; // AC: threads are not allocated yet
        hot_teams[level].hot_team = team; // remember new hot team
        hot_teams[level].hot_team_nth = team->t.t_nproc;
      }
    } else {
      use_hot_team = 0;
    }
  }
#else
  use_hot_team = team == root->r.r_hot_team;
#endif
  if (!use_hot_team) {

    /* install the master thread */
    team->t.t_threads[0] = master_th;
    __kmp_initialize_info(master_th, team, 0, master_gtid);

    /* now, install the worker threads */
    for (i = 1; i < team->t.t_nproc; i++) {

      /* fork or reallocate a new thread and install it in team */
      kmp_info_t *thr = __kmp_allocate_thread(root, team, i);
      team->t.t_threads[i] = thr;
      KMP_DEBUG_ASSERT(thr);
      KMP_DEBUG_ASSERT(thr->th.th_team == team);
      /* align team and thread arrived states */
      KA_TRACE(20, ("__kmp_fork_team_threads: T#%d(%d:%d) init arrived "
                    "T#%d(%d:%d) join =%llu, plain=%llu\n",
                    __kmp_gtid_from_tid(0, team), team->t.t_id, 0,
                    __kmp_gtid_from_tid(i, team), team->t.t_id, i,
                    team->t.t_bar[bs_forkjoin_barrier].b_arrived,
                    team->t.t_bar[bs_plain_barrier].b_arrived));
#if OMP_40_ENABLED
      thr->th.th_teams_microtask = master_th->th.th_teams_microtask;
      thr->th.th_teams_level = master_th->th.th_teams_level;
      thr->th.th_teams_size = master_th->th.th_teams_size;
#endif
      { // Initialize threads' barrier data.
        int b;
        kmp_balign_t *balign = team->t.t_threads[i]->th.th_bar;
        for (b = 0; b < bs_last_barrier; ++b) {
          balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived;
          KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG);
#if USE_DEBUGGER
          balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived;
#endif
        }
      }
    }

#if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED
    __kmp_partition_places(team);
#endif
  }

  KMP_MB();
}

#if KMP_ARCH_X86 || KMP_ARCH_X86_64
// Propagate any changes to the floating point control registers out to the team
// We try to avoid unnecessary writes to the relevant cache line in the team
// structure, so we don't make changes unless they are needed.
inline static void propagateFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control) {
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;

    // Get master values of FPU control flags (both X87 and vector)
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;
    // There is no point looking at t_fp_control_saved here.
    // If it is TRUE, we still have to update the values if they are different
    // from those we now have. If it is FALSE we didn't save anything yet, but
    // our objective is the same. We have to ensure that the values in the team
    // are the same as those we have.
    // So, this code achieves what we need whether or not t_fp_control_saved is
    // true. By checking whether the value needs updating we avoid unnecessary
    // writes that would put the cache-line into a written state, causing all
    // threads in the team to have to read it again.
    KMP_CHECK_UPDATE(team->t.t_x87_fpu_control_word, x87_fpu_control_word);
    KMP_CHECK_UPDATE(team->t.t_mxcsr, mxcsr);
    // Although we don't use this value, other code in the runtime wants to know
    // whether it should restore them. So we must ensure it is correct.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, TRUE);
  } else {
    // Similarly here. Don't write to this cache-line in the team structure
    // unless we have to.
    KMP_CHECK_UPDATE(team->t.t_fp_control_saved, FALSE);
  }
}

// Do the opposite, setting the hardware registers to the updated values from
// the team.
inline static void updateHWFPControl(kmp_team_t *team) {
  if (__kmp_inherit_fp_control && team->t.t_fp_control_saved) {
    // Only reset the fp control regs if they have been changed in the team
    // by the parallel region that we are exiting.
    kmp_int16 x87_fpu_control_word;
    kmp_uint32 mxcsr;
    __kmp_store_x87_fpu_control_word(&x87_fpu_control_word);
    __kmp_store_mxcsr(&mxcsr);
    mxcsr &= KMP_X86_MXCSR_MASK;

    if (team->t.t_x87_fpu_control_word != x87_fpu_control_word) {
      __kmp_clear_x87_fpu_status_word();
      __kmp_load_x87_fpu_control_word(&team->t.t_x87_fpu_control_word);
    }

    if (team->t.t_mxcsr != mxcsr) {
      __kmp_load_mxcsr(&team->t.t_mxcsr);
    }
  }
}
#else
#define propagateFPControl(x) ((void)0)
#define updateHWFPControl(x) ((void)0)
#endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */
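// Of this pair, propagateFPControl() is used when a team is being set up (for
// example, __kmp_serialized_parallel() below calls it) so that the master's
// current x87 control word and MXCSR are recorded in the team.
// updateHWFPControl() is the counterpart that loads the team's saved values
// back into the hardware registers when they differ; its callers are outside
// this excerpt.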
static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team,
                                     int realloc); // forward declaration

/* Run a parallel region that has been serialized, so runs only in a team of the
   single master thread. */
void __kmp_serialized_parallel(ident_t *loc, kmp_int32 global_tid) {
  kmp_info_t *this_thr;
  kmp_team_t *serial_team;

  KC_TRACE(10, ("__kmpc_serialized_parallel: called by T#%d\n", global_tid));

  /* Skip all this code for autopar serialized loops since it results in
     unacceptable overhead */
  if (loc != NULL && (loc->flags & KMP_IDENT_AUTOPAR))
    return;

  if (!TCR_4(__kmp_init_parallel))
    __kmp_parallel_initialize();

  this_thr = __kmp_threads[global_tid];
  serial_team = this_thr->th.th_serial_team;

  /* utilize the serialized team held by this thread */
  KMP_DEBUG_ASSERT(serial_team);
  KMP_MB();

  if (__kmp_tasking_mode != tskm_immediate_exec) {
    KMP_DEBUG_ASSERT(
        this_thr->th.th_task_team ==
        this_thr->th.th_team->t.t_task_team[this_thr->th.th_task_state]);
    KMP_DEBUG_ASSERT(serial_team->t.t_task_team[this_thr->th.th_task_state] ==
                     NULL);
    KA_TRACE(20, ("__kmpc_serialized_parallel: T#%d pushing task_team %p / "
                  "team %p, new task_team = NULL\n",
                  global_tid, this_thr->th.th_task_team, this_thr->th.th_team));
    this_thr->th.th_task_team = NULL;
  }

#if OMP_40_ENABLED
  kmp_proc_bind_t proc_bind = this_thr->th.th_set_proc_bind;
  if (this_thr->th.th_current_task->td_icvs.proc_bind == proc_bind_false) {
    proc_bind = proc_bind_false;
  } else if (proc_bind == proc_bind_default) {
    // No proc_bind clause was specified, so use the current value
    // of proc-bind-var for this parallel region.
    proc_bind = this_thr->th.th_current_task->td_icvs.proc_bind;
  }
  // Reset for next parallel region
  this_thr->th.th_set_proc_bind = proc_bind_default;
#endif /* OMP_40_ENABLED */

#if OMPT_SUPPORT
  ompt_data_t ompt_parallel_data;
  ompt_parallel_data.ptr = NULL;
  ompt_data_t *implicit_task_data;
  void *codeptr = OMPT_LOAD_RETURN_ADDRESS(global_tid);
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {

    ompt_task_info_t *parent_task_info;
    parent_task_info = OMPT_CUR_TASK_INFO(this_thr);

    parent_task_info->frame.enter_frame = OMPT_GET_FRAME_ADDRESS(1);
    if (ompt_enabled.ompt_callback_parallel_begin) {
      int team_size = 1;

      ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
          &(parent_task_info->task_data), &(parent_task_info->frame),
          &ompt_parallel_data, team_size, ompt_invoker_program, codeptr);
    }
  }
#endif // OMPT_SUPPORT

  if (this_thr->th.th_team != serial_team) {
    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;

    if (serial_team->t.t_serialized) {
      /* this serial team was already used
         TODO increase performance by making these locks more specific */
      kmp_team_t *new_team;

      __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock);

      new_team = __kmp_allocate_team(this_thr->th.th_root, 1, 1,
#if OMPT_SUPPORT
                                     ompt_parallel_data,
#endif
#if OMP_40_ENABLED
                                     proc_bind,
#endif
                                     &this_thr->th.th_current_task->td_icvs,
                                     0 USE_NESTED_HOT_ARG(NULL));
      __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock);
      KMP_ASSERT(new_team);

      /* setup new serialized team and install it */
      new_team->t.t_threads[0] = this_thr;
      new_team->t.t_parent = this_thr->th.th_team;
      serial_team = new_team;
      this_thr->th.th_serial_team = serial_team;
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d allocated new serial team %p\n",
           global_tid, serial_team));

      /* TODO the above breaks the requirement that if we run out of resources,
         then we can still guarantee that serialized teams are ok, since we may
         need to allocate a new one */
    } else {
      KF_TRACE(
          10,
          ("__kmpc_serialized_parallel: T#%d reusing cached serial team %p\n",
           global_tid, serial_team));
    }

    /* we have to initialize this serial team */
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    KMP_DEBUG_ASSERT(this_thr->th.th_team != serial_team);
    serial_team->t.t_ident = loc;
    serial_team->t.t_serialized = 1;
    serial_team->t.t_nproc = 1;
    serial_team->t.t_parent = this_thr->th.th_team;
    serial_team->t.t_sched.sched = this_thr->th.th_team->t.t_sched.sched;
    this_thr->th.th_team = serial_team;
    serial_team->t.t_master_tid = this_thr->th.th_info.ds.ds_tid;

    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d curtask=%p\n", global_tid,
                  this_thr->th.th_current_task));
    KMP_ASSERT(this_thr->th.th_current_task->td_flags.executing == 1);
    this_thr->th.th_current_task->td_flags.executing = 0;

    __kmp_push_current_task_to_thread(this_thr, serial_team, 0);

    /* TODO: GEH: do ICVs work for nested serialized teams? Don't we need an
       implicit task for each serialized task represented by
       team->t.t_serialized? */
    copy_icvs(&this_thr->th.th_current_task->td_icvs,
              &this_thr->th.th_current_task->td_parent->td_icvs);

    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }

#if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.used &&
        (level + 1 < __kmp_nested_proc_bind.used)) {
      this_thr->th.th_current_task->td_icvs.proc_bind =
          __kmp_nested_proc_bind.bind_types[level + 1];
    }
#endif /* OMP_40_ENABLED */

#if USE_DEBUGGER
    serial_team->t.t_pkfn = (microtask_t)(~0); // For the debugger.
#endif
    this_thr->th.th_info.ds.ds_tid = 0;

    /* set thread cache values */
    this_thr->th.th_team_nproc = 1;
    this_thr->th.th_team_master = this_thr;
    this_thr->th.th_team_serialized = 1;

    serial_team->t.t_level = serial_team->t.t_parent->t.t_level + 1;
    serial_team->t.t_active_level = serial_team->t.t_parent->t.t_active_level;

    propagateFPControl(serial_team);

    /* check if we need to allocate dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    if (!serial_team->t.t_dispatch->th_disp_buffer) {
      serial_team->t.t_dispatch->th_disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();

  } else {
    /* this serialized team is already being used,
     * that's fine, just add another nested level */
    KMP_DEBUG_ASSERT(this_thr->th.th_team == serial_team);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads);
    KMP_DEBUG_ASSERT(serial_team->t.t_threads[0] == this_thr);
    ++serial_team->t.t_serialized;
    this_thr->th.th_team_serialized = serial_team->t.t_serialized;

    // Nested level will be an index in the nested nthreads array
    int level = this_thr->th.th_team->t.t_level;
    // Thread value exists in the nested nthreads array for the next nested
    // level
    if (__kmp_nested_nth.used && (level + 1 < __kmp_nested_nth.used)) {
      this_thr->th.th_current_task->td_icvs.nproc =
          __kmp_nested_nth.nth[level + 1];
    }
    serial_team->t.t_level++;
    KF_TRACE(10, ("__kmpc_serialized_parallel: T#%d increasing nesting level "
                  "of serial team %p to %d\n",
                  global_tid, serial_team, serial_team->t.t_level));

    /* allocate/push dispatch buffers stack */
    KMP_DEBUG_ASSERT(serial_team->t.t_dispatch);
    {
      dispatch_private_info_t *disp_buffer =
          (dispatch_private_info_t *)__kmp_allocate(
              sizeof(dispatch_private_info_t));
      disp_buffer->next = serial_team->t.t_dispatch->th_disp_buffer;
      serial_team->t.t_dispatch->th_disp_buffer = disp_buffer;
    }
    this_thr->th.th_dispatch = serial_team->t.t_dispatch;

    KMP_MB();
  }
#if OMP_40_ENABLED
  KMP_CHECK_UPDATE(serial_team->t.t_cancel_request, cancel_noreq);
#endif

  if (__kmp_env_consistency_check)
    __kmp_push_parallel(global_tid, NULL);
#if OMPT_SUPPORT
  serial_team->t.ompt_team_info.master_return_address = codeptr;
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state != omp_state_overhead) {
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);

    ompt_lw_taskteam_t lw_taskteam;
    __ompt_lw_taskteam_init(&lw_taskteam, this_thr, global_tid,
                            &ompt_parallel_data, codeptr);

    __ompt_lw_taskteam_link(&lw_taskteam, this_thr, 1);
    // don't use lw_taskteam after linking. content was swapped
    /* OMPT implicit task begin */
    implicit_task_data = OMPT_CUR_TASK_DATA(this_thr);
    if (ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_begin, OMPT_CUR_TEAM_DATA(this_thr),
          OMPT_CUR_TASK_DATA(this_thr), 1, __kmp_tid_from_gtid(global_tid));
      OMPT_CUR_TASK_INFO(this_thr)
          ->thread_num = __kmp_tid_from_gtid(global_tid);
    }

    /* OMPT state */
    this_thr->th.ompt_thread_info.state = omp_state_work_parallel;
    OMPT_CUR_TASK_INFO(this_thr)->frame.exit_frame = OMPT_GET_FRAME_ADDRESS(1);
  }
#endif
}

/* most of the work for a fork */
/* return true if we really went parallel, false if serialized */
int __kmp_fork_call(ident_t *loc, int gtid,
                    enum fork_context_e call_context, // Intel, GNU, ...
                    kmp_int32 argc, microtask_t microtask, launch_t invoker,
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
                    va_list *ap
#else
                    va_list ap
#endif
                    ) {
  void **argv;
  int i;
  int master_tid;
  int master_this_cons;
  kmp_team_t *team;
  kmp_team_t *parent_team;
  kmp_info_t *master_th;
  kmp_root_t *root;
  int nthreads;
  int master_active;
  int master_set_numthreads;
  int level;
#if OMP_40_ENABLED
  int active_level;
  int teams_level;
#endif
#if KMP_NESTED_HOT_TEAMS
  kmp_hot_team_ptr_t **p_hot_teams;
#endif
  { // KMP_TIME_BLOCK
    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_fork_call);
    KMP_COUNT_VALUE(OMP_PARALLEL_args, argc);

    KA_TRACE(20, ("__kmp_fork_call: enter T#%d\n", gtid));
    if (__kmp_stkpadding > 0 && __kmp_root[gtid] != NULL) {
      /* Some systems prefer the stack for the root thread(s) to start with */
      /* some gap from the parent stack to prevent false sharing. */
      void *dummy = KMP_ALLOCA(__kmp_stkpadding);
      /* These 2 lines below are so this does not get optimized out */
      if (__kmp_stkpadding > KMP_MAX_STKPADDING)
        __kmp_stkpadding += (short)((kmp_int64)dummy);
    }

    /* initialize if needed */
    KMP_DEBUG_ASSERT(
        __kmp_init_serial); // AC: potentially unsafe, not in sync with shutdown
    if (!TCR_4(__kmp_init_parallel))
      __kmp_parallel_initialize();

    /* setup current data */
    master_th = __kmp_threads[gtid]; // AC: potentially unsafe, not in sync with
    // shutdown
    parent_team = master_th->th.th_team;
    master_tid = master_th->th.th_info.ds.ds_tid;
    master_this_cons = master_th->th.th_local.this_construct;
    root = master_th->th.th_root;
    master_active = root->r.r_active;
    master_set_numthreads = master_th->th.th_set_nproc;

#if OMPT_SUPPORT
    ompt_data_t ompt_parallel_data;
    ompt_parallel_data.ptr = NULL;
    ompt_data_t *parent_task_data;
    omp_frame_t *ompt_frame;
    ompt_data_t *implicit_task_data;
    void *return_address = NULL;

    if (ompt_enabled.enabled) {
      __ompt_get_task_info_internal(0, NULL, &parent_task_data, &ompt_frame,
                                    NULL, NULL);
      return_address = OMPT_LOAD_RETURN_ADDRESS(gtid);
    }
#endif

    // Nested level will be an index in the nested nthreads array
    level = parent_team->t.t_level;
    // used to launch non-serial teams even if nested is not allowed
    active_level = parent_team->t.t_active_level;
#if OMP_40_ENABLED
    // needed to check nesting inside the teams
    teams_level = master_th->th.th_teams_level;
#endif
#if KMP_NESTED_HOT_TEAMS
    p_hot_teams = &master_th->th.th_hot_teams;
    if (*p_hot_teams == NULL && __kmp_hot_teams_max_level > 0) {
      *p_hot_teams = (kmp_hot_team_ptr_t *)__kmp_allocate(
          sizeof(kmp_hot_team_ptr_t) * __kmp_hot_teams_max_level);
      (*p_hot_teams)[0].hot_team = root->r.r_hot_team;
      // it is either actual or not needed (when active_level > 0)
      (*p_hot_teams)[0].hot_team_nth = 1;
    }
#endif

#if OMPT_SUPPORT
    if (ompt_enabled.enabled) {
      if (ompt_enabled.ompt_callback_parallel_begin) {
        int team_size = master_set_numthreads
                            ? master_set_numthreads
                            : get__nproc_2(parent_team, master_tid);
        ompt_callbacks.ompt_callback(ompt_callback_parallel_begin)(
            parent_task_data, ompt_frame, &ompt_parallel_data, team_size,
            OMPT_INVOKER(call_context), return_address);
      }
      master_th->th.ompt_thread_info.state = omp_state_overhead;
    }
#endif

    master_th->th.th_ident = loc;

#if OMP_40_ENABLED
    if (master_th->th.th_teams_microtask && ap &&
        microtask != (microtask_t)__kmp_teams_master && level == teams_level) {
      // AC: This is start of parallel that is nested inside teams construct.
      // The team is actual (hot), all workers are ready at the fork barrier.
      // No lock needed to initialize the team a bit, then free workers.
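      // In outline: the parent (hot) team is reinitialized in place -- its
      // t_argc/t_argv are refreshed with the new microtask arguments, the
      // nesting levels are bumped, __kmp_internal_fork() releases the workers
      // waiting at the fork barrier, and the master invokes the microtask
      // directly before returning TRUE (unless the parent team is serialized,
      // which is handled first).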
      parent_team->t.t_ident = loc;
      __kmp_alloc_argv_entries(argc, parent_team, TRUE);
      parent_team->t.t_argc = argc;
      argv = (void **)parent_team->t.t_argv;
      for (i = argc - 1; i >= 0; --i)
/* TODO: revert workaround for Intel(R) 64 tracker #96 */
#if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX
        *argv++ = va_arg(*ap, void *);
#else
        *argv++ = va_arg(ap, void *);
#endif
      // Increment our nested depth levels, but not increase the serialization
      if (parent_team == master_th->th.th_serial_team) {
        // AC: we are in serialized parallel
        __kmpc_serialized_parallel(loc, gtid);
        KMP_DEBUG_ASSERT(parent_team->t.t_serialized > 1);
        // AC: need this in order for enquiry functions to work
        // correctly, will restore at join time
        parent_team->t.t_serialized--;
#if OMPT_SUPPORT
        void *dummy;
        void **exit_runtime_p;

        ompt_lw_taskteam_t lw_taskteam;

        if (ompt_enabled.enabled) {
          __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid,
                                  &ompt_parallel_data, return_address);
          exit_runtime_p = &(lw_taskteam.ompt_task_info.frame.exit_frame);

          __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0);
          // don't use lw_taskteam after linking. content was swapped

          /* OMPT implicit task begin */
          implicit_task_data = OMPT_CUR_TASK_DATA(master_th);
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th),
                implicit_task_data, 1, __kmp_tid_from_gtid(gtid));
            OMPT_CUR_TASK_INFO(master_th)
                ->thread_num = __kmp_tid_from_gtid(gtid);
          }

          /* OMPT state */
          master_th->th.ompt_thread_info.state = omp_state_work_parallel;
        } else {
          exit_runtime_p = &dummy;
        }
#endif

        {
          KMP_TIME_PARTITIONED_BLOCK(OMP_parallel);
          KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK);
          __kmp_invoke_microtask(microtask, gtid, 0, argc, parent_team->t.t_argv
#if OMPT_SUPPORT
                                 ,
                                 exit_runtime_p
#endif
                                 );
        }

#if OMPT_SUPPORT
        *exit_runtime_p = NULL;
        if (ompt_enabled.enabled) {
          OMPT_CUR_TASK_INFO(master_th)->frame.exit_frame = NULL;
          if (ompt_enabled.ompt_callback_implicit_task) {
            ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
                ompt_scope_end, NULL, implicit_task_data, 1,
                OMPT_CUR_TASK_INFO(master_th)->thread_num);
          }
          __ompt_lw_taskteam_unlink(master_th);

          if (ompt_enabled.ompt_callback_parallel_end) {
            ompt_callbacks.ompt_callback(ompt_callback_parallel_end)(
                OMPT_CUR_TEAM_DATA(master_th), OMPT_CUR_TASK_DATA(master_th),
                OMPT_INVOKER(call_context), return_address);
          }
          master_th->th.ompt_thread_info.state = omp_state_overhead;
        }
#endif
        return TRUE;
      }

      parent_team->t.t_pkfn = microtask;
      parent_team->t.t_invoke = invoker;
      KMP_ATOMIC_INC(&root->r.r_in_parallel);
      parent_team->t.t_active_level++;
      parent_team->t.t_level++;

      /* Change number of threads in the team if requested */
      if (master_set_numthreads) { // The parallel has num_threads clause
        if (master_set_numthreads < master_th->th.th_teams_size.nth) {
          // AC: only can reduce number of threads dynamically, can't increase
          kmp_info_t **other_threads = parent_team->t.t_threads;
          parent_team->t.t_nproc = master_set_numthreads;
          for (i = 0; i < master_set_numthreads; ++i) {
other_threads[i]->th.th_team_nproc = master_set_numthreads; 1620 } 1621 // Keep extra threads hot in the team for possible next parallels 1622 } 1623 master_th->th.th_set_nproc = 0; 1624 } 1625 1626 #if USE_DEBUGGER 1627 if (__kmp_debugging) { // Let debugger override number of threads. 1628 int nth = __kmp_omp_num_threads(loc); 1629 if (nth > 0) { // 0 means debugger doesn't want to change num threads 1630 master_set_numthreads = nth; 1631 } 1632 } 1633 #endif 1634 1635 KF_TRACE(10, ("__kmp_fork_call: before internal fork: root=%p, team=%p, " 1636 "master_th=%p, gtid=%d\n", 1637 root, parent_team, master_th, gtid)); 1638 __kmp_internal_fork(loc, gtid, parent_team); 1639 KF_TRACE(10, ("__kmp_fork_call: after internal fork: root=%p, team=%p, " 1640 "master_th=%p, gtid=%d\n", 1641 root, parent_team, master_th, gtid)); 1642 1643 /* Invoke microtask for MASTER thread */ 1644 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 1645 parent_team->t.t_id, parent_team->t.t_pkfn)); 1646 1647 if (!parent_team->t.t_invoke(gtid)) { 1648 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 1649 } 1650 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 1651 parent_team->t.t_id, parent_team->t.t_pkfn)); 1652 KMP_MB(); /* Flush all pending memory write invalidates. */ 1653 1654 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 1655 1656 return TRUE; 1657 } // Parallel closely nested in teams construct 1658 #endif /* OMP_40_ENABLED */ 1659 1660 #if KMP_DEBUG 1661 if (__kmp_tasking_mode != tskm_immediate_exec) { 1662 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 1663 parent_team->t.t_task_team[master_th->th.th_task_state]); 1664 } 1665 #endif 1666 1667 if (parent_team->t.t_active_level >= 1668 master_th->th.th_current_task->td_icvs.max_active_levels) { 1669 nthreads = 1; 1670 } else { 1671 #if OMP_40_ENABLED 1672 int enter_teams = ((ap == NULL && active_level == 0) || 1673 (ap && teams_level > 0 && teams_level == level)); 1674 #endif 1675 nthreads = 1676 master_set_numthreads 1677 ? master_set_numthreads 1678 : get__nproc_2( 1679 parent_team, 1680 master_tid); // TODO: get nproc directly from current task 1681 1682 // Check if we need to take forkjoin lock? (no need for serialized 1683 // parallel out of teams construct). This code moved here from 1684 // __kmp_reserve_threads() to speedup nested serialized parallels. 1685 if (nthreads > 1) { 1686 if ((!get__nested(master_th) && (root->r.r_in_parallel 1687 #if OMP_40_ENABLED 1688 && !enter_teams 1689 #endif /* OMP_40_ENABLED */ 1690 )) || 1691 (__kmp_library == library_serial)) { 1692 KC_TRACE(10, ("__kmp_fork_call: T#%d serializing team; requested %d" 1693 " threads\n", 1694 gtid, nthreads)); 1695 nthreads = 1; 1696 } 1697 } 1698 if (nthreads > 1) { 1699 /* determine how many new threads we can use */ 1700 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 1701 nthreads = __kmp_reserve_threads( 1702 root, parent_team, master_tid, nthreads 1703 #if OMP_40_ENABLED 1704 /* AC: If we execute teams from parallel region (on host), then 1705 teams should be created but each can only have 1 thread if 1706 nesting is disabled. If teams called from serial region, then 1707 teams and their threads should be created regardless of the 1708 nesting setting. 
*/ 1709 , 1710 enter_teams 1711 #endif /* OMP_40_ENABLED */ 1712 ); 1713 if (nthreads == 1) { 1714 // Free lock for single thread execution here; for multi-thread 1715 // execution it will be freed later after team of threads created 1716 // and initialized 1717 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 1718 } 1719 } 1720 } 1721 KMP_DEBUG_ASSERT(nthreads > 0); 1722 1723 // If we temporarily changed the set number of threads then restore it now 1724 master_th->th.th_set_nproc = 0; 1725 1726 /* create a serialized parallel region? */ 1727 if (nthreads == 1) { 1728 /* josh todo: hypothetical question: what do we do for OS X*? */ 1729 #if KMP_OS_LINUX && \ 1730 (KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) 1731 void *args[argc]; 1732 #else 1733 void **args = (void **)KMP_ALLOCA(argc * sizeof(void *)); 1734 #endif /* KMP_OS_LINUX && ( KMP_ARCH_X86 || KMP_ARCH_X86_64 || KMP_ARCH_ARM || \ 1735 KMP_ARCH_AARCH64) */ 1736 1737 KA_TRACE(20, 1738 ("__kmp_fork_call: T#%d serializing parallel region\n", gtid)); 1739 1740 __kmpc_serialized_parallel(loc, gtid); 1741 1742 if (call_context == fork_context_intel) { 1743 /* TODO this sucks, use the compiler itself to pass args! :) */ 1744 master_th->th.th_serial_team->t.t_ident = loc; 1745 #if OMP_40_ENABLED 1746 if (!ap) { 1747 // revert change made in __kmpc_serialized_parallel() 1748 master_th->th.th_serial_team->t.t_level--; 1749 // Get args from parent team for teams construct 1750 1751 #if OMPT_SUPPORT 1752 void *dummy; 1753 void **exit_runtime_p; 1754 ompt_task_info_t *task_info; 1755 1756 ompt_lw_taskteam_t lw_taskteam; 1757 1758 if (ompt_enabled.enabled) { 1759 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1760 &ompt_parallel_data, return_address); 1761 1762 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1763 // don't use lw_taskteam after linking. 
content was swaped 1764 1765 task_info = OMPT_CUR_TASK_INFO(master_th); 1766 exit_runtime_p = &(task_info->frame.exit_frame); 1767 if (ompt_enabled.ompt_callback_implicit_task) { 1768 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1769 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1770 &(task_info->task_data), 1, __kmp_tid_from_gtid(gtid)); 1771 OMPT_CUR_TASK_INFO(master_th) 1772 ->thread_num = __kmp_tid_from_gtid(gtid); 1773 } 1774 1775 /* OMPT state */ 1776 master_th->th.ompt_thread_info.state = omp_state_work_parallel; 1777 } else { 1778 exit_runtime_p = &dummy; 1779 } 1780 #endif 1781 1782 { 1783 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1784 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1785 __kmp_invoke_microtask(microtask, gtid, 0, argc, 1786 parent_team->t.t_argv 1787 #if OMPT_SUPPORT 1788 , 1789 exit_runtime_p 1790 #endif 1791 ); 1792 } 1793 1794 #if OMPT_SUPPORT 1795 if (ompt_enabled.enabled) { 1796 exit_runtime_p = NULL; 1797 if (ompt_enabled.ompt_callback_implicit_task) { 1798 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1799 ompt_scope_end, NULL, &(task_info->task_data), 1, 1800 OMPT_CUR_TASK_INFO(master_th)->thread_num); 1801 } 1802 1803 __ompt_lw_taskteam_unlink(master_th); 1804 if (ompt_enabled.ompt_callback_parallel_end) { 1805 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1806 OMPT_CUR_TEAM_DATA(master_th), parent_task_data, 1807 OMPT_INVOKER(call_context), return_address); 1808 } 1809 master_th->th.ompt_thread_info.state = omp_state_overhead; 1810 } 1811 #endif 1812 } else if (microtask == (microtask_t)__kmp_teams_master) { 1813 KMP_DEBUG_ASSERT(master_th->th.th_team == 1814 master_th->th.th_serial_team); 1815 team = master_th->th.th_team; 1816 // team->t.t_pkfn = microtask; 1817 team->t.t_invoke = invoker; 1818 __kmp_alloc_argv_entries(argc, team, TRUE); 1819 team->t.t_argc = argc; 1820 argv = (void **)team->t.t_argv; 1821 if (ap) { 1822 for (i = argc - 1; i >= 0; --i) 1823 // TODO: revert workaround for Intel(R) 64 tracker #96 1824 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1825 *argv++ = va_arg(*ap, void *); 1826 #else 1827 *argv++ = va_arg(ap, void *); 1828 #endif 1829 } else { 1830 for (i = 0; i < argc; ++i) 1831 // Get args from parent team for teams construct 1832 argv[i] = parent_team->t.t_argv[i]; 1833 } 1834 // AC: revert change made in __kmpc_serialized_parallel() 1835 // because initial code in teams should have level=0 1836 team->t.t_level--; 1837 // AC: call special invoker for outer "parallel" of teams construct 1838 invoker(gtid); 1839 } else { 1840 #endif /* OMP_40_ENABLED */ 1841 argv = args; 1842 for (i = argc - 1; i >= 0; --i) 1843 // TODO: revert workaround for Intel(R) 64 tracker #96 1844 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 1845 *argv++ = va_arg(*ap, void *); 1846 #else 1847 *argv++ = va_arg(ap, void *); 1848 #endif 1849 KMP_MB(); 1850 1851 #if OMPT_SUPPORT 1852 void *dummy; 1853 void **exit_runtime_p; 1854 ompt_task_info_t *task_info; 1855 1856 ompt_lw_taskteam_t lw_taskteam; 1857 1858 if (ompt_enabled.enabled) { 1859 __ompt_lw_taskteam_init(&lw_taskteam, master_th, gtid, 1860 &ompt_parallel_data, return_address); 1861 __ompt_lw_taskteam_link(&lw_taskteam, master_th, 0); 1862 // don't use lw_taskteam after linking. 
content was swaped 1863 task_info = OMPT_CUR_TASK_INFO(master_th); 1864 exit_runtime_p = &(task_info->frame.exit_frame); 1865 1866 /* OMPT implicit task begin */ 1867 implicit_task_data = OMPT_CUR_TASK_DATA(master_th); 1868 if (ompt_enabled.ompt_callback_implicit_task) { 1869 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1870 ompt_scope_begin, OMPT_CUR_TEAM_DATA(master_th), 1871 implicit_task_data, 1, __kmp_tid_from_gtid(gtid)); 1872 OMPT_CUR_TASK_INFO(master_th) 1873 ->thread_num = __kmp_tid_from_gtid(gtid); 1874 } 1875 1876 /* OMPT state */ 1877 master_th->th.ompt_thread_info.state = omp_state_work_parallel; 1878 } else { 1879 exit_runtime_p = &dummy; 1880 } 1881 #endif 1882 1883 { 1884 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 1885 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 1886 __kmp_invoke_microtask(microtask, gtid, 0, argc, args 1887 #if OMPT_SUPPORT 1888 , 1889 exit_runtime_p 1890 #endif 1891 ); 1892 } 1893 1894 #if OMPT_SUPPORT 1895 if (ompt_enabled.enabled) { 1896 *exit_runtime_p = NULL; 1897 if (ompt_enabled.ompt_callback_implicit_task) { 1898 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 1899 ompt_scope_end, NULL, &(task_info->task_data), 1, 1900 OMPT_CUR_TASK_INFO(master_th)->thread_num); 1901 } 1902 1903 ompt_parallel_data = *OMPT_CUR_TEAM_DATA(master_th); 1904 __ompt_lw_taskteam_unlink(master_th); 1905 if (ompt_enabled.ompt_callback_parallel_end) { 1906 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 1907 &ompt_parallel_data, parent_task_data, 1908 OMPT_INVOKER(call_context), return_address); 1909 } 1910 master_th->th.ompt_thread_info.state = omp_state_overhead; 1911 } 1912 #endif 1913 #if OMP_40_ENABLED 1914 } 1915 #endif /* OMP_40_ENABLED */ 1916 } else if (call_context == fork_context_gnu) { 1917 #if OMPT_SUPPORT 1918 ompt_lw_taskteam_t lwt; 1919 __ompt_lw_taskteam_init(&lwt, master_th, gtid, &ompt_parallel_data, 1920 return_address); 1921 1922 lwt.ompt_task_info.frame.exit_frame = NULL; 1923 __ompt_lw_taskteam_link(&lwt, master_th, 1); 1924 // don't use lw_taskteam after linking. content was swaped 1925 #endif 1926 1927 // we were called from GNU native code 1928 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1929 return FALSE; 1930 } else { 1931 KMP_ASSERT2(call_context < fork_context_last, 1932 "__kmp_fork_call: unknown fork_context parameter"); 1933 } 1934 1935 KA_TRACE(20, ("__kmp_fork_call: T#%d serial exit\n", gtid)); 1936 KMP_MB(); 1937 return FALSE; 1938 } 1939 1940 // GEH: only modify the executing flag in the case when not serialized 1941 // serialized case is handled in kmpc_serialized_parallel 1942 KF_TRACE(10, ("__kmp_fork_call: parent_team_aclevel=%d, master_th=%p, " 1943 "curtask=%p, curtask_max_aclevel=%d\n", 1944 parent_team->t.t_active_level, master_th, 1945 master_th->th.th_current_task, 1946 master_th->th.th_current_task->td_icvs.max_active_levels)); 1947 // TODO: GEH - cannot do this assertion because root thread not set up as 1948 // executing 1949 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 1 ); 1950 master_th->th.th_current_task->td_flags.executing = 0; 1951 1952 #if OMP_40_ENABLED 1953 if (!master_th->th.th_teams_microtask || level > teams_level) 1954 #endif /* OMP_40_ENABLED */ 1955 { 1956 /* Increment our nested depth level */ 1957 KMP_ATOMIC_INC(&root->r.r_in_parallel); 1958 } 1959 1960 // See if we need to make a copy of the ICVs. 
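  /* Illustrative note, not part of the runtime: __kmp_nested_nth holds the
     per-level nthreads-var list, e.g. from a setting such as
     OMP_NUM_THREADS=8,2 (hypothetical example values). With that setting,
     the outermost fork (level == 0) sees __kmp_nested_nth.nth[level + 1] == 2
     while the master's inherited nproc ICV is 8, so nthreads_icv becomes 2
     and a fresh ICV copy is made below; threads of the new team then default
     to 2 threads for any nested parallel region:

         // OMP_NUM_THREADS=8,2 ./a.out      (assumed environment)
         // #pragma omp parallel             -> outer team of 8
         // #pragma omp parallel             -> nested teams of 2 (if enabled)
  */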
1961 int nthreads_icv = master_th->th.th_current_task->td_icvs.nproc; 1962 if ((level + 1 < __kmp_nested_nth.used) && 1963 (__kmp_nested_nth.nth[level + 1] != nthreads_icv)) { 1964 nthreads_icv = __kmp_nested_nth.nth[level + 1]; 1965 } else { 1966 nthreads_icv = 0; // don't update 1967 } 1968 1969 #if OMP_40_ENABLED 1970 // Figure out the proc_bind_policy for the new team. 1971 kmp_proc_bind_t proc_bind = master_th->th.th_set_proc_bind; 1972 kmp_proc_bind_t proc_bind_icv = 1973 proc_bind_default; // proc_bind_default means don't update 1974 if (master_th->th.th_current_task->td_icvs.proc_bind == proc_bind_false) { 1975 proc_bind = proc_bind_false; 1976 } else { 1977 if (proc_bind == proc_bind_default) { 1978 // No proc_bind clause specified; use current proc-bind-var for this 1979 // parallel region 1980 proc_bind = master_th->th.th_current_task->td_icvs.proc_bind; 1981 } 1982 /* else: The proc_bind policy was specified explicitly on parallel clause. 1983 This overrides proc-bind-var for this parallel region, but does not 1984 change proc-bind-var. */ 1985 // Figure the value of proc-bind-var for the child threads. 1986 if ((level + 1 < __kmp_nested_proc_bind.used) && 1987 (__kmp_nested_proc_bind.bind_types[level + 1] != 1988 master_th->th.th_current_task->td_icvs.proc_bind)) { 1989 proc_bind_icv = __kmp_nested_proc_bind.bind_types[level + 1]; 1990 } 1991 } 1992 1993 // Reset for next parallel region 1994 master_th->th.th_set_proc_bind = proc_bind_default; 1995 #endif /* OMP_40_ENABLED */ 1996 1997 if ((nthreads_icv > 0) 1998 #if OMP_40_ENABLED 1999 || (proc_bind_icv != proc_bind_default) 2000 #endif /* OMP_40_ENABLED */ 2001 ) { 2002 kmp_internal_control_t new_icvs; 2003 copy_icvs(&new_icvs, &master_th->th.th_current_task->td_icvs); 2004 new_icvs.next = NULL; 2005 if (nthreads_icv > 0) { 2006 new_icvs.nproc = nthreads_icv; 2007 } 2008 2009 #if OMP_40_ENABLED 2010 if (proc_bind_icv != proc_bind_default) { 2011 new_icvs.proc_bind = proc_bind_icv; 2012 } 2013 #endif /* OMP_40_ENABLED */ 2014 2015 /* allocate a new parallel team */ 2016 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2017 team = __kmp_allocate_team(root, nthreads, nthreads, 2018 #if OMPT_SUPPORT 2019 ompt_parallel_data, 2020 #endif 2021 #if OMP_40_ENABLED 2022 proc_bind, 2023 #endif 2024 &new_icvs, argc USE_NESTED_HOT_ARG(master_th)); 2025 } else { 2026 /* allocate a new parallel team */ 2027 KF_TRACE(10, ("__kmp_fork_call: before __kmp_allocate_team\n")); 2028 team = __kmp_allocate_team(root, nthreads, nthreads, 2029 #if OMPT_SUPPORT 2030 ompt_parallel_data, 2031 #endif 2032 #if OMP_40_ENABLED 2033 proc_bind, 2034 #endif 2035 &master_th->th.th_current_task->td_icvs, 2036 argc USE_NESTED_HOT_ARG(master_th)); 2037 } 2038 KF_TRACE( 2039 10, ("__kmp_fork_call: after __kmp_allocate_team - team = %p\n", team)); 2040 2041 /* setup the new team */ 2042 KMP_CHECK_UPDATE(team->t.t_master_tid, master_tid); 2043 KMP_CHECK_UPDATE(team->t.t_master_this_cons, master_this_cons); 2044 KMP_CHECK_UPDATE(team->t.t_ident, loc); 2045 KMP_CHECK_UPDATE(team->t.t_parent, parent_team); 2046 KMP_CHECK_UPDATE_SYNC(team->t.t_pkfn, microtask); 2047 #if OMPT_SUPPORT 2048 KMP_CHECK_UPDATE_SYNC(team->t.ompt_team_info.master_return_address, 2049 return_address); 2050 #endif 2051 KMP_CHECK_UPDATE(team->t.t_invoke, invoker); // TODO move to root, maybe 2052 // TODO: parent_team->t.t_level == INT_MAX ??? 
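  /* Illustrative note, not part of the runtime: the KMP_CHECK_UPDATE macros
     used above store a field only when its value actually changes, so
     re-forking the same (hot) team does not dirty shared cache lines for
     identical values. A minimal sketch of the idea (the real definitions
     live in kmp.h and may differ in detail):

         // #define KMP_CHECK_UPDATE(a, b)  if ((a) != (b)) (a) = (b)

     i.e. fields such as t_ident, t_parent and t_invoke are rewritten only
     when this fork differs from the previous one. */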
2053 #if OMP_40_ENABLED 2054 if (!master_th->th.th_teams_microtask || level > teams_level) { 2055 #endif /* OMP_40_ENABLED */ 2056 int new_level = parent_team->t.t_level + 1; 2057 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2058 new_level = parent_team->t.t_active_level + 1; 2059 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2060 #if OMP_40_ENABLED 2061 } else { 2062 // AC: Do not increase parallel level at start of the teams construct 2063 int new_level = parent_team->t.t_level; 2064 KMP_CHECK_UPDATE(team->t.t_level, new_level); 2065 new_level = parent_team->t.t_active_level; 2066 KMP_CHECK_UPDATE(team->t.t_active_level, new_level); 2067 } 2068 #endif /* OMP_40_ENABLED */ 2069 kmp_r_sched_t new_sched = get__sched_2(parent_team, master_tid); 2070 // set master's schedule as new run-time schedule 2071 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 2072 2073 #if OMP_40_ENABLED 2074 KMP_CHECK_UPDATE(team->t.t_cancel_request, cancel_noreq); 2075 #endif 2076 2077 // Update the floating point rounding in the team if required. 2078 propagateFPControl(team); 2079 2080 if (__kmp_tasking_mode != tskm_immediate_exec) { 2081 // Set master's task team to team's task team. Unless this is hot team, it 2082 // should be NULL. 2083 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2084 parent_team->t.t_task_team[master_th->th.th_task_state]); 2085 KA_TRACE(20, ("__kmp_fork_call: Master T#%d pushing task_team %p / team " 2086 "%p, new task_team %p / team %p\n", 2087 __kmp_gtid_from_thread(master_th), 2088 master_th->th.th_task_team, parent_team, 2089 team->t.t_task_team[master_th->th.th_task_state], team)); 2090 2091 if (active_level || master_th->th.th_task_team) { 2092 // Take a memo of master's task_state 2093 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2094 if (master_th->th.th_task_state_top >= 2095 master_th->th.th_task_state_stack_sz) { // increase size 2096 kmp_uint32 new_size = 2 * master_th->th.th_task_state_stack_sz; 2097 kmp_uint8 *old_stack, *new_stack; 2098 kmp_uint32 i; 2099 new_stack = (kmp_uint8 *)__kmp_allocate(new_size); 2100 for (i = 0; i < master_th->th.th_task_state_stack_sz; ++i) { 2101 new_stack[i] = master_th->th.th_task_state_memo_stack[i]; 2102 } 2103 for (i = master_th->th.th_task_state_stack_sz; i < new_size; 2104 ++i) { // zero-init rest of stack 2105 new_stack[i] = 0; 2106 } 2107 old_stack = master_th->th.th_task_state_memo_stack; 2108 master_th->th.th_task_state_memo_stack = new_stack; 2109 master_th->th.th_task_state_stack_sz = new_size; 2110 __kmp_free(old_stack); 2111 } 2112 // Store master's task_state on stack 2113 master_th->th 2114 .th_task_state_memo_stack[master_th->th.th_task_state_top] = 2115 master_th->th.th_task_state; 2116 master_th->th.th_task_state_top++; 2117 #if KMP_NESTED_HOT_TEAMS 2118 if (team == master_th->th.th_hot_teams[active_level].hot_team) { 2119 // Restore master's nested state if nested hot team 2120 master_th->th.th_task_state = 2121 master_th->th 2122 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2123 } else { 2124 #endif 2125 master_th->th.th_task_state = 0; 2126 #if KMP_NESTED_HOT_TEAMS 2127 } 2128 #endif 2129 } 2130 #if !KMP_NESTED_HOT_TEAMS 2131 KMP_DEBUG_ASSERT((master_th->th.th_task_team == NULL) || 2132 (team == root->r.r_hot_team)); 2133 #endif 2134 } 2135 2136 KA_TRACE( 2137 20, 2138 ("__kmp_fork_call: T#%d(%d:%d)->(%d:0) created a team of %d threads\n", 2139 gtid, parent_team->t.t_id, team->t.t_master_tid, team->t.t_id, 2140 team->t.t_nproc)); 2141 KMP_DEBUG_ASSERT(team != 
root->r.r_hot_team || 2142 (team->t.t_master_tid == 0 && 2143 (team->t.t_parent == root->r.r_root_team || 2144 team->t.t_parent->t.t_serialized))); 2145 KMP_MB(); 2146 2147 /* now, setup the arguments */ 2148 argv = (void **)team->t.t_argv; 2149 #if OMP_40_ENABLED 2150 if (ap) { 2151 #endif /* OMP_40_ENABLED */ 2152 for (i = argc - 1; i >= 0; --i) { 2153 // TODO: revert workaround for Intel(R) 64 tracker #96 2154 #if (KMP_ARCH_X86_64 || KMP_ARCH_ARM || KMP_ARCH_AARCH64) && KMP_OS_LINUX 2155 void *new_argv = va_arg(*ap, void *); 2156 #else 2157 void *new_argv = va_arg(ap, void *); 2158 #endif 2159 KMP_CHECK_UPDATE(*argv, new_argv); 2160 argv++; 2161 } 2162 #if OMP_40_ENABLED 2163 } else { 2164 for (i = 0; i < argc; ++i) { 2165 // Get args from parent team for teams construct 2166 KMP_CHECK_UPDATE(argv[i], team->t.t_parent->t.t_argv[i]); 2167 } 2168 } 2169 #endif /* OMP_40_ENABLED */ 2170 2171 /* now actually fork the threads */ 2172 KMP_CHECK_UPDATE(team->t.t_master_active, master_active); 2173 if (!root->r.r_active) // Only do assignment if it prevents cache ping-pong 2174 root->r.r_active = TRUE; 2175 2176 __kmp_fork_team_threads(root, team, master_th, gtid); 2177 __kmp_setup_icv_copy(team, nthreads, 2178 &master_th->th.th_current_task->td_icvs, loc); 2179 2180 #if OMPT_SUPPORT 2181 master_th->th.ompt_thread_info.state = omp_state_work_parallel; 2182 #endif 2183 2184 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2185 2186 #if USE_ITT_BUILD 2187 if (team->t.t_active_level == 1 // only report frames at level 1 2188 #if OMP_40_ENABLED 2189 && !master_th->th.th_teams_microtask // not in teams construct 2190 #endif /* OMP_40_ENABLED */ 2191 ) { 2192 #if USE_ITT_NOTIFY 2193 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2194 (__kmp_forkjoin_frames_mode == 3 || 2195 __kmp_forkjoin_frames_mode == 1)) { 2196 kmp_uint64 tmp_time = 0; 2197 if (__itt_get_timestamp_ptr) 2198 tmp_time = __itt_get_timestamp(); 2199 // Internal fork - report frame begin 2200 master_th->th.th_frame_time = tmp_time; 2201 if (__kmp_forkjoin_frames_mode == 3) 2202 team->t.t_region_time = tmp_time; 2203 } else 2204 // only one notification scheme (either "submit" or "forking/joined", not both) 2205 #endif /* USE_ITT_NOTIFY */ 2206 if ((__itt_frame_begin_v3_ptr || KMP_ITT_DEBUG) && 2207 __kmp_forkjoin_frames && !__kmp_forkjoin_frames_mode) { 2208 // Mark start of "parallel" region for Intel(R) VTune(TM) analyzer. 
2209 __kmp_itt_region_forking(gtid, team->t.t_nproc, 0); 2210 } 2211 } 2212 #endif /* USE_ITT_BUILD */ 2213 2214 /* now go on and do the work */ 2215 KMP_DEBUG_ASSERT(team == __kmp_threads[gtid]->th.th_team); 2216 KMP_MB(); 2217 KF_TRACE(10, 2218 ("__kmp_internal_fork : root=%p, team=%p, master_th=%p, gtid=%d\n", 2219 root, team, master_th, gtid)); 2220 2221 #if USE_ITT_BUILD 2222 if (__itt_stack_caller_create_ptr) { 2223 team->t.t_stack_id = 2224 __kmp_itt_stack_caller_create(); // create new stack stitching id 2225 // before entering fork barrier 2226 } 2227 #endif /* USE_ITT_BUILD */ 2228 2229 #if OMP_40_ENABLED 2230 // AC: skip __kmp_internal_fork at teams construct, let only master 2231 // threads execute 2232 if (ap) 2233 #endif /* OMP_40_ENABLED */ 2234 { 2235 __kmp_internal_fork(loc, gtid, team); 2236 KF_TRACE(10, ("__kmp_internal_fork : after : root=%p, team=%p, " 2237 "master_th=%p, gtid=%d\n", 2238 root, team, master_th, gtid)); 2239 } 2240 2241 if (call_context == fork_context_gnu) { 2242 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2243 return TRUE; 2244 } 2245 2246 /* Invoke microtask for MASTER thread */ 2247 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) invoke microtask = %p\n", gtid, 2248 team->t.t_id, team->t.t_pkfn)); 2249 } // END of timer KMP_fork_call block 2250 2251 if (!team->t.t_invoke(gtid)) { 2252 KMP_ASSERT2(0, "cannot invoke microtask for MASTER thread"); 2253 } 2254 KA_TRACE(20, ("__kmp_fork_call: T#%d(%d:0) done microtask = %p\n", gtid, 2255 team->t.t_id, team->t.t_pkfn)); 2256 KMP_MB(); /* Flush all pending memory write invalidates. */ 2257 2258 KA_TRACE(20, ("__kmp_fork_call: parallel exit T#%d\n", gtid)); 2259 2260 #if OMPT_SUPPORT 2261 if (ompt_enabled.enabled) { 2262 master_th->th.ompt_thread_info.state = omp_state_overhead; 2263 } 2264 #endif 2265 2266 return TRUE; 2267 } 2268 2269 #if OMPT_SUPPORT 2270 static inline void __kmp_join_restore_state(kmp_info_t *thread, 2271 kmp_team_t *team) { 2272 // restore state outside the region 2273 thread->th.ompt_thread_info.state = 2274 ((team->t.t_serialized) ? 
omp_state_work_serial 2275 : omp_state_work_parallel); 2276 } 2277 2278 static inline void __kmp_join_ompt(int gtid, kmp_info_t *thread, 2279 kmp_team_t *team, ompt_data_t *parallel_data, 2280 fork_context_e fork_context, void *codeptr) { 2281 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2282 if (ompt_enabled.ompt_callback_parallel_end) { 2283 ompt_callbacks.ompt_callback(ompt_callback_parallel_end)( 2284 parallel_data, &(task_info->task_data), OMPT_INVOKER(fork_context), 2285 codeptr); 2286 } 2287 2288 task_info->frame.enter_frame = NULL; 2289 __kmp_join_restore_state(thread, team); 2290 } 2291 #endif 2292 2293 void __kmp_join_call(ident_t *loc, int gtid 2294 #if OMPT_SUPPORT 2295 , 2296 enum fork_context_e fork_context 2297 #endif 2298 #if OMP_40_ENABLED 2299 , 2300 int exit_teams 2301 #endif /* OMP_40_ENABLED */ 2302 ) { 2303 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_join_call); 2304 kmp_team_t *team; 2305 kmp_team_t *parent_team; 2306 kmp_info_t *master_th; 2307 kmp_root_t *root; 2308 int master_active; 2309 int i; 2310 2311 KA_TRACE(20, ("__kmp_join_call: enter T#%d\n", gtid)); 2312 2313 /* setup current data */ 2314 master_th = __kmp_threads[gtid]; 2315 root = master_th->th.th_root; 2316 team = master_th->th.th_team; 2317 parent_team = team->t.t_parent; 2318 2319 master_th->th.th_ident = loc; 2320 2321 #if OMPT_SUPPORT 2322 if (ompt_enabled.enabled) { 2323 master_th->th.ompt_thread_info.state = omp_state_overhead; 2324 } 2325 #endif 2326 2327 #if KMP_DEBUG 2328 if (__kmp_tasking_mode != tskm_immediate_exec && !exit_teams) { 2329 KA_TRACE(20, ("__kmp_join_call: T#%d, old team = %p old task_team = %p, " 2330 "th_task_team = %p\n", 2331 __kmp_gtid_from_thread(master_th), team, 2332 team->t.t_task_team[master_th->th.th_task_state], 2333 master_th->th.th_task_team)); 2334 KMP_DEBUG_ASSERT(master_th->th.th_task_team == 2335 team->t.t_task_team[master_th->th.th_task_state]); 2336 } 2337 #endif 2338 2339 if (team->t.t_serialized) { 2340 #if OMP_40_ENABLED 2341 if (master_th->th.th_teams_microtask) { 2342 // We are in teams construct 2343 int level = team->t.t_level; 2344 int tlevel = master_th->th.th_teams_level; 2345 if (level == tlevel) { 2346 // AC: we haven't incremented it earlier at start of teams construct, 2347 // so do it here - at the end of teams construct 2348 team->t.t_level++; 2349 } else if (level == tlevel + 1) { 2350 // AC: we are exiting parallel inside teams, need to increment 2351 // serialization in order to restore it in the next call to 2352 // __kmpc_end_serialized_parallel 2353 team->t.t_serialized++; 2354 } 2355 } 2356 #endif /* OMP_40_ENABLED */ 2357 __kmpc_end_serialized_parallel(loc, gtid); 2358 2359 #if OMPT_SUPPORT 2360 if (ompt_enabled.enabled) { 2361 __kmp_join_restore_state(master_th, parent_team); 2362 } 2363 #endif 2364 2365 return; 2366 } 2367 2368 master_active = team->t.t_master_active; 2369 2370 #if OMP_40_ENABLED 2371 if (!exit_teams) 2372 #endif /* OMP_40_ENABLED */ 2373 { 2374 // AC: No barrier for internal teams at exit from teams construct. 2375 // But there is barrier for external team (league). 
2376 __kmp_internal_join(loc, gtid, team); 2377 } 2378 #if OMP_40_ENABLED 2379 else { 2380 master_th->th.th_task_state = 2381 0; // AC: no tasking in teams (out of any parallel) 2382 } 2383 #endif /* OMP_40_ENABLED */ 2384 2385 KMP_MB(); 2386 2387 #if OMPT_SUPPORT 2388 ompt_data_t *parallel_data = &(team->t.ompt_team_info.parallel_data); 2389 void *codeptr = team->t.ompt_team_info.master_return_address; 2390 #endif 2391 2392 #if USE_ITT_BUILD 2393 if (__itt_stack_caller_create_ptr) { 2394 __kmp_itt_stack_caller_destroy( 2395 (__itt_caller)team->t 2396 .t_stack_id); // destroy the stack stitching id after join barrier 2397 } 2398 2399 // Mark end of "parallel" region for Intel(R) VTune(TM) analyzer. 2400 if (team->t.t_active_level == 1 2401 #if OMP_40_ENABLED 2402 && !master_th->th.th_teams_microtask /* not in teams construct */ 2403 #endif /* OMP_40_ENABLED */ 2404 ) { 2405 master_th->th.th_ident = loc; 2406 // only one notification scheme (either "submit" or "forking/joined", not 2407 // both) 2408 if ((__itt_frame_submit_v3_ptr || KMP_ITT_DEBUG) && 2409 __kmp_forkjoin_frames_mode == 3) 2410 __kmp_itt_frame_submit(gtid, team->t.t_region_time, 2411 master_th->th.th_frame_time, 0, loc, 2412 master_th->th.th_team_nproc, 1); 2413 else if ((__itt_frame_end_v3_ptr || KMP_ITT_DEBUG) && 2414 !__kmp_forkjoin_frames_mode && __kmp_forkjoin_frames) 2415 __kmp_itt_region_joined(gtid); 2416 } // active_level == 1 2417 #endif /* USE_ITT_BUILD */ 2418 2419 #if OMP_40_ENABLED 2420 if (master_th->th.th_teams_microtask && !exit_teams && 2421 team->t.t_pkfn != (microtask_t)__kmp_teams_master && 2422 team->t.t_level == master_th->th.th_teams_level + 1) { 2423 // AC: We need to leave the team structure intact at the end of parallel 2424 // inside the teams construct, so that at the next parallel same (hot) team 2425 // works, only adjust nesting levels 2426 2427 /* Decrement our nested depth level */ 2428 team->t.t_level--; 2429 team->t.t_active_level--; 2430 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2431 2432 /* Restore number of threads in the team if needed */ 2433 if (master_th->th.th_team_nproc < master_th->th.th_teams_size.nth) { 2434 int old_num = master_th->th.th_team_nproc; 2435 int new_num = master_th->th.th_teams_size.nth; 2436 kmp_info_t **other_threads = team->t.t_threads; 2437 team->t.t_nproc = new_num; 2438 for (i = 0; i < old_num; ++i) { 2439 other_threads[i]->th.th_team_nproc = new_num; 2440 } 2441 // Adjust states of non-used threads of the team 2442 for (i = old_num; i < new_num; ++i) { 2443 // Re-initialize thread's barrier data. 
2444 int b; 2445 kmp_balign_t *balign = other_threads[i]->th.th_bar; 2446 for (b = 0; b < bs_last_barrier; ++b) { 2447 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 2448 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 2449 #if USE_DEBUGGER 2450 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 2451 #endif 2452 } 2453 if (__kmp_tasking_mode != tskm_immediate_exec) { 2454 // Synchronize thread's task state 2455 other_threads[i]->th.th_task_state = master_th->th.th_task_state; 2456 } 2457 } 2458 } 2459 2460 #if OMPT_SUPPORT 2461 if (ompt_enabled.enabled) { 2462 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2463 codeptr); 2464 } 2465 #endif 2466 2467 return; 2468 } 2469 #endif /* OMP_40_ENABLED */ 2470 2471 /* do cleanup and restore the parent team */ 2472 master_th->th.th_info.ds.ds_tid = team->t.t_master_tid; 2473 master_th->th.th_local.this_construct = team->t.t_master_this_cons; 2474 2475 master_th->th.th_dispatch = &parent_team->t.t_dispatch[team->t.t_master_tid]; 2476 2477 /* jc: The following lock has instructions with REL and ACQ semantics, 2478 separating the parallel user code called in this parallel region 2479 from the serial user code called after this function returns. */ 2480 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2481 2482 #if OMP_40_ENABLED 2483 if (!master_th->th.th_teams_microtask || 2484 team->t.t_level > master_th->th.th_teams_level) 2485 #endif /* OMP_40_ENABLED */ 2486 { 2487 /* Decrement our nested depth level */ 2488 KMP_ATOMIC_DEC(&root->r.r_in_parallel); 2489 } 2490 KMP_DEBUG_ASSERT(root->r.r_in_parallel >= 0); 2491 2492 #if OMPT_SUPPORT 2493 if (ompt_enabled.enabled) { 2494 ompt_task_info_t *task_info = __ompt_get_task_info_object(0); 2495 if (ompt_enabled.ompt_callback_implicit_task) { 2496 int ompt_team_size = team->t.t_nproc; 2497 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 2498 ompt_scope_end, NULL, &(task_info->task_data), ompt_team_size, 2499 OMPT_CUR_TASK_INFO(master_th)->thread_num); 2500 } 2501 2502 task_info->frame.exit_frame = NULL; 2503 task_info->task_data = ompt_data_none; 2504 } 2505 #endif 2506 2507 KF_TRACE(10, ("__kmp_join_call1: T#%d, this_thread=%p team=%p\n", 0, 2508 master_th, team)); 2509 __kmp_pop_current_task_from_thread(master_th); 2510 2511 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 2512 // Restore master thread's partition. 2513 master_th->th.th_first_place = team->t.t_first_place; 2514 master_th->th.th_last_place = team->t.t_last_place; 2515 #endif /* OMP_40_ENABLED */ 2516 2517 updateHWFPControl(team); 2518 2519 if (root->r.r_active != master_active) 2520 root->r.r_active = master_active; 2521 2522 __kmp_free_team(root, team USE_NESTED_HOT_ARG( 2523 master_th)); // this will free worker threads 2524 2525 /* this race was fun to find. make sure the following is in the critical 2526 region otherwise assertions may fail occasionally since the old team may be 2527 reallocated and the hierarchy appears inconsistent. it is actually safe to 2528 run and won't cause any bugs, but will cause those assertion failures. 
it's 2529 only one deref&assign so might as well put this in the critical region */ 2530 master_th->th.th_team = parent_team; 2531 master_th->th.th_team_nproc = parent_team->t.t_nproc; 2532 master_th->th.th_team_master = parent_team->t.t_threads[0]; 2533 master_th->th.th_team_serialized = parent_team->t.t_serialized; 2534 2535 /* restore serialized team, if need be */ 2536 if (parent_team->t.t_serialized && 2537 parent_team != master_th->th.th_serial_team && 2538 parent_team != root->r.r_root_team) { 2539 __kmp_free_team(root, 2540 master_th->th.th_serial_team USE_NESTED_HOT_ARG(NULL)); 2541 master_th->th.th_serial_team = parent_team; 2542 } 2543 2544 if (__kmp_tasking_mode != tskm_immediate_exec) { 2545 if (master_th->th.th_task_state_top > 2546 0) { // Restore task state from memo stack 2547 KMP_DEBUG_ASSERT(master_th->th.th_task_state_memo_stack); 2548 // Remember master's state if we re-use this nested hot team 2549 master_th->th.th_task_state_memo_stack[master_th->th.th_task_state_top] = 2550 master_th->th.th_task_state; 2551 --master_th->th.th_task_state_top; // pop 2552 // Now restore state at this level 2553 master_th->th.th_task_state = 2554 master_th->th 2555 .th_task_state_memo_stack[master_th->th.th_task_state_top]; 2556 } 2557 // Copy the task team from the parent team to the master thread 2558 master_th->th.th_task_team = 2559 parent_team->t.t_task_team[master_th->th.th_task_state]; 2560 KA_TRACE(20, 2561 ("__kmp_join_call: Master T#%d restoring task_team %p / team %p\n", 2562 __kmp_gtid_from_thread(master_th), master_th->th.th_task_team, 2563 parent_team)); 2564 } 2565 2566 // TODO: GEH - cannot do this assertion because root thread not set up as 2567 // executing 2568 // KMP_ASSERT( master_th->th.th_current_task->td_flags.executing == 0 ); 2569 master_th->th.th_current_task->td_flags.executing = 1; 2570 2571 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2572 2573 #if OMPT_SUPPORT 2574 if (ompt_enabled.enabled) { 2575 __kmp_join_ompt(gtid, master_th, parent_team, parallel_data, fork_context, 2576 codeptr); 2577 } 2578 #endif 2579 2580 KMP_MB(); 2581 KA_TRACE(20, ("__kmp_join_call: exit T#%d\n", gtid)); 2582 } 2583 2584 /* Check whether we should push an internal control record onto the 2585 serial team stack. If so, do it. 
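   For example (hedged illustration, assuming default settings where nested
   regions are serialized):

       #pragma omp parallel num_threads(1)   // serialized outer region
       {
       #pragma omp parallel                  // serialized, t_serialized > 1
         omp_set_num_threads(3);             // pre-change ICVs pushed here
       }

   the saved record lets __kmpc_end_serialized_parallel restore the previous
   ICVs when the inner serialized region ends.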
*/ 2586 void __kmp_save_internal_controls(kmp_info_t *thread) { 2587 2588 if (thread->th.th_team != thread->th.th_serial_team) { 2589 return; 2590 } 2591 if (thread->th.th_team->t.t_serialized > 1) { 2592 int push = 0; 2593 2594 if (thread->th.th_team->t.t_control_stack_top == NULL) { 2595 push = 1; 2596 } else { 2597 if (thread->th.th_team->t.t_control_stack_top->serial_nesting_level != 2598 thread->th.th_team->t.t_serialized) { 2599 push = 1; 2600 } 2601 } 2602 if (push) { /* push a record on the serial team's stack */ 2603 kmp_internal_control_t *control = 2604 (kmp_internal_control_t *)__kmp_allocate( 2605 sizeof(kmp_internal_control_t)); 2606 2607 copy_icvs(control, &thread->th.th_current_task->td_icvs); 2608 2609 control->serial_nesting_level = thread->th.th_team->t.t_serialized; 2610 2611 control->next = thread->th.th_team->t.t_control_stack_top; 2612 thread->th.th_team->t.t_control_stack_top = control; 2613 } 2614 } 2615 } 2616 2617 /* Changes set_nproc */ 2618 void __kmp_set_num_threads(int new_nth, int gtid) { 2619 kmp_info_t *thread; 2620 kmp_root_t *root; 2621 2622 KF_TRACE(10, ("__kmp_set_num_threads: new __kmp_nth = %d\n", new_nth)); 2623 KMP_DEBUG_ASSERT(__kmp_init_serial); 2624 2625 if (new_nth < 1) 2626 new_nth = 1; 2627 else if (new_nth > __kmp_max_nth) 2628 new_nth = __kmp_max_nth; 2629 2630 KMP_COUNT_VALUE(OMP_set_numthreads, new_nth); 2631 thread = __kmp_threads[gtid]; 2632 2633 __kmp_save_internal_controls(thread); 2634 2635 set__nproc(thread, new_nth); 2636 2637 // If this omp_set_num_threads() call will cause the hot team size to be 2638 // reduced (in the absence of a num_threads clause), then reduce it now, 2639 // rather than waiting for the next parallel region. 2640 root = thread->th.th_root; 2641 if (__kmp_init_parallel && (!root->r.r_active) && 2642 (root->r.r_hot_team->t.t_nproc > new_nth) 2643 #if KMP_NESTED_HOT_TEAMS 2644 && __kmp_hot_teams_max_level && !__kmp_hot_teams_mode 2645 #endif 2646 ) { 2647 kmp_team_t *hot_team = root->r.r_hot_team; 2648 int f; 2649 2650 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 2651 2652 // Release the extra threads we don't need any more. 2653 for (f = new_nth; f < hot_team->t.t_nproc; f++) { 2654 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2655 if (__kmp_tasking_mode != tskm_immediate_exec) { 2656 // When decreasing team size, threads no longer in the team should unref 2657 // task team. 2658 hot_team->t.t_threads[f]->th.th_task_team = NULL; 2659 } 2660 __kmp_free_thread(hot_team->t.t_threads[f]); 2661 hot_team->t.t_threads[f] = NULL; 2662 } 2663 hot_team->t.t_nproc = new_nth; 2664 #if KMP_NESTED_HOT_TEAMS 2665 if (thread->th.th_hot_teams) { 2666 KMP_DEBUG_ASSERT(hot_team == thread->th.th_hot_teams[0].hot_team); 2667 thread->th.th_hot_teams[0].hot_team_nth = new_nth; 2668 } 2669 #endif 2670 2671 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 2672 2673 // Update the t_nproc field in the threads that are still active. 
2674 for (f = 0; f < new_nth; f++) { 2675 KMP_DEBUG_ASSERT(hot_team->t.t_threads[f] != NULL); 2676 hot_team->t.t_threads[f]->th.th_team_nproc = new_nth; 2677 } 2678 // Special flag in case omp_set_num_threads() call 2679 hot_team->t.t_size_changed = -1; 2680 } 2681 } 2682 2683 /* Changes max_active_levels */ 2684 void __kmp_set_max_active_levels(int gtid, int max_active_levels) { 2685 kmp_info_t *thread; 2686 2687 KF_TRACE(10, ("__kmp_set_max_active_levels: new max_active_levels for thread " 2688 "%d = (%d)\n", 2689 gtid, max_active_levels)); 2690 KMP_DEBUG_ASSERT(__kmp_init_serial); 2691 2692 // validate max_active_levels 2693 if (max_active_levels < 0) { 2694 KMP_WARNING(ActiveLevelsNegative, max_active_levels); 2695 // We ignore this call if the user has specified a negative value. 2696 // The current setting won't be changed. The last valid setting will be 2697 // used. A warning will be issued (if warnings are allowed as controlled by 2698 // the KMP_WARNINGS env var). 2699 KF_TRACE(10, ("__kmp_set_max_active_levels: the call is ignored: new " 2700 "max_active_levels for thread %d = (%d)\n", 2701 gtid, max_active_levels)); 2702 return; 2703 } 2704 if (max_active_levels <= KMP_MAX_ACTIVE_LEVELS_LIMIT) { 2705 // it's OK, the max_active_levels is within the valid range: [ 0; 2706 // KMP_MAX_ACTIVE_LEVELS_LIMIT ] 2707 // We allow a zero value. (implementation defined behavior) 2708 } else { 2709 KMP_WARNING(ActiveLevelsExceedLimit, max_active_levels, 2710 KMP_MAX_ACTIVE_LEVELS_LIMIT); 2711 max_active_levels = KMP_MAX_ACTIVE_LEVELS_LIMIT; 2712 // Current upper limit is MAX_INT. (implementation defined behavior) 2713 // If the input exceeds the upper limit, we correct the input to be the 2714 // upper limit. (implementation defined behavior) 2715 // Actually, the flow should never get here until we use MAX_INT limit. 2716 } 2717 KF_TRACE(10, ("__kmp_set_max_active_levels: after validation: new " 2718 "max_active_levels for thread %d = (%d)\n", 2719 gtid, max_active_levels)); 2720 2721 thread = __kmp_threads[gtid]; 2722 2723 __kmp_save_internal_controls(thread); 2724 2725 set__max_active_levels(thread, max_active_levels); 2726 } 2727 2728 /* Gets max_active_levels */ 2729 int __kmp_get_max_active_levels(int gtid) { 2730 kmp_info_t *thread; 2731 2732 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d\n", gtid)); 2733 KMP_DEBUG_ASSERT(__kmp_init_serial); 2734 2735 thread = __kmp_threads[gtid]; 2736 KMP_DEBUG_ASSERT(thread->th.th_current_task); 2737 KF_TRACE(10, ("__kmp_get_max_active_levels: thread %d, curtask=%p, " 2738 "curtask_maxaclevel=%d\n", 2739 gtid, thread->th.th_current_task, 2740 thread->th.th_current_task->td_icvs.max_active_levels)); 2741 return thread->th.th_current_task->td_icvs.max_active_levels; 2742 } 2743 2744 /* Changes def_sched_var ICV values (run-time schedule kind and chunk) */ 2745 void __kmp_set_schedule(int gtid, kmp_sched_t kind, int chunk) { 2746 kmp_info_t *thread; 2747 // kmp_team_t *team; 2748 2749 KF_TRACE(10, ("__kmp_set_schedule: new schedule for thread %d = (%d, %d)\n", 2750 gtid, (int)kind, chunk)); 2751 KMP_DEBUG_ASSERT(__kmp_init_serial); 2752 2753 // Check if the kind parameter is valid, correct if needed. 
2754 // Valid parameters should fit in one of two intervals - standard or extended: 2755 // <lower>, <valid>, <upper_std>, <lower_ext>, <valid>, <upper> 2756 // 2008-01-25: 0, 1 - 4, 5, 100, 101 - 102, 103 2757 if (kind <= kmp_sched_lower || kind >= kmp_sched_upper || 2758 (kind <= kmp_sched_lower_ext && kind >= kmp_sched_upper_std)) { 2759 // TODO: Hint needs attention in case we change the default schedule. 2760 __kmp_msg(kmp_ms_warning, KMP_MSG(ScheduleKindOutOfRange, kind), 2761 KMP_HNT(DefaultScheduleKindUsed, "static, no chunk"), 2762 __kmp_msg_null); 2763 kind = kmp_sched_default; 2764 chunk = 0; // ignore chunk value in case of bad kind 2765 } 2766 2767 thread = __kmp_threads[gtid]; 2768 2769 __kmp_save_internal_controls(thread); 2770 2771 if (kind < kmp_sched_upper_std) { 2772 if (kind == kmp_sched_static && chunk < KMP_DEFAULT_CHUNK) { 2773 // differ static chunked vs. unchunked: chunk should be invalid to 2774 // indicate unchunked schedule (which is the default) 2775 thread->th.th_current_task->td_icvs.sched.r_sched_type = kmp_sch_static; 2776 } else { 2777 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2778 __kmp_sch_map[kind - kmp_sched_lower - 1]; 2779 } 2780 } else { 2781 // __kmp_sch_map[ kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2782 // kmp_sched_lower - 2 ]; 2783 thread->th.th_current_task->td_icvs.sched.r_sched_type = 2784 __kmp_sch_map[kind - kmp_sched_lower_ext + kmp_sched_upper_std - 2785 kmp_sched_lower - 2]; 2786 } 2787 if (kind == kmp_sched_auto || chunk < 1) { 2788 // ignore parameter chunk for schedule auto 2789 thread->th.th_current_task->td_icvs.sched.chunk = KMP_DEFAULT_CHUNK; 2790 } else { 2791 thread->th.th_current_task->td_icvs.sched.chunk = chunk; 2792 } 2793 } 2794 2795 /* Gets def_sched_var ICV values */ 2796 void __kmp_get_schedule(int gtid, kmp_sched_t *kind, int *chunk) { 2797 kmp_info_t *thread; 2798 enum sched_type th_type; 2799 2800 KF_TRACE(10, ("__kmp_get_schedule: thread %d\n", gtid)); 2801 KMP_DEBUG_ASSERT(__kmp_init_serial); 2802 2803 thread = __kmp_threads[gtid]; 2804 2805 th_type = thread->th.th_current_task->td_icvs.sched.r_sched_type; 2806 2807 switch (th_type) { 2808 case kmp_sch_static: 2809 case kmp_sch_static_greedy: 2810 case kmp_sch_static_balanced: 2811 *kind = kmp_sched_static; 2812 *chunk = 0; // chunk was not set, try to show this fact via zero value 2813 return; 2814 case kmp_sch_static_chunked: 2815 *kind = kmp_sched_static; 2816 break; 2817 case kmp_sch_dynamic_chunked: 2818 *kind = kmp_sched_dynamic; 2819 break; 2820 case kmp_sch_guided_chunked: 2821 case kmp_sch_guided_iterative_chunked: 2822 case kmp_sch_guided_analytical_chunked: 2823 *kind = kmp_sched_guided; 2824 break; 2825 case kmp_sch_auto: 2826 *kind = kmp_sched_auto; 2827 break; 2828 case kmp_sch_trapezoidal: 2829 *kind = kmp_sched_trapezoidal; 2830 break; 2831 #if KMP_STATIC_STEAL_ENABLED 2832 case kmp_sch_static_steal: 2833 *kind = kmp_sched_static_steal; 2834 break; 2835 #endif 2836 default: 2837 KMP_FATAL(UnknownSchedulingType, th_type); 2838 } 2839 2840 *chunk = thread->th.th_current_task->td_icvs.sched.chunk; 2841 } 2842 2843 int __kmp_get_ancestor_thread_num(int gtid, int level) { 2844 2845 int ii, dd; 2846 kmp_team_t *team; 2847 kmp_info_t *thr; 2848 2849 KF_TRACE(10, ("__kmp_get_ancestor_thread_num: thread %d %d\n", gtid, level)); 2850 KMP_DEBUG_ASSERT(__kmp_init_serial); 2851 2852 // validate level 2853 if (level == 0) 2854 return 0; 2855 if (level < 0) 2856 return -1; 2857 thr = __kmp_threads[gtid]; 2858 team = thr->th.th_team; 2859 ii 
= team->t.t_level; 2860 if (level > ii) 2861 return -1; 2862 2863 #if OMP_40_ENABLED 2864 if (thr->th.th_teams_microtask) { 2865 // AC: we are in teams region where multiple nested teams have same level 2866 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2867 if (level <= 2868 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2869 KMP_DEBUG_ASSERT(ii >= tlevel); 2870 // AC: As we need to pass by the teams league, we need to artificially 2871 // increase ii 2872 if (ii == tlevel) { 2873 ii += 2; // three teams have same level 2874 } else { 2875 ii++; // two teams have same level 2876 } 2877 } 2878 } 2879 #endif 2880 2881 if (ii == level) 2882 return __kmp_tid_from_gtid(gtid); 2883 2884 dd = team->t.t_serialized; 2885 level++; 2886 while (ii > level) { 2887 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2888 } 2889 if ((team->t.t_serialized) && (!dd)) { 2890 team = team->t.t_parent; 2891 continue; 2892 } 2893 if (ii > level) { 2894 team = team->t.t_parent; 2895 dd = team->t.t_serialized; 2896 ii--; 2897 } 2898 } 2899 2900 return (dd > 1) ? (0) : (team->t.t_master_tid); 2901 } 2902 2903 int __kmp_get_team_size(int gtid, int level) { 2904 2905 int ii, dd; 2906 kmp_team_t *team; 2907 kmp_info_t *thr; 2908 2909 KF_TRACE(10, ("__kmp_get_team_size: thread %d %d\n", gtid, level)); 2910 KMP_DEBUG_ASSERT(__kmp_init_serial); 2911 2912 // validate level 2913 if (level == 0) 2914 return 1; 2915 if (level < 0) 2916 return -1; 2917 thr = __kmp_threads[gtid]; 2918 team = thr->th.th_team; 2919 ii = team->t.t_level; 2920 if (level > ii) 2921 return -1; 2922 2923 #if OMP_40_ENABLED 2924 if (thr->th.th_teams_microtask) { 2925 // AC: we are in teams region where multiple nested teams have same level 2926 int tlevel = thr->th.th_teams_level; // the level of the teams construct 2927 if (level <= 2928 tlevel) { // otherwise usual algorithm works (will not touch the teams) 2929 KMP_DEBUG_ASSERT(ii >= tlevel); 2930 // AC: As we need to pass by the teams league, we need to artificially 2931 // increase ii 2932 if (ii == tlevel) { 2933 ii += 2; // three teams have same level 2934 } else { 2935 ii++; // two teams have same level 2936 } 2937 } 2938 } 2939 #endif 2940 2941 while (ii > level) { 2942 for (dd = team->t.t_serialized; (dd > 0) && (ii > level); dd--, ii--) { 2943 } 2944 if (team->t.t_serialized && (!dd)) { 2945 team = team->t.t_parent; 2946 continue; 2947 } 2948 if (ii > level) { 2949 team = team->t.t_parent; 2950 ii--; 2951 } 2952 } 2953 2954 return team->t.t_nproc; 2955 } 2956 2957 kmp_r_sched_t __kmp_get_schedule_global() { 2958 // This routine created because pairs (__kmp_sched, __kmp_chunk) and 2959 // (__kmp_static, __kmp_guided) may be changed by kmp_set_defaults 2960 // independently. So one can get the updated schedule here. 2961 2962 kmp_r_sched_t r_sched; 2963 2964 // create schedule from 4 globals: __kmp_sched, __kmp_chunk, __kmp_static, 2965 // __kmp_guided. 
__kmp_sched should keep original value, so that user can set 2966 // KMP_SCHEDULE multiple times, and thus have different run-time schedules in 2967 // different roots (even in OMP 2.5) 2968 if (__kmp_sched == kmp_sch_static) { 2969 // replace STATIC with more detailed schedule (balanced or greedy) 2970 r_sched.r_sched_type = __kmp_static; 2971 } else if (__kmp_sched == kmp_sch_guided_chunked) { 2972 // replace GUIDED with more detailed schedule (iterative or analytical) 2973 r_sched.r_sched_type = __kmp_guided; 2974 } else { // (STATIC_CHUNKED), or (DYNAMIC_CHUNKED), or other 2975 r_sched.r_sched_type = __kmp_sched; 2976 } 2977 2978 if (__kmp_chunk < KMP_DEFAULT_CHUNK) { 2979 // __kmp_chunk may be wrong here (if it was not ever set) 2980 r_sched.chunk = KMP_DEFAULT_CHUNK; 2981 } else { 2982 r_sched.chunk = __kmp_chunk; 2983 } 2984 2985 return r_sched; 2986 } 2987 2988 /* Allocate (realloc == FALSE) * or reallocate (realloc == TRUE) 2989 at least argc number of *t_argv entries for the requested team. */ 2990 static void __kmp_alloc_argv_entries(int argc, kmp_team_t *team, int realloc) { 2991 2992 KMP_DEBUG_ASSERT(team); 2993 if (!realloc || argc > team->t.t_max_argc) { 2994 2995 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: needed entries=%d, " 2996 "current entries=%d\n", 2997 team->t.t_id, argc, (realloc) ? team->t.t_max_argc : 0)); 2998 /* if previously allocated heap space for args, free them */ 2999 if (realloc && team->t.t_argv != &team->t.t_inline_argv[0]) 3000 __kmp_free((void *)team->t.t_argv); 3001 3002 if (argc <= KMP_INLINE_ARGV_ENTRIES) { 3003 /* use unused space in the cache line for arguments */ 3004 team->t.t_max_argc = KMP_INLINE_ARGV_ENTRIES; 3005 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: inline allocate %d " 3006 "argv entries\n", 3007 team->t.t_id, team->t.t_max_argc)); 3008 team->t.t_argv = &team->t.t_inline_argv[0]; 3009 if (__kmp_storage_map) { 3010 __kmp_print_storage_map_gtid( 3011 -1, &team->t.t_inline_argv[0], 3012 &team->t.t_inline_argv[KMP_INLINE_ARGV_ENTRIES], 3013 (sizeof(void *) * KMP_INLINE_ARGV_ENTRIES), "team_%d.t_inline_argv", 3014 team->t.t_id); 3015 } 3016 } else { 3017 /* allocate space for arguments in the heap */ 3018 team->t.t_max_argc = (argc <= (KMP_MIN_MALLOC_ARGV_ENTRIES >> 1)) 3019 ? KMP_MIN_MALLOC_ARGV_ENTRIES 3020 : 2 * argc; 3021 KA_TRACE(100, ("__kmp_alloc_argv_entries: team %d: dynamic allocate %d " 3022 "argv entries\n", 3023 team->t.t_id, team->t.t_max_argc)); 3024 team->t.t_argv = 3025 (void **)__kmp_page_allocate(sizeof(void *) * team->t.t_max_argc); 3026 if (__kmp_storage_map) { 3027 __kmp_print_storage_map_gtid(-1, &team->t.t_argv[0], 3028 &team->t.t_argv[team->t.t_max_argc], 3029 sizeof(void *) * team->t.t_max_argc, 3030 "team_%d.t_argv", team->t.t_id); 3031 } 3032 } 3033 } 3034 } 3035 3036 static void __kmp_allocate_team_arrays(kmp_team_t *team, int max_nth) { 3037 int i; 3038 int num_disp_buff = max_nth > 1 ? 
__kmp_dispatch_num_buffers : 2; 3039 team->t.t_threads = 3040 (kmp_info_t **)__kmp_allocate(sizeof(kmp_info_t *) * max_nth); 3041 team->t.t_disp_buffer = (dispatch_shared_info_t *)__kmp_allocate( 3042 sizeof(dispatch_shared_info_t) * num_disp_buff); 3043 team->t.t_dispatch = 3044 (kmp_disp_t *)__kmp_allocate(sizeof(kmp_disp_t) * max_nth); 3045 team->t.t_implicit_task_taskdata = 3046 (kmp_taskdata_t *)__kmp_allocate(sizeof(kmp_taskdata_t) * max_nth); 3047 team->t.t_max_nproc = max_nth; 3048 3049 /* setup dispatch buffers */ 3050 for (i = 0; i < num_disp_buff; ++i) { 3051 team->t.t_disp_buffer[i].buffer_index = i; 3052 #if OMP_45_ENABLED 3053 team->t.t_disp_buffer[i].doacross_buf_idx = i; 3054 #endif 3055 } 3056 } 3057 3058 static void __kmp_free_team_arrays(kmp_team_t *team) { 3059 /* Note: this does not free the threads in t_threads (__kmp_free_threads) */ 3060 int i; 3061 for (i = 0; i < team->t.t_max_nproc; ++i) { 3062 if (team->t.t_dispatch[i].th_disp_buffer != NULL) { 3063 __kmp_free(team->t.t_dispatch[i].th_disp_buffer); 3064 team->t.t_dispatch[i].th_disp_buffer = NULL; 3065 } 3066 } 3067 #if KMP_USE_HIER_SCHED 3068 __kmp_dispatch_free_hierarchies(team); 3069 #endif 3070 __kmp_free(team->t.t_threads); 3071 __kmp_free(team->t.t_disp_buffer); 3072 __kmp_free(team->t.t_dispatch); 3073 __kmp_free(team->t.t_implicit_task_taskdata); 3074 team->t.t_threads = NULL; 3075 team->t.t_disp_buffer = NULL; 3076 team->t.t_dispatch = NULL; 3077 team->t.t_implicit_task_taskdata = 0; 3078 } 3079 3080 static void __kmp_reallocate_team_arrays(kmp_team_t *team, int max_nth) { 3081 kmp_info_t **oldThreads = team->t.t_threads; 3082 3083 __kmp_free(team->t.t_disp_buffer); 3084 __kmp_free(team->t.t_dispatch); 3085 __kmp_free(team->t.t_implicit_task_taskdata); 3086 __kmp_allocate_team_arrays(team, max_nth); 3087 3088 KMP_MEMCPY(team->t.t_threads, oldThreads, 3089 team->t.t_nproc * sizeof(kmp_info_t *)); 3090 3091 __kmp_free(oldThreads); 3092 } 3093 3094 static kmp_internal_control_t __kmp_get_global_icvs(void) { 3095 3096 kmp_r_sched_t r_sched = 3097 __kmp_get_schedule_global(); // get current state of scheduling globals 3098 3099 #if OMP_40_ENABLED 3100 KMP_DEBUG_ASSERT(__kmp_nested_proc_bind.used > 0); 3101 #endif /* OMP_40_ENABLED */ 3102 3103 kmp_internal_control_t g_icvs = { 3104 0, // int serial_nesting_level; //corresponds to value of th_team_serialized 3105 (kmp_int8)__kmp_dflt_nested, // int nested; //internal control 3106 // for nested parallelism (per thread) 3107 (kmp_int8)__kmp_global.g.g_dynamic, // internal control for dynamic 3108 // adjustment of threads (per thread) 3109 (kmp_int8)__kmp_env_blocktime, // int bt_set; //internal control for 3110 // whether blocktime is explicitly set 3111 __kmp_dflt_blocktime, // int blocktime; //internal control for blocktime 3112 #if KMP_USE_MONITOR 3113 __kmp_bt_intervals, // int bt_intervals; //internal control for blocktime 3114 // intervals 3115 #endif 3116 __kmp_dflt_team_nth, // int nproc; //internal control for # of threads for 3117 // next parallel region (per thread) 3118 // (use a max ub on value if __kmp_parallel_initialize not called yet) 3119 __kmp_dflt_max_active_levels, // int max_active_levels; //internal control 3120 // for max_active_levels 3121 r_sched, // kmp_r_sched_t sched; //internal control for runtime schedule 3122 // {sched,chunk} pair 3123 #if OMP_40_ENABLED 3124 __kmp_nested_proc_bind.bind_types[0], 3125 __kmp_default_device, 3126 #endif /* OMP_40_ENABLED */ 3127 NULL // struct kmp_internal_control *next; 3128 }; 3129 3130 return 
g_icvs; 3131 } 3132 3133 static kmp_internal_control_t __kmp_get_x_global_icvs(const kmp_team_t *team) { 3134 3135 kmp_internal_control_t gx_icvs; 3136 gx_icvs.serial_nesting_level = 3137 0; // probably =team->t.t_serial like in save_inter_controls 3138 copy_icvs(&gx_icvs, &team->t.t_threads[0]->th.th_current_task->td_icvs); 3139 gx_icvs.next = NULL; 3140 3141 return gx_icvs; 3142 } 3143 3144 static void __kmp_initialize_root(kmp_root_t *root) { 3145 int f; 3146 kmp_team_t *root_team; 3147 kmp_team_t *hot_team; 3148 int hot_team_max_nth; 3149 kmp_r_sched_t r_sched = 3150 __kmp_get_schedule_global(); // get current state of scheduling globals 3151 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3152 KMP_DEBUG_ASSERT(root); 3153 KMP_ASSERT(!root->r.r_begin); 3154 3155 /* setup the root state structure */ 3156 __kmp_init_lock(&root->r.r_begin_lock); 3157 root->r.r_begin = FALSE; 3158 root->r.r_active = FALSE; 3159 root->r.r_in_parallel = 0; 3160 root->r.r_blocktime = __kmp_dflt_blocktime; 3161 root->r.r_nested = __kmp_dflt_nested; 3162 root->r.r_cg_nthreads = 1; 3163 3164 /* setup the root team for this task */ 3165 /* allocate the root team structure */ 3166 KF_TRACE(10, ("__kmp_initialize_root: before root_team\n")); 3167 3168 root_team = 3169 __kmp_allocate_team(root, 3170 1, // new_nproc 3171 1, // max_nproc 3172 #if OMPT_SUPPORT 3173 ompt_data_none, // root parallel id 3174 #endif 3175 #if OMP_40_ENABLED 3176 __kmp_nested_proc_bind.bind_types[0], 3177 #endif 3178 &r_icvs, 3179 0 // argc 3180 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3181 ); 3182 #if USE_DEBUGGER 3183 // Non-NULL value should be assigned to make the debugger display the root 3184 // team. 3185 TCW_SYNC_PTR(root_team->t.t_pkfn, (microtask_t)(~0)); 3186 #endif 3187 3188 KF_TRACE(10, ("__kmp_initialize_root: after root_team = %p\n", root_team)); 3189 3190 root->r.r_root_team = root_team; 3191 root_team->t.t_control_stack_top = NULL; 3192 3193 /* initialize root team */ 3194 root_team->t.t_threads[0] = NULL; 3195 root_team->t.t_nproc = 1; 3196 root_team->t.t_serialized = 1; 3197 // TODO???: root_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3198 root_team->t.t_sched.sched = r_sched.sched; 3199 KA_TRACE( 3200 20, 3201 ("__kmp_initialize_root: init root team %d arrived: join=%u, plain=%u\n", 3202 root_team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 3203 3204 /* setup the hot team for this task */ 3205 /* allocate the hot team structure */ 3206 KF_TRACE(10, ("__kmp_initialize_root: before hot_team\n")); 3207 3208 hot_team = 3209 __kmp_allocate_team(root, 3210 1, // new_nproc 3211 __kmp_dflt_team_nth_ub * 2, // max_nproc 3212 #if OMPT_SUPPORT 3213 ompt_data_none, // root parallel id 3214 #endif 3215 #if OMP_40_ENABLED 3216 __kmp_nested_proc_bind.bind_types[0], 3217 #endif 3218 &r_icvs, 3219 0 // argc 3220 USE_NESTED_HOT_ARG(NULL) // master thread is unknown 3221 ); 3222 KF_TRACE(10, ("__kmp_initialize_root: after hot_team = %p\n", hot_team)); 3223 3224 root->r.r_hot_team = hot_team; 3225 root_team->t.t_control_stack_top = NULL; 3226 3227 /* first-time initialization */ 3228 hot_team->t.t_parent = root_team; 3229 3230 /* initialize hot team */ 3231 hot_team_max_nth = hot_team->t.t_max_nproc; 3232 for (f = 0; f < hot_team_max_nth; ++f) { 3233 hot_team->t.t_threads[f] = NULL; 3234 } 3235 hot_team->t.t_nproc = 1; 3236 // TODO???: hot_team->t.t_max_active_levels = __kmp_dflt_max_active_levels; 3237 hot_team->t.t_sched.sched = r_sched.sched; 3238 hot_team->t.t_size_changed = 0; 3239 } 
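/* Illustrative note, not part of the runtime: each root keeps two teams. The
   root team is the serialized team of the initial thread (t_nproc == 1),
   while the hot team is kept alive between parallel regions so that, for
   example,

       #pragma omp parallel    // first region: hot team workers are created
       { ... }
       #pragma omp parallel    // second region: the same hot team is reused
       { ... }

   pays the worker-creation cost only once (a sketch of the intent, assuming
   default settings and no intervening shrink via omp_set_num_threads()). */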
3240 3241 #ifdef KMP_DEBUG 3242 3243 typedef struct kmp_team_list_item { 3244 kmp_team_p const *entry; 3245 struct kmp_team_list_item *next; 3246 } kmp_team_list_item_t; 3247 typedef kmp_team_list_item_t *kmp_team_list_t; 3248 3249 static void __kmp_print_structure_team_accum( // Add team to list of teams. 3250 kmp_team_list_t list, // List of teams. 3251 kmp_team_p const *team // Team to add. 3252 ) { 3253 3254 // List must terminate with item where both entry and next are NULL. 3255 // Team is added to the list only once. 3256 // List is sorted in ascending order by team id. 3257 // Team id is *not* a key. 3258 3259 kmp_team_list_t l; 3260 3261 KMP_DEBUG_ASSERT(list != NULL); 3262 if (team == NULL) { 3263 return; 3264 } 3265 3266 __kmp_print_structure_team_accum(list, team->t.t_parent); 3267 __kmp_print_structure_team_accum(list, team->t.t_next_pool); 3268 3269 // Search list for the team. 3270 l = list; 3271 while (l->next != NULL && l->entry != team) { 3272 l = l->next; 3273 } 3274 if (l->next != NULL) { 3275 return; // Team has been added before, exit. 3276 } 3277 3278 // Team is not found. Search list again for insertion point. 3279 l = list; 3280 while (l->next != NULL && l->entry->t.t_id <= team->t.t_id) { 3281 l = l->next; 3282 } 3283 3284 // Insert team. 3285 { 3286 kmp_team_list_item_t *item = (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC( 3287 sizeof(kmp_team_list_item_t)); 3288 *item = *l; 3289 l->entry = team; 3290 l->next = item; 3291 } 3292 } 3293 3294 static void __kmp_print_structure_team(char const *title, kmp_team_p const *team 3295 3296 ) { 3297 __kmp_printf("%s", title); 3298 if (team != NULL) { 3299 __kmp_printf("%2x %p\n", team->t.t_id, team); 3300 } else { 3301 __kmp_printf(" - (nil)\n"); 3302 } 3303 } 3304 3305 static void __kmp_print_structure_thread(char const *title, 3306 kmp_info_p const *thread) { 3307 __kmp_printf("%s", title); 3308 if (thread != NULL) { 3309 __kmp_printf("%2d %p\n", thread->th.th_info.ds.ds_gtid, thread); 3310 } else { 3311 __kmp_printf(" - (nil)\n"); 3312 } 3313 } 3314 3315 void __kmp_print_structure(void) { 3316 3317 kmp_team_list_t list; 3318 3319 // Initialize list of teams. 3320 list = 3321 (kmp_team_list_item_t *)KMP_INTERNAL_MALLOC(sizeof(kmp_team_list_item_t)); 3322 list->entry = NULL; 3323 list->next = NULL; 3324 3325 __kmp_printf("\n------------------------------\nGlobal Thread " 3326 "Table\n------------------------------\n"); 3327 { 3328 int gtid; 3329 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3330 __kmp_printf("%2d", gtid); 3331 if (__kmp_threads != NULL) { 3332 __kmp_printf(" %p", __kmp_threads[gtid]); 3333 } 3334 if (__kmp_root != NULL) { 3335 __kmp_printf(" %p", __kmp_root[gtid]); 3336 } 3337 __kmp_printf("\n"); 3338 } 3339 } 3340 3341 // Print out __kmp_threads array. 
3342 __kmp_printf("\n------------------------------\nThreads\n--------------------" 3343 "----------\n"); 3344 if (__kmp_threads != NULL) { 3345 int gtid; 3346 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3347 kmp_info_t const *thread = __kmp_threads[gtid]; 3348 if (thread != NULL) { 3349 __kmp_printf("GTID %2d %p:\n", gtid, thread); 3350 __kmp_printf(" Our Root: %p\n", thread->th.th_root); 3351 __kmp_print_structure_team(" Our Team: ", thread->th.th_team); 3352 __kmp_print_structure_team(" Serial Team: ", 3353 thread->th.th_serial_team); 3354 __kmp_printf(" Threads: %2d\n", thread->th.th_team_nproc); 3355 __kmp_print_structure_thread(" Master: ", 3356 thread->th.th_team_master); 3357 __kmp_printf(" Serialized?: %2d\n", thread->th.th_team_serialized); 3358 __kmp_printf(" Set NProc: %2d\n", thread->th.th_set_nproc); 3359 #if OMP_40_ENABLED 3360 __kmp_printf(" Set Proc Bind: %2d\n", thread->th.th_set_proc_bind); 3361 #endif 3362 __kmp_print_structure_thread(" Next in pool: ", 3363 thread->th.th_next_pool); 3364 __kmp_printf("\n"); 3365 __kmp_print_structure_team_accum(list, thread->th.th_team); 3366 __kmp_print_structure_team_accum(list, thread->th.th_serial_team); 3367 } 3368 } 3369 } else { 3370 __kmp_printf("Threads array is not allocated.\n"); 3371 } 3372 3373 // Print out __kmp_root array. 3374 __kmp_printf("\n------------------------------\nUbers\n----------------------" 3375 "--------\n"); 3376 if (__kmp_root != NULL) { 3377 int gtid; 3378 for (gtid = 0; gtid < __kmp_threads_capacity; ++gtid) { 3379 kmp_root_t const *root = __kmp_root[gtid]; 3380 if (root != NULL) { 3381 __kmp_printf("GTID %2d %p:\n", gtid, root); 3382 __kmp_print_structure_team(" Root Team: ", root->r.r_root_team); 3383 __kmp_print_structure_team(" Hot Team: ", root->r.r_hot_team); 3384 __kmp_print_structure_thread(" Uber Thread: ", 3385 root->r.r_uber_thread); 3386 __kmp_printf(" Active?: %2d\n", root->r.r_active); 3387 __kmp_printf(" Nested?: %2d\n", root->r.r_nested); 3388 __kmp_printf(" In Parallel: %2d\n", 3389 KMP_ATOMIC_LD_RLX(&root->r.r_in_parallel)); 3390 __kmp_printf("\n"); 3391 __kmp_print_structure_team_accum(list, root->r.r_root_team); 3392 __kmp_print_structure_team_accum(list, root->r.r_hot_team); 3393 } 3394 } 3395 } else { 3396 __kmp_printf("Ubers array is not allocated.\n"); 3397 } 3398 3399 __kmp_printf("\n------------------------------\nTeams\n----------------------" 3400 "--------\n"); 3401 while (list->next != NULL) { 3402 kmp_team_p const *team = list->entry; 3403 int i; 3404 __kmp_printf("Team %2x %p:\n", team->t.t_id, team); 3405 __kmp_print_structure_team(" Parent Team: ", team->t.t_parent); 3406 __kmp_printf(" Master TID: %2d\n", team->t.t_master_tid); 3407 __kmp_printf(" Max threads: %2d\n", team->t.t_max_nproc); 3408 __kmp_printf(" Levels of serial: %2d\n", team->t.t_serialized); 3409 __kmp_printf(" Number threads: %2d\n", team->t.t_nproc); 3410 for (i = 0; i < team->t.t_nproc; ++i) { 3411 __kmp_printf(" Thread %2d: ", i); 3412 __kmp_print_structure_thread("", team->t.t_threads[i]); 3413 } 3414 __kmp_print_structure_team(" Next in pool: ", team->t.t_next_pool); 3415 __kmp_printf("\n"); 3416 list = list->next; 3417 } 3418 3419 // Print out __kmp_thread_pool and __kmp_team_pool. 
3420 __kmp_printf("\n------------------------------\nPools\n----------------------" 3421 "--------\n"); 3422 __kmp_print_structure_thread("Thread pool: ", 3423 CCAST(kmp_info_t *, __kmp_thread_pool)); 3424 __kmp_print_structure_team("Team pool: ", 3425 CCAST(kmp_team_t *, __kmp_team_pool)); 3426 __kmp_printf("\n"); 3427 3428 // Free team list. 3429 while (list != NULL) { 3430 kmp_team_list_item_t *item = list; 3431 list = list->next; 3432 KMP_INTERNAL_FREE(item); 3433 } 3434 } 3435 3436 #endif 3437 3438 //--------------------------------------------------------------------------- 3439 // Stuff for per-thread fast random number generator 3440 // Table of primes 3441 static const unsigned __kmp_primes[] = { 3442 0x9e3779b1, 0xffe6cc59, 0x2109f6dd, 0x43977ab5, 0xba5703f5, 0xb495a877, 3443 0xe1626741, 0x79695e6b, 0xbc98c09f, 0xd5bee2b3, 0x287488f9, 0x3af18231, 3444 0x9677cd4d, 0xbe3a6929, 0xadc6a877, 0xdcf0674b, 0xbe4d6fe9, 0x5f15e201, 3445 0x99afc3fd, 0xf3f16801, 0xe222cfff, 0x24ba5fdb, 0x0620452d, 0x79f149e3, 3446 0xc8b93f49, 0x972702cd, 0xb07dd827, 0x6c97d5ed, 0x085a3d61, 0x46eb5ea7, 3447 0x3d9910ed, 0x2e687b5b, 0x29609227, 0x6eb081f1, 0x0954c4e1, 0x9d114db9, 3448 0x542acfa9, 0xb3e6bd7b, 0x0742d917, 0xe9f3ffa7, 0x54581edb, 0xf2480f45, 3449 0x0bb9288f, 0xef1affc7, 0x85fa0ca7, 0x3ccc14db, 0xe6baf34b, 0x343377f7, 3450 0x5ca19031, 0xe6d9293b, 0xf0a9f391, 0x5d2e980b, 0xfc411073, 0xc3749363, 3451 0xb892d829, 0x3549366b, 0x629750ad, 0xb98294e5, 0x892d9483, 0xc235baf3, 3452 0x3d2402a3, 0x6bdef3c9, 0xbec333cd, 0x40c9520f}; 3453 3454 //--------------------------------------------------------------------------- 3455 // __kmp_get_random: Get a random number using a linear congruential method. 3456 unsigned short __kmp_get_random(kmp_info_t *thread) { 3457 unsigned x = thread->th.th_x; 3458 unsigned short r = x >> 16; 3459 3460 thread->th.th_x = x * thread->th.th_a + 1; 3461 3462 KA_TRACE(30, ("__kmp_get_random: THREAD: %d, RETURN: %u\n", 3463 thread->th.th_info.ds.ds_tid, r)); 3464 3465 return r; 3466 } 3467 //-------------------------------------------------------- 3468 // __kmp_init_random: Initialize a random number generator 3469 void __kmp_init_random(kmp_info_t *thread) { 3470 unsigned seed = thread->th.th_info.ds.ds_tid; 3471 3472 thread->th.th_a = 3473 __kmp_primes[seed % (sizeof(__kmp_primes) / sizeof(__kmp_primes[0]))]; 3474 thread->th.th_x = (seed + 1) * thread->th.th_a + 1; 3475 KA_TRACE(30, 3476 ("__kmp_init_random: THREAD: %u; A: %u\n", seed, thread->th.th_a)); 3477 } 3478 3479 #if KMP_OS_WINDOWS 3480 /* reclaim array entries for root threads that are already dead, returns number 3481 * reclaimed */ 3482 static int __kmp_reclaim_dead_roots(void) { 3483 int i, r = 0; 3484 3485 for (i = 0; i < __kmp_threads_capacity; ++i) { 3486 if (KMP_UBER_GTID(i) && 3487 !__kmp_still_running((kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[i])) && 3488 !__kmp_root[i] 3489 ->r.r_active) { // AC: reclaim only roots died in non-active state 3490 r += __kmp_unregister_root_other_thread(i); 3491 } 3492 } 3493 return r; 3494 } 3495 #endif 3496 3497 /* This function attempts to create free entries in __kmp_threads and 3498 __kmp_root, and returns the number of free entries generated. 3499 3500 For Windows* OS static library, the first mechanism used is to reclaim array 3501 entries for root threads that are already dead. 3502 3503 On all platforms, expansion is attempted on the arrays __kmp_threads_ and 3504 __kmp_root, with appropriate update to __kmp_threads_capacity. 
Array 3505 capacity is increased by doubling with clipping to __kmp_tp_capacity, if 3506 threadprivate cache array has been created. Synchronization with 3507 __kmpc_threadprivate_cached is done using __kmp_tp_cached_lock. 3508 3509 After any dead root reclamation, if the clipping value allows array expansion 3510 to result in the generation of a total of nNeed free slots, the function does 3511 that expansion. If not, nothing is done beyond the possible initial root 3512 thread reclamation. 3513 3514 If any argument is negative, the behavior is undefined. */ 3515 static int __kmp_expand_threads(int nNeed) { 3516 int added = 0; 3517 int minimumRequiredCapacity; 3518 int newCapacity; 3519 kmp_info_t **newThreads; 3520 kmp_root_t **newRoot; 3521 3522 // All calls to __kmp_expand_threads should be under __kmp_forkjoin_lock, so 3523 // resizing __kmp_threads does not need additional protection if foreign 3524 // threads are present 3525 3526 #if KMP_OS_WINDOWS && !defined KMP_DYNAMIC_LIB 3527 /* only for Windows static library */ 3528 /* reclaim array entries for root threads that are already dead */ 3529 added = __kmp_reclaim_dead_roots(); 3530 3531 if (nNeed) { 3532 nNeed -= added; 3533 if (nNeed < 0) 3534 nNeed = 0; 3535 } 3536 #endif 3537 if (nNeed <= 0) 3538 return added; 3539 3540 // Note that __kmp_threads_capacity is not bounded by __kmp_max_nth. If 3541 // __kmp_max_nth is set to some value less than __kmp_sys_max_nth by the 3542 // user via KMP_DEVICE_THREAD_LIMIT, then __kmp_threads_capacity may become 3543 // > __kmp_max_nth in one of two ways: 3544 // 3545 // 1) The initialization thread (gtid = 0) exits. __kmp_threads[0] 3546 // may not be resused by another thread, so we may need to increase 3547 // __kmp_threads_capacity to __kmp_max_nth + 1. 3548 // 3549 // 2) New foreign root(s) are encountered. We always register new foreign 3550 // roots. This may cause a smaller # of threads to be allocated at 3551 // subsequent parallel regions, but the worker threads hang around (and 3552 // eventually go to sleep) and need slots in the __kmp_threads[] array. 3553 // 3554 // Anyway, that is the reason for moving the check to see if 3555 // __kmp_max_nth was exceeded into __kmp_reserve_threads() 3556 // instead of having it performed here. -BB 3557 3558 KMP_DEBUG_ASSERT(__kmp_sys_max_nth >= __kmp_threads_capacity); 3559 3560 /* compute expansion headroom to check if we can expand */ 3561 if (__kmp_sys_max_nth - __kmp_threads_capacity < nNeed) { 3562 /* possible expansion too small -- give up */ 3563 return added; 3564 } 3565 minimumRequiredCapacity = __kmp_threads_capacity + nNeed; 3566 3567 newCapacity = __kmp_threads_capacity; 3568 do { 3569 newCapacity = newCapacity <= (__kmp_sys_max_nth >> 1) ? 
(newCapacity << 1) 3570 : __kmp_sys_max_nth; 3571 } while (newCapacity < minimumRequiredCapacity); 3572 newThreads = (kmp_info_t **)__kmp_allocate( 3573 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * newCapacity + CACHE_LINE); 3574 newRoot = 3575 (kmp_root_t **)((char *)newThreads + sizeof(kmp_info_t *) * newCapacity); 3576 KMP_MEMCPY(newThreads, __kmp_threads, 3577 __kmp_threads_capacity * sizeof(kmp_info_t *)); 3578 KMP_MEMCPY(newRoot, __kmp_root, 3579 __kmp_threads_capacity * sizeof(kmp_root_t *)); 3580 3581 kmp_info_t **temp_threads = __kmp_threads; 3582 *(kmp_info_t * *volatile *)&__kmp_threads = newThreads; 3583 *(kmp_root_t * *volatile *)&__kmp_root = newRoot; 3584 __kmp_free(temp_threads); 3585 added += newCapacity - __kmp_threads_capacity; 3586 *(volatile int *)&__kmp_threads_capacity = newCapacity; 3587 3588 if (newCapacity > __kmp_tp_capacity) { 3589 __kmp_acquire_bootstrap_lock(&__kmp_tp_cached_lock); 3590 if (__kmp_tp_cached && newCapacity > __kmp_tp_capacity) { 3591 __kmp_threadprivate_resize_cache(newCapacity); 3592 } else { // increase __kmp_tp_capacity to correspond with kmp_threads size 3593 *(volatile int *)&__kmp_tp_capacity = newCapacity; 3594 } 3595 __kmp_release_bootstrap_lock(&__kmp_tp_cached_lock); 3596 } 3597 3598 return added; 3599 } 3600 3601 /* Register the current thread as a root thread and obtain our gtid. We must 3602 have the __kmp_initz_lock held at this point. Argument TRUE only if we are the 3603 thread that calls from __kmp_do_serial_initialize() */ 3604 int __kmp_register_root(int initial_thread) { 3605 kmp_info_t *root_thread; 3606 kmp_root_t *root; 3607 int gtid; 3608 int capacity; 3609 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3610 KA_TRACE(20, ("__kmp_register_root: entered\n")); 3611 KMP_MB(); 3612 3613 /* 2007-03-02: 3614 If initial thread did not invoke OpenMP RTL yet, and this thread is not an 3615 initial one, "__kmp_all_nth >= __kmp_threads_capacity" condition does not 3616 work as expected -- it may return false (that means there is at least one 3617 empty slot in __kmp_threads array), but it is possible the only free slot 3618 is #0, which is reserved for initial thread and so cannot be used for this 3619 one. The following code works around this bug. 3620 3621 However, the right solution seems to be not reserving slot #0 for initial 3622 thread because: 3623 (1) there is no magic in slot #0, 3624 (2) we cannot detect initial thread reliably (the first thread which does 3625 serial initialization may not be a real initial thread). 3626 */ 3627 capacity = __kmp_threads_capacity; 3628 if (!initial_thread && TCR_PTR(__kmp_threads[0]) == NULL) { 3629 --capacity; 3630 } 3631 3632 /* see if there are too many threads */ 3633 if (__kmp_all_nth >= capacity && !__kmp_expand_threads(1)) { 3634 if (__kmp_tp_cached) { 3635 __kmp_fatal(KMP_MSG(CantRegisterNewThread), 3636 KMP_HNT(Set_ALL_THREADPRIVATE, __kmp_tp_capacity), 3637 KMP_HNT(PossibleSystemLimitOnThreads), __kmp_msg_null); 3638 } else { 3639 __kmp_fatal(KMP_MSG(CantRegisterNewThread), KMP_HNT(SystemLimitOnThreads), 3640 __kmp_msg_null); 3641 } 3642 } 3643 3644 /* find an available thread slot */ 3645 /* Don't reassign the zero slot since we need that to only be used by initial 3646 thread */ 3647 for (gtid = (initial_thread ?
0 : 1); TCR_PTR(__kmp_threads[gtid]) != NULL; 3648 gtid++) 3649 ; 3650 KA_TRACE(1, 3651 ("__kmp_register_root: found slot in threads array: T#%d\n", gtid)); 3652 KMP_ASSERT(gtid < __kmp_threads_capacity); 3653 3654 /* update global accounting */ 3655 __kmp_all_nth++; 3656 TCW_4(__kmp_nth, __kmp_nth + 1); 3657 3658 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 3659 // numbers of procs, and method #2 (keyed API call) for higher numbers. 3660 if (__kmp_adjust_gtid_mode) { 3661 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 3662 if (TCR_4(__kmp_gtid_mode) != 2) { 3663 TCW_4(__kmp_gtid_mode, 2); 3664 } 3665 } else { 3666 if (TCR_4(__kmp_gtid_mode) != 1) { 3667 TCW_4(__kmp_gtid_mode, 1); 3668 } 3669 } 3670 } 3671 3672 #ifdef KMP_ADJUST_BLOCKTIME 3673 /* Adjust blocktime to zero if necessary */ 3674 /* Middle initialization might not have occurred yet */ 3675 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 3676 if (__kmp_nth > __kmp_avail_proc) { 3677 __kmp_zero_bt = TRUE; 3678 } 3679 } 3680 #endif /* KMP_ADJUST_BLOCKTIME */ 3681 3682 /* setup this new hierarchy */ 3683 if (!(root = __kmp_root[gtid])) { 3684 root = __kmp_root[gtid] = (kmp_root_t *)__kmp_allocate(sizeof(kmp_root_t)); 3685 KMP_DEBUG_ASSERT(!root->r.r_root_team); 3686 } 3687 3688 #if KMP_STATS_ENABLED 3689 // Initialize stats as soon as possible (right after gtid assignment). 3690 __kmp_stats_thread_ptr = __kmp_stats_list->push_back(gtid); 3691 __kmp_stats_thread_ptr->startLife(); 3692 KMP_SET_THREAD_STATE(SERIAL_REGION); 3693 KMP_INIT_PARTITIONED_TIMERS(OMP_serial); 3694 #endif 3695 __kmp_initialize_root(root); 3696 3697 /* setup new root thread structure */ 3698 if (root->r.r_uber_thread) { 3699 root_thread = root->r.r_uber_thread; 3700 } else { 3701 root_thread = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 3702 if (__kmp_storage_map) { 3703 __kmp_print_thread_storage_map(root_thread, gtid); 3704 } 3705 root_thread->th.th_info.ds.ds_gtid = gtid; 3706 #if OMPT_SUPPORT 3707 root_thread->th.ompt_thread_info.thread_data.ptr = NULL; 3708 #endif 3709 root_thread->th.th_root = root; 3710 if (__kmp_env_consistency_check) { 3711 root_thread->th.th_cons = __kmp_allocate_cons_stack(gtid); 3712 } 3713 #if USE_FAST_MEMORY 3714 __kmp_initialize_fast_memory(root_thread); 3715 #endif /* USE_FAST_MEMORY */ 3716 3717 #if KMP_USE_BGET 3718 KMP_DEBUG_ASSERT(root_thread->th.th_local.bget_data == NULL); 3719 __kmp_initialize_bget(root_thread); 3720 #endif 3721 __kmp_init_random(root_thread); // Initialize random number generator 3722 } 3723 3724 /* setup the serial team held in reserve by the root thread */ 3725 if (!root_thread->th.th_serial_team) { 3726 kmp_internal_control_t r_icvs = __kmp_get_global_icvs(); 3727 KF_TRACE(10, ("__kmp_register_root: before serial_team\n")); 3728 root_thread->th.th_serial_team = 3729 __kmp_allocate_team(root, 1, 1, 3730 #if OMPT_SUPPORT 3731 ompt_data_none, // root parallel id 3732 #endif 3733 #if OMP_40_ENABLED 3734 proc_bind_default, 3735 #endif 3736 &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 3737 } 3738 KMP_ASSERT(root_thread->th.th_serial_team); 3739 KF_TRACE(10, ("__kmp_register_root: after serial_team = %p\n", 3740 root_thread->th.th_serial_team)); 3741 3742 /* drop root_thread into place */ 3743 TCW_SYNC_PTR(__kmp_threads[gtid], root_thread); 3744 3745 root->r.r_root_team->t.t_threads[0] = root_thread; 3746 root->r.r_hot_team->t.t_threads[0] = root_thread; 3747 root_thread->th.th_serial_team->t.t_threads[0] = root_thread; 3748 // AC: the team created in reserve, not for 
execution (it is unused for now). 3749 root_thread->th.th_serial_team->t.t_serialized = 0; 3750 root->r.r_uber_thread = root_thread; 3751 3752 /* initialize the thread, get it ready to go */ 3753 __kmp_initialize_info(root_thread, root->r.r_root_team, 0, gtid); 3754 TCW_4(__kmp_init_gtid, TRUE); 3755 3756 /* prepare the master thread for get_gtid() */ 3757 __kmp_gtid_set_specific(gtid); 3758 3759 #if USE_ITT_BUILD 3760 __kmp_itt_thread_name(gtid); 3761 #endif /* USE_ITT_BUILD */ 3762 3763 #ifdef KMP_TDATA_GTID 3764 __kmp_gtid = gtid; 3765 #endif 3766 __kmp_create_worker(gtid, root_thread, __kmp_stksize); 3767 KMP_DEBUG_ASSERT(__kmp_gtid_get_specific() == gtid); 3768 3769 KA_TRACE(20, ("__kmp_register_root: T#%d init T#%d(%d:%d) arrived: join=%u, " 3770 "plain=%u\n", 3771 gtid, __kmp_gtid_from_tid(0, root->r.r_hot_team), 3772 root->r.r_hot_team->t.t_id, 0, KMP_INIT_BARRIER_STATE, 3773 KMP_INIT_BARRIER_STATE)); 3774 { // Initialize barrier data. 3775 int b; 3776 for (b = 0; b < bs_last_barrier; ++b) { 3777 root_thread->th.th_bar[b].bb.b_arrived = KMP_INIT_BARRIER_STATE; 3778 #if USE_DEBUGGER 3779 root_thread->th.th_bar[b].bb.b_worker_arrived = 0; 3780 #endif 3781 } 3782 } 3783 KMP_DEBUG_ASSERT(root->r.r_hot_team->t.t_bar[bs_forkjoin_barrier].b_arrived == 3784 KMP_INIT_BARRIER_STATE); 3785 3786 #if KMP_AFFINITY_SUPPORTED 3787 #if OMP_40_ENABLED 3788 root_thread->th.th_current_place = KMP_PLACE_UNDEFINED; 3789 root_thread->th.th_new_place = KMP_PLACE_UNDEFINED; 3790 root_thread->th.th_first_place = KMP_PLACE_UNDEFINED; 3791 root_thread->th.th_last_place = KMP_PLACE_UNDEFINED; 3792 #endif 3793 3794 if (TCR_4(__kmp_init_middle)) { 3795 __kmp_affinity_set_init_mask(gtid, TRUE); 3796 } 3797 #endif /* KMP_AFFINITY_SUPPORTED */ 3798 3799 __kmp_root_counter++; 3800 3801 #if OMPT_SUPPORT 3802 if (!initial_thread && ompt_enabled.enabled) { 3803 3804 ompt_thread_t *root_thread = ompt_get_thread(); 3805 3806 ompt_set_thread_state(root_thread, omp_state_overhead); 3807 3808 if (ompt_enabled.ompt_callback_thread_begin) { 3809 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 3810 ompt_thread_initial, __ompt_get_thread_data_internal()); 3811 } 3812 ompt_data_t *task_data; 3813 __ompt_get_task_info_internal(0, NULL, &task_data, NULL, NULL, NULL); 3814 if (ompt_enabled.ompt_callback_task_create) { 3815 ompt_callbacks.ompt_callback(ompt_callback_task_create)( 3816 NULL, NULL, task_data, ompt_task_initial, 0, NULL); 3817 // initial task has nothing to return to 3818 } 3819 3820 ompt_set_thread_state(root_thread, omp_state_work_serial); 3821 } 3822 #endif 3823 3824 KMP_MB(); 3825 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3826 3827 return gtid; 3828 } 3829 3830 #if KMP_NESTED_HOT_TEAMS 3831 static int __kmp_free_hot_teams(kmp_root_t *root, kmp_info_t *thr, int level, 3832 const int max_level) { 3833 int i, n, nth; 3834 kmp_hot_team_ptr_t *hot_teams = thr->th.th_hot_teams; 3835 if (!hot_teams || !hot_teams[level].hot_team) { 3836 return 0; 3837 } 3838 KMP_DEBUG_ASSERT(level < max_level); 3839 kmp_team_t *team = hot_teams[level].hot_team; 3840 nth = hot_teams[level].hot_team_nth; 3841 n = nth - 1; // master is not freed 3842 if (level < max_level - 1) { 3843 for (i = 0; i < nth; ++i) { 3844 kmp_info_t *th = team->t.t_threads[i]; 3845 n += __kmp_free_hot_teams(root, th, level + 1, max_level); 3846 if (i > 0 && th->th.th_hot_teams) { 3847 __kmp_free(th->th.th_hot_teams); 3848 th->th.th_hot_teams = NULL; 3849 } 3850 } 3851 } 3852 __kmp_free_team(root, team, NULL); 3853 return n; 3854 } 3855 #endif 
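// NOTE (informal summary of __kmp_free_hot_teams() above): the helper walks the
// nested hot teams hanging off a thread and returns how many __kmp_threads entries
// it released. At each level it counts nth - 1, because the master of that hot team
// belongs to (and is reaped by) the level above it, not here. For example, a call at
// level == max_level - 1 on a team whose hot_team_nth == 4 recurses no deeper and
// returns 3.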
3856 3857 // Resets a root thread and clears its root and hot teams. 3858 // Returns the number of __kmp_threads entries directly and indirectly freed. 3859 static int __kmp_reset_root(int gtid, kmp_root_t *root) { 3860 kmp_team_t *root_team = root->r.r_root_team; 3861 kmp_team_t *hot_team = root->r.r_hot_team; 3862 int n = hot_team->t.t_nproc; 3863 int i; 3864 3865 KMP_DEBUG_ASSERT(!root->r.r_active); 3866 3867 root->r.r_root_team = NULL; 3868 root->r.r_hot_team = NULL; 3869 // __kmp_free_team() does not free hot teams, so we have to clear r_hot_team 3870 // before call to __kmp_free_team(). 3871 __kmp_free_team(root, root_team USE_NESTED_HOT_ARG(NULL)); 3872 #if KMP_NESTED_HOT_TEAMS 3873 if (__kmp_hot_teams_max_level > 3874 0) { // need to free nested hot teams and their threads if any 3875 for (i = 0; i < hot_team->t.t_nproc; ++i) { 3876 kmp_info_t *th = hot_team->t.t_threads[i]; 3877 if (__kmp_hot_teams_max_level > 1) { 3878 n += __kmp_free_hot_teams(root, th, 1, __kmp_hot_teams_max_level); 3879 } 3880 if (th->th.th_hot_teams) { 3881 __kmp_free(th->th.th_hot_teams); 3882 th->th.th_hot_teams = NULL; 3883 } 3884 } 3885 } 3886 #endif 3887 __kmp_free_team(root, hot_team USE_NESTED_HOT_ARG(NULL)); 3888 3889 // Before we can reap the thread, we need to make certain that all other 3890 // threads in the teams that had this root as ancestor have stopped trying to 3891 // steal tasks. 3892 if (__kmp_tasking_mode != tskm_immediate_exec) { 3893 __kmp_wait_to_unref_task_teams(); 3894 } 3895 3896 #if KMP_OS_WINDOWS 3897 /* Close Handle of root duplicated in __kmp_create_worker (tr #62919) */ 3898 KA_TRACE( 3899 10, ("__kmp_reset_root: free handle, th = %p, handle = %" KMP_UINTPTR_SPEC 3900 "\n", 3901 (LPVOID) & (root->r.r_uber_thread->th), 3902 root->r.r_uber_thread->th.th_info.ds.ds_thread)); 3903 __kmp_free_handle(root->r.r_uber_thread->th.th_info.ds.ds_thread); 3904 #endif /* KMP_OS_WINDOWS */ 3905 3906 #if OMPT_SUPPORT 3907 if (ompt_enabled.ompt_callback_thread_end) { 3908 ompt_callbacks.ompt_callback(ompt_callback_thread_end)( 3909 &(root->r.r_uber_thread->th.ompt_thread_info.thread_data)); 3910 } 3911 #endif 3912 3913 TCW_4(__kmp_nth, 3914 __kmp_nth - 1); // __kmp_reap_thread will decrement __kmp_all_nth. 3915 root->r.r_cg_nthreads--; 3916 3917 __kmp_reap_thread(root->r.r_uber_thread, 1); 3918 3919 // We cannot put the root thread into __kmp_thread_pool, so we have to reap it 3920 // instead of freeing it. 3921 root->r.r_uber_thread = NULL; 3922 /* mark root as no longer in use */ 3923 root->r.r_begin = FALSE; 3924 3925 return n; 3926 } 3927 3928 void __kmp_unregister_root_current_thread(int gtid) { 3929 KA_TRACE(1, ("__kmp_unregister_root_current_thread: enter T#%d\n", gtid)); 3930 /* this lock should be ok, since unregister_root_current_thread is never 3931 called during an abort, only during a normal close.
furthermore, if you 3932 have the forkjoin lock, you should never try to get the initz lock */ 3933 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 3934 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 3935 KC_TRACE(10, ("__kmp_unregister_root_current_thread: already finished, " 3936 "exiting T#%d\n", 3937 gtid)); 3938 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3939 return; 3940 } 3941 kmp_root_t *root = __kmp_root[gtid]; 3942 3943 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3944 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3945 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3946 KMP_ASSERT(root->r.r_active == FALSE); 3947 3948 KMP_MB(); 3949 3950 #if OMP_45_ENABLED 3951 kmp_info_t *thread = __kmp_threads[gtid]; 3952 kmp_team_t *team = thread->th.th_team; 3953 kmp_task_team_t *task_team = thread->th.th_task_team; 3954 3955 // we need to wait for the proxy tasks before finishing the thread 3956 if (task_team != NULL && task_team->tt.tt_found_proxy_tasks) { 3957 #if OMPT_SUPPORT 3958 // the runtime is shutting down so we won't report any events 3959 thread->th.ompt_thread_info.state = omp_state_undefined; 3960 #endif 3961 __kmp_task_team_wait(thread, team USE_ITT_BUILD_ARG(NULL)); 3962 } 3963 #endif 3964 3965 __kmp_reset_root(gtid, root); 3966 3967 /* free up this thread slot */ 3968 __kmp_gtid_set_specific(KMP_GTID_DNE); 3969 #ifdef KMP_TDATA_GTID 3970 __kmp_gtid = KMP_GTID_DNE; 3971 #endif 3972 3973 KMP_MB(); 3974 KC_TRACE(10, 3975 ("__kmp_unregister_root_current_thread: T#%d unregistered\n", gtid)); 3976 3977 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 3978 } 3979 3980 #if KMP_OS_WINDOWS 3981 /* __kmp_forkjoin_lock must be already held 3982 Unregisters a root thread that is not the current thread. Returns the number 3983 of __kmp_threads entries freed as a result. */ 3984 static int __kmp_unregister_root_other_thread(int gtid) { 3985 kmp_root_t *root = __kmp_root[gtid]; 3986 int r; 3987 3988 KA_TRACE(1, ("__kmp_unregister_root_other_thread: enter T#%d\n", gtid)); 3989 KMP_DEBUG_ASSERT(__kmp_threads && __kmp_threads[gtid]); 3990 KMP_ASSERT(KMP_UBER_GTID(gtid)); 3991 KMP_ASSERT(root == __kmp_threads[gtid]->th.th_root); 3992 KMP_ASSERT(root->r.r_active == FALSE); 3993 3994 r = __kmp_reset_root(gtid, root); 3995 KC_TRACE(10, 3996 ("__kmp_unregister_root_other_thread: T#%d unregistered\n", gtid)); 3997 return r; 3998 } 3999 #endif 4000 4001 #if KMP_DEBUG 4002 void __kmp_task_info() { 4003 4004 kmp_int32 gtid = __kmp_entry_gtid(); 4005 kmp_int32 tid = __kmp_tid_from_gtid(gtid); 4006 kmp_info_t *this_thr = __kmp_threads[gtid]; 4007 kmp_team_t *steam = this_thr->th.th_serial_team; 4008 kmp_team_t *team = this_thr->th.th_team; 4009 4010 __kmp_printf( 4011 "__kmp_task_info: gtid=%d tid=%d t_thread=%p team=%p steam=%p curtask=%p " 4012 "ptask=%p\n", 4013 gtid, tid, this_thr, team, steam, this_thr->th.th_current_task, 4014 team->t.t_implicit_task_taskdata[tid].td_parent); 4015 } 4016 #endif // KMP_DEBUG 4017 4018 /* TODO optimize with one big memclr, take out what isn't needed, split 4019 responsibility to workers as much as possible, and delay initialization of 4020 features as much as possible */ 4021 static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team, 4022 int tid, int gtid) { 4023 /* this_thr->th.th_info.ds.ds_gtid is setup in 4024 kmp_allocate_thread/create_worker. 
4025 this_thr->th.th_serial_team is setup in __kmp_allocate_thread */ 4026 kmp_info_t *master = team->t.t_threads[0]; 4027 KMP_DEBUG_ASSERT(this_thr != NULL); 4028 KMP_DEBUG_ASSERT(this_thr->th.th_serial_team); 4029 KMP_DEBUG_ASSERT(team); 4030 KMP_DEBUG_ASSERT(team->t.t_threads); 4031 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4032 KMP_DEBUG_ASSERT(master); 4033 KMP_DEBUG_ASSERT(master->th.th_root); 4034 4035 KMP_MB(); 4036 4037 TCW_SYNC_PTR(this_thr->th.th_team, team); 4038 4039 this_thr->th.th_info.ds.ds_tid = tid; 4040 this_thr->th.th_set_nproc = 0; 4041 if (__kmp_tasking_mode != tskm_immediate_exec) 4042 // When tasking is possible, threads are not safe to reap until they are 4043 // done tasking; this will be set when tasking code is exited in wait 4044 this_thr->th.th_reap_state = KMP_NOT_SAFE_TO_REAP; 4045 else // no tasking --> always safe to reap 4046 this_thr->th.th_reap_state = KMP_SAFE_TO_REAP; 4047 #if OMP_40_ENABLED 4048 this_thr->th.th_set_proc_bind = proc_bind_default; 4049 #if KMP_AFFINITY_SUPPORTED 4050 this_thr->th.th_new_place = this_thr->th.th_current_place; 4051 #endif 4052 #endif 4053 this_thr->th.th_root = master->th.th_root; 4054 4055 /* setup the thread's cache of the team structure */ 4056 this_thr->th.th_team_nproc = team->t.t_nproc; 4057 this_thr->th.th_team_master = master; 4058 this_thr->th.th_team_serialized = team->t.t_serialized; 4059 TCW_PTR(this_thr->th.th_sleep_loc, NULL); 4060 4061 KMP_DEBUG_ASSERT(team->t.t_implicit_task_taskdata); 4062 4063 KF_TRACE(10, ("__kmp_initialize_info1: T#%d:%d this_thread=%p curtask=%p\n", 4064 tid, gtid, this_thr, this_thr->th.th_current_task)); 4065 4066 __kmp_init_implicit_task(this_thr->th.th_team_master->th.th_ident, this_thr, 4067 team, tid, TRUE); 4068 4069 KF_TRACE(10, ("__kmp_initialize_info2: T#%d:%d this_thread=%p curtask=%p\n", 4070 tid, gtid, this_thr, this_thr->th.th_current_task)); 4071 // TODO: Initialize ICVs from parent; GEH - isn't that already done in 4072 // __kmp_initialize_team()? 4073 4074 /* TODO no worksharing in speculative threads */ 4075 this_thr->th.th_dispatch = &team->t.t_dispatch[tid]; 4076 4077 this_thr->th.th_local.this_construct = 0; 4078 4079 if (!this_thr->th.th_pri_common) { 4080 this_thr->th.th_pri_common = 4081 (struct common_table *)__kmp_allocate(sizeof(struct common_table)); 4082 if (__kmp_storage_map) { 4083 __kmp_print_storage_map_gtid( 4084 gtid, this_thr->th.th_pri_common, this_thr->th.th_pri_common + 1, 4085 sizeof(struct common_table), "th_%d.th_pri_common\n", gtid); 4086 } 4087 this_thr->th.th_pri_head = NULL; 4088 } 4089 4090 /* Initialize dynamic dispatch */ 4091 { 4092 volatile kmp_disp_t *dispatch = this_thr->th.th_dispatch; 4093 // Use team max_nproc since this will never change for the team. 4094 size_t disp_size = 4095 sizeof(dispatch_private_info_t) * 4096 (team->t.t_max_nproc == 1 ? 1 : __kmp_dispatch_num_buffers); 4097 KD_TRACE(10, ("__kmp_initialize_info: T#%d max_nproc: %d\n", gtid, 4098 team->t.t_max_nproc)); 4099 KMP_ASSERT(dispatch); 4100 KMP_DEBUG_ASSERT(team->t.t_dispatch); 4101 KMP_DEBUG_ASSERT(dispatch == &team->t.t_dispatch[tid]); 4102 4103 dispatch->th_disp_index = 0; 4104 #if OMP_45_ENABLED 4105 dispatch->th_doacross_buf_idx = 0; 4106 #endif 4107 if (!dispatch->th_disp_buffer) { 4108 dispatch->th_disp_buffer = 4109 (dispatch_private_info_t *)__kmp_allocate(disp_size); 4110 4111 if (__kmp_storage_map) { 4112 __kmp_print_storage_map_gtid( 4113 gtid, &dispatch->th_disp_buffer[0], 4114 &dispatch->th_disp_buffer[team->t.t_max_nproc == 1 4115 ? 
1 4116 : __kmp_dispatch_num_buffers], 4117 disp_size, "th_%d.th_dispatch.th_disp_buffer " 4118 "(team_%d.t_dispatch[%d].th_disp_buffer)", 4119 gtid, team->t.t_id, gtid); 4120 } 4121 } else { 4122 memset(&dispatch->th_disp_buffer[0], '\0', disp_size); 4123 } 4124 4125 dispatch->th_dispatch_pr_current = 0; 4126 dispatch->th_dispatch_sh_current = 0; 4127 4128 dispatch->th_deo_fcn = 0; /* ORDERED */ 4129 dispatch->th_dxo_fcn = 0; /* END ORDERED */ 4130 } 4131 4132 this_thr->th.th_next_pool = NULL; 4133 4134 if (!this_thr->th.th_task_state_memo_stack) { 4135 size_t i; 4136 this_thr->th.th_task_state_memo_stack = 4137 (kmp_uint8 *)__kmp_allocate(4 * sizeof(kmp_uint8)); 4138 this_thr->th.th_task_state_top = 0; 4139 this_thr->th.th_task_state_stack_sz = 4; 4140 for (i = 0; i < this_thr->th.th_task_state_stack_sz; 4141 ++i) // zero init the stack 4142 this_thr->th.th_task_state_memo_stack[i] = 0; 4143 } 4144 4145 KMP_DEBUG_ASSERT(!this_thr->th.th_spin_here); 4146 KMP_DEBUG_ASSERT(this_thr->th.th_next_waiting == 0); 4147 4148 KMP_MB(); 4149 } 4150 4151 /* allocate a new thread for the requesting team. this is only called from 4152 within a forkjoin critical section. we will first try to get an available 4153 thread from the thread pool. if none is available, we will fork a new one 4154 assuming we are able to create a new one. this should be assured, as the 4155 caller should check on this first. */ 4156 kmp_info_t *__kmp_allocate_thread(kmp_root_t *root, kmp_team_t *team, 4157 int new_tid) { 4158 kmp_team_t *serial_team; 4159 kmp_info_t *new_thr; 4160 int new_gtid; 4161 4162 KA_TRACE(20, ("__kmp_allocate_thread: T#%d\n", __kmp_get_gtid())); 4163 KMP_DEBUG_ASSERT(root && team); 4164 #if !KMP_NESTED_HOT_TEAMS 4165 KMP_DEBUG_ASSERT(KMP_MASTER_GTID(__kmp_get_gtid())); 4166 #endif 4167 KMP_MB(); 4168 4169 /* first, try to get one from the thread pool */ 4170 if (__kmp_thread_pool) { 4171 4172 new_thr = CCAST(kmp_info_t *, __kmp_thread_pool); 4173 __kmp_thread_pool = (volatile kmp_info_t *)new_thr->th.th_next_pool; 4174 if (new_thr == __kmp_thread_pool_insert_pt) { 4175 __kmp_thread_pool_insert_pt = NULL; 4176 } 4177 TCW_4(new_thr->th.th_in_pool, FALSE); 4178 // Don't touch th_active_in_pool or th_active. 4179 // The worker thread adjusts those flags as it sleeps/awakens. 4180 __kmp_thread_pool_nth--; 4181 4182 KA_TRACE(20, ("__kmp_allocate_thread: T#%d using thread T#%d\n", 4183 __kmp_get_gtid(), new_thr->th.th_info.ds.ds_gtid)); 4184 KMP_ASSERT(!new_thr->th.th_team); 4185 KMP_DEBUG_ASSERT(__kmp_nth < __kmp_threads_capacity); 4186 KMP_DEBUG_ASSERT(__kmp_thread_pool_nth >= 0); 4187 4188 /* setup the thread structure */ 4189 __kmp_initialize_info(new_thr, team, new_tid, 4190 new_thr->th.th_info.ds.ds_gtid); 4191 KMP_DEBUG_ASSERT(new_thr->th.th_serial_team); 4192 4193 TCW_4(__kmp_nth, __kmp_nth + 1); 4194 root->r.r_cg_nthreads++; 4195 4196 new_thr->th.th_task_state = 0; 4197 new_thr->th.th_task_state_top = 0; 4198 new_thr->th.th_task_state_stack_sz = 4; 4199 4200 #ifdef KMP_ADJUST_BLOCKTIME 4201 /* Adjust blocktime back to zero if necessary */ 4202 /* Middle initialization might not have occurred yet */ 4203 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4204 if (__kmp_nth > __kmp_avail_proc) { 4205 __kmp_zero_bt = TRUE; 4206 } 4207 } 4208 #endif /* KMP_ADJUST_BLOCKTIME */ 4209 4210 #if KMP_DEBUG 4211 // If thread entered pool via __kmp_free_thread, wait_flag should != 4212 // KMP_BARRIER_PARENT_FLAG. 
4213 int b; 4214 kmp_balign_t *balign = new_thr->th.th_bar; 4215 for (b = 0; b < bs_last_barrier; ++b) 4216 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 4217 #endif 4218 4219 KF_TRACE(10, ("__kmp_allocate_thread: T#%d using thread %p T#%d\n", 4220 __kmp_get_gtid(), new_thr, new_thr->th.th_info.ds.ds_gtid)); 4221 4222 KMP_MB(); 4223 return new_thr; 4224 } 4225 4226 /* no, well fork a new one */ 4227 KMP_ASSERT(__kmp_nth == __kmp_all_nth); 4228 KMP_ASSERT(__kmp_all_nth < __kmp_threads_capacity); 4229 4230 #if KMP_USE_MONITOR 4231 // If this is the first worker thread the RTL is creating, then also 4232 // launch the monitor thread. We try to do this as early as possible. 4233 if (!TCR_4(__kmp_init_monitor)) { 4234 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 4235 if (!TCR_4(__kmp_init_monitor)) { 4236 KF_TRACE(10, ("before __kmp_create_monitor\n")); 4237 TCW_4(__kmp_init_monitor, 1); 4238 __kmp_create_monitor(&__kmp_monitor); 4239 KF_TRACE(10, ("after __kmp_create_monitor\n")); 4240 #if KMP_OS_WINDOWS 4241 // AC: wait until monitor has started. This is a fix for CQ232808. 4242 // The reason is that if the library is loaded/unloaded in a loop with 4243 // small (parallel) work in between, then there is high probability that 4244 // monitor thread started after the library shutdown. At shutdown it is 4245 // too late to cope with the problem, because when the master is in 4246 // DllMain (process detach) the monitor has no chances to start (it is 4247 // blocked), and master has no means to inform the monitor that the 4248 // library has gone, because all the memory which the monitor can access 4249 // is going to be released/reset. 4250 while (TCR_4(__kmp_init_monitor) < 2) { 4251 KMP_YIELD(TRUE); 4252 } 4253 KF_TRACE(10, ("after monitor thread has started\n")); 4254 #endif 4255 } 4256 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 4257 } 4258 #endif 4259 4260 KMP_MB(); 4261 for (new_gtid = 1; TCR_PTR(__kmp_threads[new_gtid]) != NULL; ++new_gtid) { 4262 KMP_DEBUG_ASSERT(new_gtid < __kmp_threads_capacity); 4263 } 4264 4265 /* allocate space for it. */ 4266 new_thr = (kmp_info_t *)__kmp_allocate(sizeof(kmp_info_t)); 4267 4268 TCW_SYNC_PTR(__kmp_threads[new_gtid], new_thr); 4269 4270 if (__kmp_storage_map) { 4271 __kmp_print_thread_storage_map(new_thr, new_gtid); 4272 } 4273 4274 // add the reserve serialized team, initialized from the team's master thread 4275 { 4276 kmp_internal_control_t r_icvs = __kmp_get_x_global_icvs(team); 4277 KF_TRACE(10, ("__kmp_allocate_thread: before th_serial/serial_team\n")); 4278 new_thr->th.th_serial_team = serial_team = 4279 (kmp_team_t *)__kmp_allocate_team(root, 1, 1, 4280 #if OMPT_SUPPORT 4281 ompt_data_none, // root parallel id 4282 #endif 4283 #if OMP_40_ENABLED 4284 proc_bind_default, 4285 #endif 4286 &r_icvs, 0 USE_NESTED_HOT_ARG(NULL)); 4287 } 4288 KMP_ASSERT(serial_team); 4289 serial_team->t.t_serialized = 0; // AC: the team created in reserve, not for 4290 // execution (it is unused for now). 
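// NOTE: mirroring the root path in __kmp_register_root(), each newly forked worker
// gets its own reserve serial team here, seeded from the parent team's ICVs via
// __kmp_get_x_global_icvs(team) above. The reserve team is not used for execution
// yet (t_serialized was just reset to 0); informally, it gives the thread a team
// structure that can be reused later, e.g. for serialized nested parallel regions,
// without having to allocate one on that path.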
4291 serial_team->t.t_threads[0] = new_thr; 4292 KF_TRACE(10, 4293 ("__kmp_allocate_thread: after th_serial/serial_team : new_thr=%p\n", 4294 new_thr)); 4295 4296 /* setup the thread structures */ 4297 __kmp_initialize_info(new_thr, team, new_tid, new_gtid); 4298 4299 #if USE_FAST_MEMORY 4300 __kmp_initialize_fast_memory(new_thr); 4301 #endif /* USE_FAST_MEMORY */ 4302 4303 #if KMP_USE_BGET 4304 KMP_DEBUG_ASSERT(new_thr->th.th_local.bget_data == NULL); 4305 __kmp_initialize_bget(new_thr); 4306 #endif 4307 4308 __kmp_init_random(new_thr); // Initialize random number generator 4309 4310 /* Initialize these only once when thread is grabbed for a team allocation */ 4311 KA_TRACE(20, 4312 ("__kmp_allocate_thread: T#%d init go fork=%u, plain=%u\n", 4313 __kmp_get_gtid(), KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 4314 4315 int b; 4316 kmp_balign_t *balign = new_thr->th.th_bar; 4317 for (b = 0; b < bs_last_barrier; ++b) { 4318 balign[b].bb.b_go = KMP_INIT_BARRIER_STATE; 4319 balign[b].bb.team = NULL; 4320 balign[b].bb.wait_flag = KMP_BARRIER_NOT_WAITING; 4321 balign[b].bb.use_oncore_barrier = 0; 4322 } 4323 4324 new_thr->th.th_spin_here = FALSE; 4325 new_thr->th.th_next_waiting = 0; 4326 #if KMP_OS_UNIX 4327 new_thr->th.th_blocking = false; 4328 #endif 4329 4330 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 4331 new_thr->th.th_current_place = KMP_PLACE_UNDEFINED; 4332 new_thr->th.th_new_place = KMP_PLACE_UNDEFINED; 4333 new_thr->th.th_first_place = KMP_PLACE_UNDEFINED; 4334 new_thr->th.th_last_place = KMP_PLACE_UNDEFINED; 4335 #endif 4336 4337 TCW_4(new_thr->th.th_in_pool, FALSE); 4338 new_thr->th.th_active_in_pool = FALSE; 4339 TCW_4(new_thr->th.th_active, TRUE); 4340 4341 /* adjust the global counters */ 4342 __kmp_all_nth++; 4343 __kmp_nth++; 4344 4345 root->r.r_cg_nthreads++; 4346 4347 // if __kmp_adjust_gtid_mode is set, then we use method #1 (sp search) for low 4348 // numbers of procs, and method #2 (keyed API call) for higher numbers. 4349 if (__kmp_adjust_gtid_mode) { 4350 if (__kmp_all_nth >= __kmp_tls_gtid_min) { 4351 if (TCR_4(__kmp_gtid_mode) != 2) { 4352 TCW_4(__kmp_gtid_mode, 2); 4353 } 4354 } else { 4355 if (TCR_4(__kmp_gtid_mode) != 1) { 4356 TCW_4(__kmp_gtid_mode, 1); 4357 } 4358 } 4359 } 4360 4361 #ifdef KMP_ADJUST_BLOCKTIME 4362 /* Adjust blocktime back to zero if necessary */ 4363 /* Middle initialization might not have occurred yet */ 4364 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 4365 if (__kmp_nth > __kmp_avail_proc) { 4366 __kmp_zero_bt = TRUE; 4367 } 4368 } 4369 #endif /* KMP_ADJUST_BLOCKTIME */ 4370 4371 /* actually fork it and create the new worker thread */ 4372 KF_TRACE( 4373 10, ("__kmp_allocate_thread: before __kmp_create_worker: %p\n", new_thr)); 4374 __kmp_create_worker(new_gtid, new_thr, __kmp_stksize); 4375 KF_TRACE(10, 4376 ("__kmp_allocate_thread: after __kmp_create_worker: %p\n", new_thr)); 4377 4378 KA_TRACE(20, ("__kmp_allocate_thread: T#%d forked T#%d\n", __kmp_get_gtid(), 4379 new_gtid)); 4380 KMP_MB(); 4381 return new_thr; 4382 } 4383 4384 /* Reinitialize team for reuse. 4385 The hot team code calls this case at every fork barrier, so EPCC barrier 4386 test are extremely sensitive to changes in it, esp. writes to the team 4387 struct, which cause a cache invalidation in all threads. 4388 IF YOU TOUCH THIS ROUTINE, RUN EPCC C SYNCBENCH ON A BIG-IRON MACHINE!!! 
*/ 4389 static void __kmp_reinitialize_team(kmp_team_t *team, 4390 kmp_internal_control_t *new_icvs, 4391 ident_t *loc) { 4392 KF_TRACE(10, ("__kmp_reinitialize_team: enter this_thread=%p team=%p\n", 4393 team->t.t_threads[0], team)); 4394 KMP_DEBUG_ASSERT(team && new_icvs); 4395 KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc); 4396 KMP_CHECK_UPDATE(team->t.t_ident, loc); 4397 4398 KMP_CHECK_UPDATE(team->t.t_id, KMP_GEN_TEAM_ID()); 4399 // Copy ICVs to the master thread's implicit taskdata 4400 __kmp_init_implicit_task(loc, team->t.t_threads[0], team, 0, FALSE); 4401 copy_icvs(&team->t.t_implicit_task_taskdata[0].td_icvs, new_icvs); 4402 4403 KF_TRACE(10, ("__kmp_reinitialize_team: exit this_thread=%p team=%p\n", 4404 team->t.t_threads[0], team)); 4405 } 4406 4407 /* Initialize the team data structure. 4408 This assumes the t_threads and t_max_nproc are already set. 4409 Also, we don't touch the arguments */ 4410 static void __kmp_initialize_team(kmp_team_t *team, int new_nproc, 4411 kmp_internal_control_t *new_icvs, 4412 ident_t *loc) { 4413 KF_TRACE(10, ("__kmp_initialize_team: enter: team=%p\n", team)); 4414 4415 /* verify */ 4416 KMP_DEBUG_ASSERT(team); 4417 KMP_DEBUG_ASSERT(new_nproc <= team->t.t_max_nproc); 4418 KMP_DEBUG_ASSERT(team->t.t_threads); 4419 KMP_MB(); 4420 4421 team->t.t_master_tid = 0; /* not needed */ 4422 /* team->t.t_master_bar; not needed */ 4423 team->t.t_serialized = new_nproc > 1 ? 0 : 1; 4424 team->t.t_nproc = new_nproc; 4425 4426 /* team->t.t_parent = NULL; TODO not needed & would mess up hot team */ 4427 team->t.t_next_pool = NULL; 4428 /* memset( team->t.t_threads, 0, sizeof(kmp_info_t*)*new_nproc ); would mess 4429 * up hot team */ 4430 4431 TCW_SYNC_PTR(team->t.t_pkfn, NULL); /* not needed */ 4432 team->t.t_invoke = NULL; /* not needed */ 4433 4434 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4435 team->t.t_sched.sched = new_icvs->sched.sched; 4436 4437 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 4438 team->t.t_fp_control_saved = FALSE; /* not needed */ 4439 team->t.t_x87_fpu_control_word = 0; /* not needed */ 4440 team->t.t_mxcsr = 0; /* not needed */ 4441 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 4442 4443 team->t.t_construct = 0; 4444 4445 team->t.t_ordered.dt.t_value = 0; 4446 team->t.t_master_active = FALSE; 4447 4448 memset(&team->t.t_taskq, '\0', sizeof(kmp_taskq_t)); 4449 4450 #ifdef KMP_DEBUG 4451 team->t.t_copypriv_data = NULL; /* not necessary, but nice for debugging */ 4452 #endif 4453 #if KMP_OS_WINDOWS 4454 team->t.t_copyin_counter = 0; /* for barrier-free copyin implementation */ 4455 #endif 4456 4457 team->t.t_control_stack_top = NULL; 4458 4459 __kmp_reinitialize_team(team, new_icvs, loc); 4460 4461 KMP_MB(); 4462 KF_TRACE(10, ("__kmp_initialize_team: exit: team=%p\n", team)); 4463 } 4464 4465 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 4466 /* Sets full mask for thread and returns old mask, no changes to structures. 
*/ 4467 static void 4468 __kmp_set_thread_affinity_mask_full_tmp(kmp_affin_mask_t *old_mask) { 4469 if (KMP_AFFINITY_CAPABLE()) { 4470 int status; 4471 if (old_mask != NULL) { 4472 status = __kmp_get_system_affinity(old_mask, TRUE); 4473 int error = errno; 4474 if (status != 0) { 4475 __kmp_fatal(KMP_MSG(ChangeThreadAffMaskError), KMP_ERR(error), 4476 __kmp_msg_null); 4477 } 4478 } 4479 __kmp_set_system_affinity(__kmp_affin_fullMask, TRUE); 4480 } 4481 } 4482 #endif 4483 4484 #if OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED 4485 4486 // __kmp_partition_places() is the heart of the OpenMP 4.0 affinity mechanism. 4487 // It calculates the worker + master thread's partition based upon the parent 4488 // thread's partition, and binds each worker to a thread in their partition. 4489 // The master thread's partition should already include its current binding. 4490 static void __kmp_partition_places(kmp_team_t *team, int update_master_only) { 4491 // Copy the master thread's place partition to the team struct 4492 kmp_info_t *master_th = team->t.t_threads[0]; 4493 KMP_DEBUG_ASSERT(master_th != NULL); 4494 kmp_proc_bind_t proc_bind = team->t.t_proc_bind; 4495 int first_place = master_th->th.th_first_place; 4496 int last_place = master_th->th.th_last_place; 4497 int masters_place = master_th->th.th_current_place; 4498 team->t.t_first_place = first_place; 4499 team->t.t_last_place = last_place; 4500 4501 KA_TRACE(20, ("__kmp_partition_places: enter: proc_bind = %d T#%d(%d:0) " 4502 "bound to place %d partition = [%d,%d]\n", 4503 proc_bind, __kmp_gtid_from_thread(team->t.t_threads[0]), 4504 team->t.t_id, masters_place, first_place, last_place)); 4505 4506 switch (proc_bind) { 4507 4508 case proc_bind_default: 4509 // serial teams might have the proc_bind policy set to proc_bind_default.
It 4510 // doesn't matter, as we don't rebind master thread for any proc_bind policy 4511 KMP_DEBUG_ASSERT(team->t.t_nproc == 1); 4512 break; 4513 4514 case proc_bind_master: { 4515 int f; 4516 int n_th = team->t.t_nproc; 4517 for (f = 1; f < n_th; f++) { 4518 kmp_info_t *th = team->t.t_threads[f]; 4519 KMP_DEBUG_ASSERT(th != NULL); 4520 th->th.th_first_place = first_place; 4521 th->th.th_last_place = last_place; 4522 th->th.th_new_place = masters_place; 4523 4524 KA_TRACE(100, ("__kmp_partition_places: master: T#%d(%d:%d) place %d " 4525 "partition = [%d,%d]\n", 4526 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4527 f, masters_place, first_place, last_place)); 4528 } 4529 } break; 4530 4531 case proc_bind_close: { 4532 int f; 4533 int n_th = team->t.t_nproc; 4534 int n_places; 4535 if (first_place <= last_place) { 4536 n_places = last_place - first_place + 1; 4537 } else { 4538 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4539 } 4540 if (n_th <= n_places) { 4541 int place = masters_place; 4542 for (f = 1; f < n_th; f++) { 4543 kmp_info_t *th = team->t.t_threads[f]; 4544 KMP_DEBUG_ASSERT(th != NULL); 4545 4546 if (place == last_place) { 4547 place = first_place; 4548 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4549 place = 0; 4550 } else { 4551 place++; 4552 } 4553 th->th.th_first_place = first_place; 4554 th->th.th_last_place = last_place; 4555 th->th.th_new_place = place; 4556 4557 KA_TRACE(100, ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4558 "partition = [%d,%d]\n", 4559 __kmp_gtid_from_thread(team->t.t_threads[f]), 4560 team->t.t_id, f, place, first_place, last_place)); 4561 } 4562 } else { 4563 int S, rem, gap, s_count; 4564 S = n_th / n_places; 4565 s_count = 0; 4566 rem = n_th - (S * n_places); 4567 gap = rem > 0 ? 
n_places / rem : n_places; 4568 int place = masters_place; 4569 int gap_ct = gap; 4570 for (f = 0; f < n_th; f++) { 4571 kmp_info_t *th = team->t.t_threads[f]; 4572 KMP_DEBUG_ASSERT(th != NULL); 4573 4574 th->th.th_first_place = first_place; 4575 th->th.th_last_place = last_place; 4576 th->th.th_new_place = place; 4577 s_count++; 4578 4579 if ((s_count == S) && rem && (gap_ct == gap)) { 4580 // do nothing, add an extra thread to place on next iteration 4581 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4582 // we added an extra thread to this place; move to next place 4583 if (place == last_place) { 4584 place = first_place; 4585 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4586 place = 0; 4587 } else { 4588 place++; 4589 } 4590 s_count = 0; 4591 gap_ct = 1; 4592 rem--; 4593 } else if (s_count == S) { // place full; don't add extra 4594 if (place == last_place) { 4595 place = first_place; 4596 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4597 place = 0; 4598 } else { 4599 place++; 4600 } 4601 gap_ct++; 4602 s_count = 0; 4603 } 4604 4605 KA_TRACE(100, 4606 ("__kmp_partition_places: close: T#%d(%d:%d) place %d " 4607 "partition = [%d,%d]\n", 4608 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, f, 4609 th->th.th_new_place, first_place, last_place)); 4610 } 4611 KMP_DEBUG_ASSERT(place == masters_place); 4612 } 4613 } break; 4614 4615 case proc_bind_spread: { 4616 int f; 4617 int n_th = team->t.t_nproc; 4618 int n_places; 4619 int thidx; 4620 if (first_place <= last_place) { 4621 n_places = last_place - first_place + 1; 4622 } else { 4623 n_places = __kmp_affinity_num_masks - first_place + last_place + 1; 4624 } 4625 if (n_th <= n_places) { 4626 int place = -1; 4627 4628 if (n_places != static_cast<int>(__kmp_affinity_num_masks)) { 4629 int S = n_places / n_th; 4630 int s_count, rem, gap, gap_ct; 4631 4632 place = masters_place; 4633 rem = n_places - n_th * S; 4634 gap = rem ? n_th / rem : 1; 4635 gap_ct = gap; 4636 thidx = n_th; 4637 if (update_master_only == 1) 4638 thidx = 1; 4639 for (f = 0; f < thidx; f++) { 4640 kmp_info_t *th = team->t.t_threads[f]; 4641 KMP_DEBUG_ASSERT(th != NULL); 4642 4643 th->th.th_first_place = place; 4644 th->th.th_new_place = place; 4645 s_count = 1; 4646 while (s_count < S) { 4647 if (place == last_place) { 4648 place = first_place; 4649 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4650 place = 0; 4651 } else { 4652 place++; 4653 } 4654 s_count++; 4655 } 4656 if (rem && (gap_ct == gap)) { 4657 if (place == last_place) { 4658 place = first_place; 4659 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4660 place = 0; 4661 } else { 4662 place++; 4663 } 4664 rem--; 4665 gap_ct = 0; 4666 } 4667 th->th.th_last_place = place; 4668 gap_ct++; 4669 4670 if (place == last_place) { 4671 place = first_place; 4672 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4673 place = 0; 4674 } else { 4675 place++; 4676 } 4677 4678 KA_TRACE(100, 4679 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4680 "partition = [%d,%d], __kmp_affinity_num_masks: %u\n", 4681 __kmp_gtid_from_thread(team->t.t_threads[f]), team->t.t_id, 4682 f, th->th.th_new_place, th->th.th_first_place, 4683 th->th.th_last_place, __kmp_affinity_num_masks)); 4684 } 4685 } else { 4686 /* Having uniform space of available computation places I can create 4687 T partitions of round(P/T) size and put threads into the first 4688 place of each partition. 
*/ 4689 double current = static_cast<double>(masters_place); 4690 double spacing = 4691 (static_cast<double>(n_places + 1) / static_cast<double>(n_th)); 4692 int first, last; 4693 kmp_info_t *th; 4694 4695 thidx = n_th + 1; 4696 if (update_master_only == 1) 4697 thidx = 1; 4698 for (f = 0; f < thidx; f++) { 4699 first = static_cast<int>(current); 4700 last = static_cast<int>(current + spacing) - 1; 4701 KMP_DEBUG_ASSERT(last >= first); 4702 if (first >= n_places) { 4703 if (masters_place) { 4704 first -= n_places; 4705 last -= n_places; 4706 if (first == (masters_place + 1)) { 4707 KMP_DEBUG_ASSERT(f == n_th); 4708 first--; 4709 } 4710 if (last == masters_place) { 4711 KMP_DEBUG_ASSERT(f == (n_th - 1)); 4712 last--; 4713 } 4714 } else { 4715 KMP_DEBUG_ASSERT(f == n_th); 4716 first = 0; 4717 last = 0; 4718 } 4719 } 4720 if (last >= n_places) { 4721 last = (n_places - 1); 4722 } 4723 place = first; 4724 current += spacing; 4725 if (f < n_th) { 4726 KMP_DEBUG_ASSERT(0 <= first); 4727 KMP_DEBUG_ASSERT(n_places > first); 4728 KMP_DEBUG_ASSERT(0 <= last); 4729 KMP_DEBUG_ASSERT(n_places > last); 4730 KMP_DEBUG_ASSERT(last_place >= first_place); 4731 th = team->t.t_threads[f]; 4732 KMP_DEBUG_ASSERT(th); 4733 th->th.th_first_place = first; 4734 th->th.th_new_place = place; 4735 th->th.th_last_place = last; 4736 4737 KA_TRACE(100, 4738 ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4739 "partition = [%d,%d], spacing = %.4f\n", 4740 __kmp_gtid_from_thread(team->t.t_threads[f]), 4741 team->t.t_id, f, th->th.th_new_place, 4742 th->th.th_first_place, th->th.th_last_place, spacing)); 4743 } 4744 } 4745 } 4746 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4747 } else { 4748 int S, rem, gap, s_count; 4749 S = n_th / n_places; 4750 s_count = 0; 4751 rem = n_th - (S * n_places); 4752 gap = rem > 0 ? 
n_places / rem : n_places; 4753 int place = masters_place; 4754 int gap_ct = gap; 4755 thidx = n_th; 4756 if (update_master_only == 1) 4757 thidx = 1; 4758 for (f = 0; f < thidx; f++) { 4759 kmp_info_t *th = team->t.t_threads[f]; 4760 KMP_DEBUG_ASSERT(th != NULL); 4761 4762 th->th.th_first_place = place; 4763 th->th.th_last_place = place; 4764 th->th.th_new_place = place; 4765 s_count++; 4766 4767 if ((s_count == S) && rem && (gap_ct == gap)) { 4768 // do nothing, add an extra thread to place on next iteration 4769 } else if ((s_count == S + 1) && rem && (gap_ct == gap)) { 4770 // we added an extra thread to this place; move on to next place 4771 if (place == last_place) { 4772 place = first_place; 4773 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4774 place = 0; 4775 } else { 4776 place++; 4777 } 4778 s_count = 0; 4779 gap_ct = 1; 4780 rem--; 4781 } else if (s_count == S) { // place is full; don't add extra thread 4782 if (place == last_place) { 4783 place = first_place; 4784 } else if (place == (int)(__kmp_affinity_num_masks - 1)) { 4785 place = 0; 4786 } else { 4787 place++; 4788 } 4789 gap_ct++; 4790 s_count = 0; 4791 } 4792 4793 KA_TRACE(100, ("__kmp_partition_places: spread: T#%d(%d:%d) place %d " 4794 "partition = [%d,%d]\n", 4795 __kmp_gtid_from_thread(team->t.t_threads[f]), 4796 team->t.t_id, f, th->th.th_new_place, 4797 th->th.th_first_place, th->th.th_last_place)); 4798 } 4799 KMP_DEBUG_ASSERT(update_master_only || place == masters_place); 4800 } 4801 } break; 4802 4803 default: 4804 break; 4805 } 4806 4807 KA_TRACE(20, ("__kmp_partition_places: exit T#%d\n", team->t.t_id)); 4808 } 4809 4810 #endif /* OMP_40_ENABLED && KMP_AFFINITY_SUPPORTED */ 4811 4812 /* allocate a new team data structure to use. take one off of the free pool if 4813 available */ 4814 kmp_team_t * 4815 __kmp_allocate_team(kmp_root_t *root, int new_nproc, int max_nproc, 4816 #if OMPT_SUPPORT 4817 ompt_data_t ompt_parallel_data, 4818 #endif 4819 #if OMP_40_ENABLED 4820 kmp_proc_bind_t new_proc_bind, 4821 #endif 4822 kmp_internal_control_t *new_icvs, 4823 int argc USE_NESTED_HOT_ARG(kmp_info_t *master)) { 4824 KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_allocate_team); 4825 int f; 4826 kmp_team_t *team; 4827 int use_hot_team = !root->r.r_active; 4828 int level = 0; 4829 4830 KA_TRACE(20, ("__kmp_allocate_team: called\n")); 4831 KMP_DEBUG_ASSERT(new_nproc >= 1 && argc >= 0); 4832 KMP_DEBUG_ASSERT(max_nproc >= new_nproc); 4833 KMP_MB(); 4834 4835 #if KMP_NESTED_HOT_TEAMS 4836 kmp_hot_team_ptr_t *hot_teams; 4837 if (master) { 4838 team = master->th.th_team; 4839 level = team->t.t_active_level; 4840 if (master->th.th_teams_microtask) { // in teams construct? 
4841 if (master->th.th_teams_size.nteams > 1 && 4842 ( // #teams > 1 4843 team->t.t_pkfn == 4844 (microtask_t)__kmp_teams_master || // inner fork of the teams 4845 master->th.th_teams_level < 4846 team->t.t_level)) { // or nested parallel inside the teams 4847 ++level; // not increment if #teams==1, or for outer fork of the teams; 4848 // increment otherwise 4849 } 4850 } 4851 hot_teams = master->th.th_hot_teams; 4852 if (level < __kmp_hot_teams_max_level && hot_teams && 4853 hot_teams[level] 4854 .hot_team) { // hot team has already been allocated for given level 4855 use_hot_team = 1; 4856 } else { 4857 use_hot_team = 0; 4858 } 4859 } 4860 #endif 4861 // Optimization to use a "hot" team 4862 if (use_hot_team && new_nproc > 1) { 4863 KMP_DEBUG_ASSERT(new_nproc == max_nproc); 4864 #if KMP_NESTED_HOT_TEAMS 4865 team = hot_teams[level].hot_team; 4866 #else 4867 team = root->r.r_hot_team; 4868 #endif 4869 #if KMP_DEBUG 4870 if (__kmp_tasking_mode != tskm_immediate_exec) { 4871 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 4872 "task_team[1] = %p before reinit\n", 4873 team->t.t_task_team[0], team->t.t_task_team[1])); 4874 } 4875 #endif 4876 4877 // Has the number of threads changed? 4878 /* Let's assume the most common case is that the number of threads is 4879 unchanged, and put that case first. */ 4880 if (team->t.t_nproc == new_nproc) { // Check changes in number of threads 4881 KA_TRACE(20, ("__kmp_allocate_team: reusing hot team\n")); 4882 // This case can mean that omp_set_num_threads() was called and the hot 4883 // team size was already reduced, so we check the special flag 4884 if (team->t.t_size_changed == -1) { 4885 team->t.t_size_changed = 1; 4886 } else { 4887 KMP_CHECK_UPDATE(team->t.t_size_changed, 0); 4888 } 4889 4890 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4891 kmp_r_sched_t new_sched = new_icvs->sched; 4892 // set master's schedule as new run-time schedule 4893 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_sched.sched); 4894 4895 __kmp_reinitialize_team(team, new_icvs, 4896 root->r.r_uber_thread->th.th_ident); 4897 4898 KF_TRACE(10, ("__kmp_allocate_team2: T#%d, this_thread=%p team=%p\n", 0, 4899 team->t.t_threads[0], team)); 4900 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4901 4902 #if OMP_40_ENABLED 4903 #if KMP_AFFINITY_SUPPORTED 4904 if ((team->t.t_size_changed == 0) && 4905 (team->t.t_proc_bind == new_proc_bind)) { 4906 if (new_proc_bind == proc_bind_spread) { 4907 __kmp_partition_places( 4908 team, 1); // add flag to update only master for spread 4909 } 4910 KA_TRACE(200, ("__kmp_allocate_team: reusing hot team #%d bindings: " 4911 "proc_bind = %d, partition = [%d,%d]\n", 4912 team->t.t_id, new_proc_bind, team->t.t_first_place, 4913 team->t.t_last_place)); 4914 } else { 4915 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4916 __kmp_partition_places(team); 4917 } 4918 #else 4919 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4920 #endif /* KMP_AFFINITY_SUPPORTED */ 4921 #endif /* OMP_40_ENABLED */ 4922 } else if (team->t.t_nproc > new_nproc) { 4923 KA_TRACE(20, 4924 ("__kmp_allocate_team: decreasing hot team thread count to %d\n", 4925 new_nproc)); 4926 4927 team->t.t_size_changed = 1; 4928 #if KMP_NESTED_HOT_TEAMS 4929 if (__kmp_hot_teams_mode == 0) { 4930 // AC: saved number of threads should correspond to team's value in this 4931 // mode, can be bigger in mode 1, when hot team has threads in reserve 4932 KMP_DEBUG_ASSERT(hot_teams[level].hot_team_nth == team->t.t_nproc); 4933 
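// NOTE: in mode 0 (this branch) the recorded hot team size tracks t_nproc, so it is
// reduced here and the surplus workers are released via __kmp_free_thread() just
// below. In mode 1 (the else branch further down) the extra threads stay attached to
// the hot team and are merely switched to wait on their own b_go flag, so they can
// be brought back cheaply if the team grows again.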
hot_teams[level].hot_team_nth = new_nproc; 4934 #endif // KMP_NESTED_HOT_TEAMS 4935 /* release the extra threads we don't need any more */ 4936 for (f = new_nproc; f < team->t.t_nproc; f++) { 4937 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 4938 if (__kmp_tasking_mode != tskm_immediate_exec) { 4939 // When decreasing team size, threads no longer in the team should 4940 // unref task team. 4941 team->t.t_threads[f]->th.th_task_team = NULL; 4942 } 4943 __kmp_free_thread(team->t.t_threads[f]); 4944 team->t.t_threads[f] = NULL; 4945 } 4946 #if KMP_NESTED_HOT_TEAMS 4947 } // (__kmp_hot_teams_mode == 0) 4948 else { 4949 // When keeping extra threads in team, switch threads to wait on own 4950 // b_go flag 4951 for (f = new_nproc; f < team->t.t_nproc; ++f) { 4952 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 4953 kmp_balign_t *balign = team->t.t_threads[f]->th.th_bar; 4954 for (int b = 0; b < bs_last_barrier; ++b) { 4955 if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG) { 4956 balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG; 4957 } 4958 KMP_CHECK_UPDATE(balign[b].bb.leaf_kids, 0); 4959 } 4960 } 4961 } 4962 #endif // KMP_NESTED_HOT_TEAMS 4963 team->t.t_nproc = new_nproc; 4964 // TODO???: team->t.t_max_active_levels = new_max_active_levels; 4965 KMP_CHECK_UPDATE(team->t.t_sched.sched, new_icvs->sched.sched); 4966 __kmp_reinitialize_team(team, new_icvs, 4967 root->r.r_uber_thread->th.th_ident); 4968 4969 /* update the remaining threads */ 4970 for (f = 0; f < new_nproc; ++f) { 4971 team->t.t_threads[f]->th.th_team_nproc = new_nproc; 4972 } 4973 // restore the current task state of the master thread: should be the 4974 // implicit task 4975 KF_TRACE(10, ("__kmp_allocate_team: T#%d, this_thread=%p team=%p\n", 0, 4976 team->t.t_threads[0], team)); 4977 4978 __kmp_push_current_task_to_thread(team->t.t_threads[0], team, 0); 4979 4980 #ifdef KMP_DEBUG 4981 for (f = 0; f < team->t.t_nproc; f++) { 4982 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 4983 team->t.t_threads[f]->th.th_team_nproc == 4984 team->t.t_nproc); 4985 } 4986 #endif 4987 4988 #if OMP_40_ENABLED 4989 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 4990 #if KMP_AFFINITY_SUPPORTED 4991 __kmp_partition_places(team); 4992 #endif 4993 #endif 4994 } else { // team->t.t_nproc < new_nproc 4995 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 4996 kmp_affin_mask_t *old_mask; 4997 if (KMP_AFFINITY_CAPABLE()) { 4998 KMP_CPU_ALLOC(old_mask); 4999 } 5000 #endif 5001 5002 KA_TRACE(20, 5003 ("__kmp_allocate_team: increasing hot team thread count to %d\n", 5004 new_nproc)); 5005 5006 team->t.t_size_changed = 1; 5007 5008 #if KMP_NESTED_HOT_TEAMS 5009 int avail_threads = hot_teams[level].hot_team_nth; 5010 if (new_nproc < avail_threads) 5011 avail_threads = new_nproc; 5012 kmp_info_t **other_threads = team->t.t_threads; 5013 for (f = team->t.t_nproc; f < avail_threads; ++f) { 5014 // Adjust barrier data of reserved threads (if any) of the team 5015 // Other data will be set in __kmp_initialize_info() below. 
5016 int b; 5017 kmp_balign_t *balign = other_threads[f]->th.th_bar; 5018 for (b = 0; b < bs_last_barrier; ++b) { 5019 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5020 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5021 #if USE_DEBUGGER 5022 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5023 #endif 5024 } 5025 } 5026 if (hot_teams[level].hot_team_nth >= new_nproc) { 5027 // we have all needed threads in reserve, no need to allocate any 5028 // this only possible in mode 1, cannot have reserved threads in mode 0 5029 KMP_DEBUG_ASSERT(__kmp_hot_teams_mode == 1); 5030 team->t.t_nproc = new_nproc; // just get reserved threads involved 5031 } else { 5032 // we may have some threads in reserve, but not enough 5033 team->t.t_nproc = 5034 hot_teams[level] 5035 .hot_team_nth; // get reserved threads involved if any 5036 hot_teams[level].hot_team_nth = new_nproc; // adjust hot team max size 5037 #endif // KMP_NESTED_HOT_TEAMS 5038 if (team->t.t_max_nproc < new_nproc) { 5039 /* reallocate larger arrays */ 5040 __kmp_reallocate_team_arrays(team, new_nproc); 5041 __kmp_reinitialize_team(team, new_icvs, NULL); 5042 } 5043 5044 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5045 /* Temporarily set full mask for master thread before creation of 5046 workers. The reason is that workers inherit the affinity from master, 5047 so if a lot of workers are created on the single core quickly, they 5048 don't get a chance to set their own affinity for a long time. */ 5049 __kmp_set_thread_affinity_mask_full_tmp(old_mask); 5050 #endif 5051 5052 /* allocate new threads for the hot team */ 5053 for (f = team->t.t_nproc; f < new_nproc; f++) { 5054 kmp_info_t *new_worker = __kmp_allocate_thread(root, team, f); 5055 KMP_DEBUG_ASSERT(new_worker); 5056 team->t.t_threads[f] = new_worker; 5057 5058 KA_TRACE(20, 5059 ("__kmp_allocate_team: team %d init T#%d arrived: " 5060 "join=%llu, plain=%llu\n", 5061 team->t.t_id, __kmp_gtid_from_tid(f, team), team->t.t_id, f, 5062 team->t.t_bar[bs_forkjoin_barrier].b_arrived, 5063 team->t.t_bar[bs_plain_barrier].b_arrived)); 5064 5065 { // Initialize barrier data for new threads. 5066 int b; 5067 kmp_balign_t *balign = new_worker->th.th_bar; 5068 for (b = 0; b < bs_last_barrier; ++b) { 5069 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5070 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != 5071 KMP_BARRIER_PARENT_FLAG); 5072 #if USE_DEBUGGER 5073 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5074 #endif 5075 } 5076 } 5077 } 5078 5079 #if KMP_OS_LINUX && KMP_AFFINITY_SUPPORTED 5080 if (KMP_AFFINITY_CAPABLE()) { 5081 /* Restore initial master thread's affinity mask */ 5082 __kmp_set_system_affinity(old_mask, TRUE); 5083 KMP_CPU_FREE(old_mask); 5084 } 5085 #endif 5086 #if KMP_NESTED_HOT_TEAMS 5087 } // end of check of t_nproc vs. new_nproc vs. 
hot_team_nth 5088 #endif // KMP_NESTED_HOT_TEAMS 5089 /* make sure everyone is syncronized */ 5090 int old_nproc = team->t.t_nproc; // save old value and use to update only 5091 // new threads below 5092 __kmp_initialize_team(team, new_nproc, new_icvs, 5093 root->r.r_uber_thread->th.th_ident); 5094 5095 /* reinitialize the threads */ 5096 KMP_DEBUG_ASSERT(team->t.t_nproc == new_nproc); 5097 for (f = 0; f < team->t.t_nproc; ++f) 5098 __kmp_initialize_info(team->t.t_threads[f], team, f, 5099 __kmp_gtid_from_tid(f, team)); 5100 if (level) { // set th_task_state for new threads in nested hot team 5101 // __kmp_initialize_info() no longer zeroes th_task_state, so we should 5102 // only need to set the th_task_state for the new threads. th_task_state 5103 // for master thread will not be accurate until after this in 5104 // __kmp_fork_call(), so we look to the master's memo_stack to get the 5105 // correct value. 5106 for (f = old_nproc; f < team->t.t_nproc; ++f) 5107 team->t.t_threads[f]->th.th_task_state = 5108 team->t.t_threads[0]->th.th_task_state_memo_stack[level]; 5109 } else { // set th_task_state for new threads in non-nested hot team 5110 int old_state = 5111 team->t.t_threads[0]->th.th_task_state; // copy master's state 5112 for (f = old_nproc; f < team->t.t_nproc; ++f) 5113 team->t.t_threads[f]->th.th_task_state = old_state; 5114 } 5115 5116 #ifdef KMP_DEBUG 5117 for (f = 0; f < team->t.t_nproc; ++f) { 5118 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 5119 team->t.t_threads[f]->th.th_team_nproc == 5120 team->t.t_nproc); 5121 } 5122 #endif 5123 5124 #if OMP_40_ENABLED 5125 KMP_CHECK_UPDATE(team->t.t_proc_bind, new_proc_bind); 5126 #if KMP_AFFINITY_SUPPORTED 5127 __kmp_partition_places(team); 5128 #endif 5129 #endif 5130 } // Check changes in number of threads 5131 5132 #if OMP_40_ENABLED 5133 kmp_info_t *master = team->t.t_threads[0]; 5134 if (master->th.th_teams_microtask) { 5135 for (f = 1; f < new_nproc; ++f) { 5136 // propagate teams construct specific info to workers 5137 kmp_info_t *thr = team->t.t_threads[f]; 5138 thr->th.th_teams_microtask = master->th.th_teams_microtask; 5139 thr->th.th_teams_level = master->th.th_teams_level; 5140 thr->th.th_teams_size = master->th.th_teams_size; 5141 } 5142 } 5143 #endif /* OMP_40_ENABLED */ 5144 #if KMP_NESTED_HOT_TEAMS 5145 if (level) { 5146 // Sync barrier state for nested hot teams, not needed for outermost hot 5147 // team. 5148 for (f = 1; f < new_nproc; ++f) { 5149 kmp_info_t *thr = team->t.t_threads[f]; 5150 int b; 5151 kmp_balign_t *balign = thr->th.th_bar; 5152 for (b = 0; b < bs_last_barrier; ++b) { 5153 balign[b].bb.b_arrived = team->t.t_bar[b].b_arrived; 5154 KMP_DEBUG_ASSERT(balign[b].bb.wait_flag != KMP_BARRIER_PARENT_FLAG); 5155 #if USE_DEBUGGER 5156 balign[b].bb.b_worker_arrived = team->t.t_bar[b].b_team_arrived; 5157 #endif 5158 } 5159 } 5160 } 5161 #endif // KMP_NESTED_HOT_TEAMS 5162 5163 /* reallocate space for arguments if necessary */ 5164 __kmp_alloc_argv_entries(argc, team, TRUE); 5165 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5166 // The hot team re-uses the previous task team, 5167 // if untouched during the previous release->gather phase. 
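    // NOTE (added commentary, illustrative only -- not upstream logic): the
    // reuse path above writes shared team fields through KMP_CHECK_UPDATE()
    // rather than plain assignment so that fields which did not change do not
    // dirty a cache line other threads may be reading. Conceptually the macro
    // behaves roughly like
    //
    //   if (lhs != rhs)
    //     lhs = rhs;
    //
    // (see its actual definition in kmp.h), which is why it is used for
    // fields such as t_sched.sched, t_proc_bind and t_argc when a hot team is
    // recycled.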
5168 5169 KF_TRACE(10, (" hot_team = %p\n", team)); 5170 5171 #if KMP_DEBUG 5172 if (__kmp_tasking_mode != tskm_immediate_exec) { 5173 KA_TRACE(20, ("__kmp_allocate_team: hot team task_team[0] = %p " 5174 "task_team[1] = %p after reinit\n", 5175 team->t.t_task_team[0], team->t.t_task_team[1])); 5176 } 5177 #endif 5178 5179 #if OMPT_SUPPORT 5180 __ompt_team_assign_id(team, ompt_parallel_data); 5181 #endif 5182 5183 KMP_MB(); 5184 5185 return team; 5186 } 5187 5188 /* next, let's try to take one from the team pool */ 5189 KMP_MB(); 5190 for (team = CCAST(kmp_team_t *, __kmp_team_pool); (team);) { 5191 /* TODO: consider resizing undersized teams instead of reaping them, now 5192 that we have a resizing mechanism */ 5193 if (team->t.t_max_nproc >= max_nproc) { 5194 /* take this team from the team pool */ 5195 __kmp_team_pool = team->t.t_next_pool; 5196 5197 /* setup the team for fresh use */ 5198 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5199 5200 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and " 5201 "task_team[1] %p to NULL\n", 5202 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5203 team->t.t_task_team[0] = NULL; 5204 team->t.t_task_team[1] = NULL; 5205 5206 /* reallocate space for arguments if necessary */ 5207 __kmp_alloc_argv_entries(argc, team, TRUE); 5208 KMP_CHECK_UPDATE(team->t.t_argc, argc); 5209 5210 KA_TRACE( 5211 20, ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5212 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5213 { // Initialize barrier data. 5214 int b; 5215 for (b = 0; b < bs_last_barrier; ++b) { 5216 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5217 #if USE_DEBUGGER 5218 team->t.t_bar[b].b_master_arrived = 0; 5219 team->t.t_bar[b].b_team_arrived = 0; 5220 #endif 5221 } 5222 } 5223 5224 #if OMP_40_ENABLED 5225 team->t.t_proc_bind = new_proc_bind; 5226 #endif 5227 5228 KA_TRACE(20, ("__kmp_allocate_team: using team from pool %d.\n", 5229 team->t.t_id)); 5230 5231 #if OMPT_SUPPORT 5232 __ompt_team_assign_id(team, ompt_parallel_data); 5233 #endif 5234 5235 KMP_MB(); 5236 5237 return team; 5238 } 5239 5240 /* reap team if it is too small, then loop back and check the next one */ 5241 // not sure if this is wise, but, will be redone during the hot-teams 5242 // rewrite. 5243 /* TODO: Use technique to find the right size hot-team, don't reap them */ 5244 team = __kmp_reap_team(team); 5245 __kmp_team_pool = team; 5246 } 5247 5248 /* nothing available in the pool, no matter, make a new team! 
*/ 5249 KMP_MB(); 5250 team = (kmp_team_t *)__kmp_allocate(sizeof(kmp_team_t)); 5251 5252 /* and set it up */ 5253 team->t.t_max_nproc = max_nproc; 5254 /* NOTE well, for some reason allocating one big buffer and dividing it up 5255 seems to really hurt performance a lot on the P4, so, let's not use this */ 5256 __kmp_allocate_team_arrays(team, max_nproc); 5257 5258 KA_TRACE(20, ("__kmp_allocate_team: making a new team\n")); 5259 __kmp_initialize_team(team, new_nproc, new_icvs, NULL); 5260 5261 KA_TRACE(20, ("__kmp_allocate_team: setting task_team[0] %p and task_team[1] " 5262 "%p to NULL\n", 5263 &team->t.t_task_team[0], &team->t.t_task_team[1])); 5264 team->t.t_task_team[0] = NULL; // to be removed, as __kmp_allocate zeroes 5265 // memory, no need to duplicate 5266 team->t.t_task_team[1] = NULL; // to be removed, as __kmp_allocate zeroes 5267 // memory, no need to duplicate 5268 5269 if (__kmp_storage_map) { 5270 __kmp_print_team_storage_map("team", team, team->t.t_id, new_nproc); 5271 } 5272 5273 /* allocate space for arguments */ 5274 __kmp_alloc_argv_entries(argc, team, FALSE); 5275 team->t.t_argc = argc; 5276 5277 KA_TRACE(20, 5278 ("__kmp_allocate_team: team %d init arrived: join=%u, plain=%u\n", 5279 team->t.t_id, KMP_INIT_BARRIER_STATE, KMP_INIT_BARRIER_STATE)); 5280 { // Initialize barrier data. 5281 int b; 5282 for (b = 0; b < bs_last_barrier; ++b) { 5283 team->t.t_bar[b].b_arrived = KMP_INIT_BARRIER_STATE; 5284 #if USE_DEBUGGER 5285 team->t.t_bar[b].b_master_arrived = 0; 5286 team->t.t_bar[b].b_team_arrived = 0; 5287 #endif 5288 } 5289 } 5290 5291 #if OMP_40_ENABLED 5292 team->t.t_proc_bind = new_proc_bind; 5293 #endif 5294 5295 #if OMPT_SUPPORT 5296 __ompt_team_assign_id(team, ompt_parallel_data); 5297 team->t.ompt_serialized_team_info = NULL; 5298 #endif 5299 5300 KMP_MB(); 5301 5302 KA_TRACE(20, ("__kmp_allocate_team: done creating a new team %d.\n", 5303 team->t.t_id)); 5304 5305 return team; 5306 } 5307 5308 /* TODO implement hot-teams at all levels */ 5309 /* TODO implement lazy thread release on demand (disband request) */ 5310 5311 /* free the team. return it to the team pool. release all the threads 5312 * associated with it */ 5313 void __kmp_free_team(kmp_root_t *root, 5314 kmp_team_t *team USE_NESTED_HOT_ARG(kmp_info_t *master)) { 5315 int f; 5316 KA_TRACE(20, ("__kmp_free_team: T#%d freeing team %d\n", __kmp_get_gtid(), 5317 team->t.t_id)); 5318 5319 /* verify state */ 5320 KMP_DEBUG_ASSERT(root); 5321 KMP_DEBUG_ASSERT(team); 5322 KMP_DEBUG_ASSERT(team->t.t_nproc <= team->t.t_max_nproc); 5323 KMP_DEBUG_ASSERT(team->t.t_threads); 5324 5325 int use_hot_team = team == root->r.r_hot_team; 5326 #if KMP_NESTED_HOT_TEAMS 5327 int level; 5328 kmp_hot_team_ptr_t *hot_teams; 5329 if (master) { 5330 level = team->t.t_active_level - 1; 5331 if (master->th.th_teams_microtask) { // in teams construct? 
5332 if (master->th.th_teams_size.nteams > 1) { 5333 ++level; // level was not increased in teams construct for 5334 // team_of_masters 5335 } 5336 if (team->t.t_pkfn != (microtask_t)__kmp_teams_master && 5337 master->th.th_teams_level == team->t.t_level) { 5338 ++level; // level was not increased in teams construct for 5339 // team_of_workers before the parallel 5340 } // team->t.t_level will be increased inside parallel 5341 } 5342 hot_teams = master->th.th_hot_teams; 5343 if (level < __kmp_hot_teams_max_level) { 5344 KMP_DEBUG_ASSERT(team == hot_teams[level].hot_team); 5345 use_hot_team = 1; 5346 } 5347 } 5348 #endif // KMP_NESTED_HOT_TEAMS 5349 5350 /* team is done working */ 5351 TCW_SYNC_PTR(team->t.t_pkfn, 5352 NULL); // Important for Debugging Support Library. 5353 #if KMP_OS_WINDOWS 5354 team->t.t_copyin_counter = 0; // init counter for possible reuse 5355 #endif 5356 // Do not reset pointer to parent team to NULL for hot teams. 5357 5358 /* if we are non-hot team, release our threads */ 5359 if (!use_hot_team) { 5360 if (__kmp_tasking_mode != tskm_immediate_exec) { 5361 // Wait for threads to reach reapable state 5362 for (f = 1; f < team->t.t_nproc; ++f) { 5363 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5364 kmp_info_t *th = team->t.t_threads[f]; 5365 volatile kmp_uint32 *state = &th->th.th_reap_state; 5366 while (*state != KMP_SAFE_TO_REAP) { 5367 #if KMP_OS_WINDOWS 5368 // On Windows a thread can be killed at any time, check this 5369 DWORD ecode; 5370 if (!__kmp_is_thread_alive(th, &ecode)) { 5371 *state = KMP_SAFE_TO_REAP; // reset the flag for dead thread 5372 break; 5373 } 5374 #endif 5375 // first check if thread is sleeping 5376 kmp_flag_64 fl(&th->th.th_bar[bs_forkjoin_barrier].bb.b_go, th); 5377 if (fl.is_sleeping()) 5378 fl.resume(__kmp_gtid_from_thread(th)); 5379 KMP_CPU_PAUSE(); 5380 } 5381 } 5382 5383 // Delete task teams 5384 int tt_idx; 5385 for (tt_idx = 0; tt_idx < 2; ++tt_idx) { 5386 kmp_task_team_t *task_team = team->t.t_task_team[tt_idx]; 5387 if (task_team != NULL) { 5388 for (f = 0; f < team->t.t_nproc; 5389 ++f) { // Have all threads unref task teams 5390 team->t.t_threads[f]->th.th_task_team = NULL; 5391 } 5392 KA_TRACE( 5393 20, 5394 ("__kmp_free_team: T#%d deactivating task_team %p on team %d\n", 5395 __kmp_get_gtid(), task_team, team->t.t_id)); 5396 #if KMP_NESTED_HOT_TEAMS 5397 __kmp_free_task_team(master, task_team); 5398 #endif 5399 team->t.t_task_team[tt_idx] = NULL; 5400 } 5401 } 5402 } 5403 5404 // Reset pointer to parent team only for non-hot teams. 5405 team->t.t_parent = NULL; 5406 team->t.t_level = 0; 5407 team->t.t_active_level = 0; 5408 5409 /* free the worker threads */ 5410 for (f = 1; f < team->t.t_nproc; ++f) { 5411 KMP_DEBUG_ASSERT(team->t.t_threads[f]); 5412 __kmp_free_thread(team->t.t_threads[f]); 5413 team->t.t_threads[f] = NULL; 5414 } 5415 5416 /* put the team back in the team pool */ 5417 /* TODO limit size of team pool, call reap_team if pool too large */ 5418 team->t.t_next_pool = CCAST(kmp_team_t *, __kmp_team_pool); 5419 __kmp_team_pool = (volatile kmp_team_t *)team; 5420 } 5421 5422 KMP_MB(); 5423 } 5424 5425 /* reap the team. 
   destroy it, reclaim all its resources and free its memory */
kmp_team_t *__kmp_reap_team(kmp_team_t *team) {
  kmp_team_t *next_pool = team->t.t_next_pool;

  KMP_DEBUG_ASSERT(team);
  KMP_DEBUG_ASSERT(team->t.t_dispatch);
  KMP_DEBUG_ASSERT(team->t.t_disp_buffer);
  KMP_DEBUG_ASSERT(team->t.t_threads);
  KMP_DEBUG_ASSERT(team->t.t_argv);

  /* TODO clean the threads that are a part of this? */

  /* free stuff */
  __kmp_free_team_arrays(team);
  if (team->t.t_argv != &team->t.t_inline_argv[0])
    __kmp_free((void *)team->t.t_argv);
  __kmp_free(team);

  KMP_MB();
  return next_pool;
}

// Free the thread. Don't reap it, just place it on the pool of available
// threads.
//
// Changes for Quad issue 527845: We need a predictable OMP tid <-> gtid
// binding for the affinity mechanism to be useful.
//
// Now, we always keep the free list (__kmp_thread_pool) sorted by gtid.
// However, we want to avoid a potential performance problem by always
// scanning through the list to find the correct point at which to insert
// the thread (potential N**2 behavior). To do this we keep track of the
// last place a thread struct was inserted (__kmp_thread_pool_insert_pt).
// With single-level parallelism, threads will always be added to the tail
// of the list, kept track of by __kmp_thread_pool_insert_pt. With nested
// parallelism, all bets are off and we may need to scan through the entire
// free list.
//
// This change also has a potentially large performance benefit, for some
// applications. Previously, as threads were freed from the hot team, they
// would be placed back on the free list in inverse order. If the hot team
// grew back to its original size, then the freed thread would be placed
// back on the hot team in reverse order. This could cause bad cache
// locality problems on programs where the size of the hot team regularly
// grew and shrunk.
//
// Now, for single-level parallelism, the OMP tid is always == gtid.
void __kmp_free_thread(kmp_info_t *this_th) {
  int gtid;
  kmp_info_t **scan;
  kmp_root_t *root = this_th->th.th_root;

  KA_TRACE(20, ("__kmp_free_thread: T#%d putting T#%d back on free pool.\n",
                __kmp_get_gtid(), this_th->th.th_info.ds.ds_gtid));

  KMP_DEBUG_ASSERT(this_th);

  // When moving thread to pool, switch thread to wait on own b_go flag, and
  // uninitialized (NULL team).
  int b;
  kmp_balign_t *balign = this_th->th.th_bar;
  for (b = 0; b < bs_last_barrier; ++b) {
    if (balign[b].bb.wait_flag == KMP_BARRIER_PARENT_FLAG)
      balign[b].bb.wait_flag = KMP_BARRIER_SWITCH_TO_OWN_FLAG;
    balign[b].bb.team = NULL;
    balign[b].bb.leaf_kids = 0;
  }
  this_th->th.th_task_state = 0;
  this_th->th.th_reap_state = KMP_SAFE_TO_REAP;

  /* put thread back on the free pool */
  TCW_PTR(this_th->th.th_team, NULL);
  TCW_PTR(this_th->th.th_root, NULL);
  TCW_PTR(this_th->th.th_dispatch, NULL); /* NOT NEEDED */

  /* If the implicit task assigned to this thread can be used by other threads
   * -> multiple threads can share the data and try to free the task at
   * __kmp_reap_thread at exit.
   * This duplicate use of the task data can happen with higher probability
   * when the hot team is disabled, but it can occur even when the hot team
   * is enabled. */
  __kmp_free_implicit_task(this_th);
  this_th->th.th_current_task = NULL;

  // If the __kmp_thread_pool_insert_pt is already past the new insert
  // point, then we need to re-scan the entire list.
  gtid = this_th->th.th_info.ds.ds_gtid;
  if (__kmp_thread_pool_insert_pt != NULL) {
    KMP_DEBUG_ASSERT(__kmp_thread_pool != NULL);
    if (__kmp_thread_pool_insert_pt->th.th_info.ds.ds_gtid > gtid) {
      __kmp_thread_pool_insert_pt = NULL;
    }
  }

  // Scan down the list to find the place to insert the thread.
  // scan is the address of a link in the list, possibly the address of
  // __kmp_thread_pool itself.
  //
  // In the absence of nested parallelism, the for loop will have 0 iterations.
  if (__kmp_thread_pool_insert_pt != NULL) {
    scan = &(__kmp_thread_pool_insert_pt->th.th_next_pool);
  } else {
    scan = CCAST(kmp_info_t **, &__kmp_thread_pool);
  }
  for (; (*scan != NULL) && ((*scan)->th.th_info.ds.ds_gtid < gtid);
       scan = &((*scan)->th.th_next_pool))
    ;

  // Insert the new element on the list, and set __kmp_thread_pool_insert_pt
  // to its address.
  TCW_PTR(this_th->th.th_next_pool, *scan);
  __kmp_thread_pool_insert_pt = *scan = this_th;
  KMP_DEBUG_ASSERT((this_th->th.th_next_pool == NULL) ||
                   (this_th->th.th_info.ds.ds_gtid <
                    this_th->th.th_next_pool->th.th_info.ds.ds_gtid));
  TCW_4(this_th->th.th_in_pool, TRUE);
  __kmp_thread_pool_nth++;

  TCW_4(__kmp_nth, __kmp_nth - 1);
  root->r.r_cg_nthreads--;

#ifdef KMP_ADJUST_BLOCKTIME
  /* Adjust blocktime back to user setting or default if necessary */
  /* Middle initialization might never have occurred */
  if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) {
    KMP_DEBUG_ASSERT(__kmp_avail_proc > 0);
    if (__kmp_nth <= __kmp_avail_proc) {
      __kmp_zero_bt = FALSE;
    }
  }
#endif /* KMP_ADJUST_BLOCKTIME */

  KMP_MB();
}

/* ------------------------------------------------------------------------ */

void *__kmp_launch_thread(kmp_info_t *this_thr) {
  int gtid = this_thr->th.th_info.ds.ds_gtid;
  /* void *stack_data;*/
  kmp_team_t *(*volatile pteam);

  KMP_MB();
  KA_TRACE(10, ("__kmp_launch_thread: T#%d start\n", gtid));

  if (__kmp_env_consistency_check) {
    this_thr->th.th_cons = __kmp_allocate_cons_stack(gtid); // ATT: Memory leak?
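    // NOTE (added commentary, not upstream code): the consistency-check stack
    // allocated above is released again in __kmp_reap_thread() via
    // __kmp_free_cons_stack() once the worker is reaped, so on the normal
    // shutdown path this allocation is not leaked.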
5571 } 5572 5573 #if OMPT_SUPPORT 5574 ompt_data_t *thread_data; 5575 if (ompt_enabled.enabled) { 5576 thread_data = &(this_thr->th.ompt_thread_info.thread_data); 5577 thread_data->ptr = NULL; 5578 5579 this_thr->th.ompt_thread_info.state = omp_state_overhead; 5580 this_thr->th.ompt_thread_info.wait_id = 0; 5581 this_thr->th.ompt_thread_info.idle_frame = OMPT_GET_FRAME_ADDRESS(0); 5582 if (ompt_enabled.ompt_callback_thread_begin) { 5583 ompt_callbacks.ompt_callback(ompt_callback_thread_begin)( 5584 ompt_thread_worker, thread_data); 5585 } 5586 } 5587 #endif 5588 5589 #if OMPT_SUPPORT 5590 if (ompt_enabled.enabled) { 5591 this_thr->th.ompt_thread_info.state = omp_state_idle; 5592 } 5593 #endif 5594 /* This is the place where threads wait for work */ 5595 while (!TCR_4(__kmp_global.g.g_done)) { 5596 KMP_DEBUG_ASSERT(this_thr == __kmp_threads[gtid]); 5597 KMP_MB(); 5598 5599 /* wait for work to do */ 5600 KA_TRACE(20, ("__kmp_launch_thread: T#%d waiting for work\n", gtid)); 5601 5602 /* No tid yet since not part of a team */ 5603 __kmp_fork_barrier(gtid, KMP_GTID_DNE); 5604 5605 #if OMPT_SUPPORT 5606 if (ompt_enabled.enabled) { 5607 this_thr->th.ompt_thread_info.state = omp_state_overhead; 5608 } 5609 #endif 5610 5611 pteam = (kmp_team_t * (*))(&this_thr->th.th_team); 5612 5613 /* have we been allocated? */ 5614 if (TCR_SYNC_PTR(*pteam) && !TCR_4(__kmp_global.g.g_done)) { 5615 /* we were just woken up, so run our new task */ 5616 if (TCR_SYNC_PTR((*pteam)->t.t_pkfn) != NULL) { 5617 int rc; 5618 KA_TRACE(20, 5619 ("__kmp_launch_thread: T#%d(%d:%d) invoke microtask = %p\n", 5620 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5621 (*pteam)->t.t_pkfn)); 5622 5623 updateHWFPControl(*pteam); 5624 5625 #if OMPT_SUPPORT 5626 if (ompt_enabled.enabled) { 5627 this_thr->th.ompt_thread_info.state = omp_state_work_parallel; 5628 } 5629 #endif 5630 5631 rc = (*pteam)->t.t_invoke(gtid); 5632 KMP_ASSERT(rc); 5633 5634 KMP_MB(); 5635 KA_TRACE(20, ("__kmp_launch_thread: T#%d(%d:%d) done microtask = %p\n", 5636 gtid, (*pteam)->t.t_id, __kmp_tid_from_gtid(gtid), 5637 (*pteam)->t.t_pkfn)); 5638 } 5639 #if OMPT_SUPPORT 5640 if (ompt_enabled.enabled) { 5641 /* no frame set while outside task */ 5642 __ompt_get_task_info_object(0)->frame.exit_frame = NULL; 5643 5644 this_thr->th.ompt_thread_info.state = omp_state_overhead; 5645 } 5646 #endif 5647 /* join barrier after parallel region */ 5648 __kmp_join_barrier(gtid); 5649 } 5650 } 5651 TCR_SYNC_PTR((intptr_t)__kmp_global.g.g_done); 5652 5653 #if OMPT_SUPPORT 5654 if (ompt_enabled.ompt_callback_thread_end) { 5655 ompt_callbacks.ompt_callback(ompt_callback_thread_end)(thread_data); 5656 } 5657 #endif 5658 5659 this_thr->th.th_task_team = NULL; 5660 /* run the destructors for the threadprivate data for this thread */ 5661 __kmp_common_destroy_gtid(gtid); 5662 5663 KA_TRACE(10, ("__kmp_launch_thread: T#%d done\n", gtid)); 5664 KMP_MB(); 5665 return this_thr; 5666 } 5667 5668 /* ------------------------------------------------------------------------ */ 5669 5670 void __kmp_internal_end_dest(void *specific_gtid) { 5671 #if KMP_COMPILER_ICC 5672 #pragma warning(push) 5673 #pragma warning(disable : 810) // conversion from "void *" to "int" may lose 5674 // significant bits 5675 #endif 5676 // Make sure no significant bits are lost 5677 int gtid = (kmp_intptr_t)specific_gtid - 1; 5678 #if KMP_COMPILER_ICC 5679 #pragma warning(pop) 5680 #endif 5681 5682 KA_TRACE(30, ("__kmp_internal_end_dest: T#%d\n", gtid)); 5683 /* NOTE: the gtid is stored as gitd+1 in the 
thread-local-storage 5684 * this is because 0 is reserved for the nothing-stored case */ 5685 5686 /* josh: One reason for setting the gtid specific data even when it is being 5687 destroyed by pthread is to allow gtid lookup through thread specific data 5688 (__kmp_gtid_get_specific). Some of the code, especially stat code, 5689 that gets executed in the call to __kmp_internal_end_thread, actually 5690 gets the gtid through the thread specific data. Setting it here seems 5691 rather inelegant and perhaps wrong, but allows __kmp_internal_end_thread 5692 to run smoothly. 5693 todo: get rid of this after we remove the dependence on 5694 __kmp_gtid_get_specific */ 5695 if (gtid >= 0 && KMP_UBER_GTID(gtid)) 5696 __kmp_gtid_set_specific(gtid); 5697 #ifdef KMP_TDATA_GTID 5698 __kmp_gtid = gtid; 5699 #endif 5700 __kmp_internal_end_thread(gtid); 5701 } 5702 5703 #if KMP_OS_UNIX && KMP_DYNAMIC_LIB 5704 5705 // 2009-09-08 (lev): It looks the destructor does not work. In simple test cases 5706 // destructors work perfectly, but in real libomp.so I have no evidence it is 5707 // ever called. However, -fini linker option in makefile.mk works fine. 5708 5709 __attribute__((destructor)) void __kmp_internal_end_dtor(void) { 5710 __kmp_internal_end_atexit(); 5711 } 5712 5713 void __kmp_internal_end_fini(void) { __kmp_internal_end_atexit(); } 5714 5715 #endif 5716 5717 /* [Windows] josh: when the atexit handler is called, there may still be more 5718 than one thread alive */ 5719 void __kmp_internal_end_atexit(void) { 5720 KA_TRACE(30, ("__kmp_internal_end_atexit\n")); 5721 /* [Windows] 5722 josh: ideally, we want to completely shutdown the library in this atexit 5723 handler, but stat code that depends on thread specific data for gtid fails 5724 because that data becomes unavailable at some point during the shutdown, so 5725 we call __kmp_internal_end_thread instead. We should eventually remove the 5726 dependency on __kmp_get_specific_gtid in the stat code and use 5727 __kmp_internal_end_library to cleanly shutdown the library. 5728 5729 // TODO: Can some of this comment about GVS be removed? 5730 I suspect that the offending stat code is executed when the calling thread 5731 tries to clean up a dead root thread's data structures, resulting in GVS 5732 code trying to close the GVS structures for that thread, but since the stat 5733 code uses __kmp_get_specific_gtid to get the gtid with the assumption that 5734 the calling thread is cleaning up itself instead of another thread, it get 5735 confused. This happens because allowing a thread to unregister and cleanup 5736 another thread is a recent modification for addressing an issue. 5737 Based on the current design (20050722), a thread may end up 5738 trying to unregister another thread only if thread death does not trigger 5739 the calling of __kmp_internal_end_thread. For Linux* OS, there is the 5740 thread specific data destructor function to detect thread death. For 5741 Windows dynamic, there is DllMain(THREAD_DETACH). For Windows static, there 5742 is nothing. Thus, the workaround is applicable only for Windows static 5743 stat library. */ 5744 __kmp_internal_end_library(-1); 5745 #if KMP_OS_WINDOWS 5746 __kmp_close_console(); 5747 #endif 5748 } 5749 5750 static void __kmp_reap_thread(kmp_info_t *thread, int is_root) { 5751 // It is assumed __kmp_forkjoin_lock is acquired. 
5752 5753 int gtid; 5754 5755 KMP_DEBUG_ASSERT(thread != NULL); 5756 5757 gtid = thread->th.th_info.ds.ds_gtid; 5758 5759 if (!is_root) { 5760 5761 if (__kmp_dflt_blocktime != KMP_MAX_BLOCKTIME) { 5762 /* Assume the threads are at the fork barrier here */ 5763 KA_TRACE( 5764 20, ("__kmp_reap_thread: releasing T#%d from fork barrier for reap\n", 5765 gtid)); 5766 /* Need release fence here to prevent seg faults for tree forkjoin barrier 5767 * (GEH) */ 5768 ANNOTATE_HAPPENS_BEFORE(thread); 5769 kmp_flag_64 flag(&thread->th.th_bar[bs_forkjoin_barrier].bb.b_go, thread); 5770 __kmp_release_64(&flag); 5771 } 5772 5773 // Terminate OS thread. 5774 __kmp_reap_worker(thread); 5775 5776 // The thread was killed asynchronously. If it was actively 5777 // spinning in the thread pool, decrement the global count. 5778 // 5779 // There is a small timing hole here - if the worker thread was just waking 5780 // up after sleeping in the pool, had reset it's th_active_in_pool flag but 5781 // not decremented the global counter __kmp_thread_pool_active_nth yet, then 5782 // the global counter might not get updated. 5783 // 5784 // Currently, this can only happen as the library is unloaded, 5785 // so there are no harmful side effects. 5786 if (thread->th.th_active_in_pool) { 5787 thread->th.th_active_in_pool = FALSE; 5788 KMP_ATOMIC_DEC(&__kmp_thread_pool_active_nth); 5789 KMP_DEBUG_ASSERT(__kmp_thread_pool_active_nth >= 0); 5790 } 5791 5792 // Decrement # of [worker] threads in the pool. 5793 KMP_DEBUG_ASSERT(__kmp_thread_pool_nth > 0); 5794 --__kmp_thread_pool_nth; 5795 } 5796 5797 __kmp_free_implicit_task(thread); 5798 5799 // Free the fast memory for tasking 5800 #if USE_FAST_MEMORY 5801 __kmp_free_fast_memory(thread); 5802 #endif /* USE_FAST_MEMORY */ 5803 5804 __kmp_suspend_uninitialize_thread(thread); 5805 5806 KMP_DEBUG_ASSERT(__kmp_threads[gtid] == thread); 5807 TCW_SYNC_PTR(__kmp_threads[gtid], NULL); 5808 5809 --__kmp_all_nth; 5810 // __kmp_nth was decremented when thread is added to the pool. 
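  // NOTE (added commentary, not upstream code): counter bookkeeping on this
  // path -- __kmp_nth is decremented in __kmp_free_thread() when the worker
  // enters the pool, __kmp_thread_pool_nth is incremented there and
  // decremented above for non-root threads, and __kmp_all_nth is decremented
  // here once the thread is actually reaped.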
5811 5812 #ifdef KMP_ADJUST_BLOCKTIME 5813 /* Adjust blocktime back to user setting or default if necessary */ 5814 /* Middle initialization might never have occurred */ 5815 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 5816 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 5817 if (__kmp_nth <= __kmp_avail_proc) { 5818 __kmp_zero_bt = FALSE; 5819 } 5820 } 5821 #endif /* KMP_ADJUST_BLOCKTIME */ 5822 5823 /* free the memory being used */ 5824 if (__kmp_env_consistency_check) { 5825 if (thread->th.th_cons) { 5826 __kmp_free_cons_stack(thread->th.th_cons); 5827 thread->th.th_cons = NULL; 5828 } 5829 } 5830 5831 if (thread->th.th_pri_common != NULL) { 5832 __kmp_free(thread->th.th_pri_common); 5833 thread->th.th_pri_common = NULL; 5834 } 5835 5836 if (thread->th.th_task_state_memo_stack != NULL) { 5837 __kmp_free(thread->th.th_task_state_memo_stack); 5838 thread->th.th_task_state_memo_stack = NULL; 5839 } 5840 5841 #if KMP_USE_BGET 5842 if (thread->th.th_local.bget_data != NULL) { 5843 __kmp_finalize_bget(thread); 5844 } 5845 #endif 5846 5847 #if KMP_AFFINITY_SUPPORTED 5848 if (thread->th.th_affin_mask != NULL) { 5849 KMP_CPU_FREE(thread->th.th_affin_mask); 5850 thread->th.th_affin_mask = NULL; 5851 } 5852 #endif /* KMP_AFFINITY_SUPPORTED */ 5853 5854 #if KMP_USE_HIER_SCHED 5855 if (thread->th.th_hier_bar_data != NULL) { 5856 __kmp_free(thread->th.th_hier_bar_data); 5857 thread->th.th_hier_bar_data = NULL; 5858 } 5859 #endif 5860 5861 __kmp_reap_team(thread->th.th_serial_team); 5862 thread->th.th_serial_team = NULL; 5863 __kmp_free(thread); 5864 5865 KMP_MB(); 5866 5867 } // __kmp_reap_thread 5868 5869 static void __kmp_internal_end(void) { 5870 int i; 5871 5872 /* First, unregister the library */ 5873 __kmp_unregister_library(); 5874 5875 #if KMP_OS_WINDOWS 5876 /* In Win static library, we can't tell when a root actually dies, so we 5877 reclaim the data structures for any root threads that have died but not 5878 unregistered themselves, in order to shut down cleanly. 5879 In Win dynamic library we also can't tell when a thread dies. */ 5880 __kmp_reclaim_dead_roots(); // AC: moved here to always clean resources of 5881 // dead roots 5882 #endif 5883 5884 for (i = 0; i < __kmp_threads_capacity; i++) 5885 if (__kmp_root[i]) 5886 if (__kmp_root[i]->r.r_active) 5887 break; 5888 KMP_MB(); /* Flush all pending memory write invalidates. */ 5889 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 5890 5891 if (i < __kmp_threads_capacity) { 5892 #if KMP_USE_MONITOR 5893 // 2009-09-08 (lev): Other alive roots found. Why do we kill the monitor?? 5894 KMP_MB(); /* Flush all pending memory write invalidates. */ 5895 5896 // Need to check that monitor was initialized before reaping it. If we are 5897 // called form __kmp_atfork_child (which sets __kmp_init_parallel = 0), then 5898 // __kmp_monitor will appear to contain valid data, but it is only valid in 5899 // the parent process, not the child. 5900 // New behavior (201008): instead of keying off of the flag 5901 // __kmp_init_parallel, the monitor thread creation is keyed off 5902 // of the new flag __kmp_init_monitor. 
5903 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 5904 if (TCR_4(__kmp_init_monitor)) { 5905 __kmp_reap_monitor(&__kmp_monitor); 5906 TCW_4(__kmp_init_monitor, 0); 5907 } 5908 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 5909 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 5910 #endif // KMP_USE_MONITOR 5911 } else { 5912 /* TODO move this to cleanup code */ 5913 #ifdef KMP_DEBUG 5914 /* make sure that everything has properly ended */ 5915 for (i = 0; i < __kmp_threads_capacity; i++) { 5916 if (__kmp_root[i]) { 5917 // KMP_ASSERT( ! KMP_UBER_GTID( i ) ); // AC: 5918 // there can be uber threads alive here 5919 KMP_ASSERT(!__kmp_root[i]->r.r_active); // TODO: can they be active? 5920 } 5921 } 5922 #endif 5923 5924 KMP_MB(); 5925 5926 // Reap the worker threads. 5927 // This is valid for now, but be careful if threads are reaped sooner. 5928 while (__kmp_thread_pool != NULL) { // Loop thru all the thread in the pool. 5929 // Get the next thread from the pool. 5930 kmp_info_t *thread = CCAST(kmp_info_t *, __kmp_thread_pool); 5931 __kmp_thread_pool = thread->th.th_next_pool; 5932 // Reap it. 5933 KMP_DEBUG_ASSERT(thread->th.th_reap_state == KMP_SAFE_TO_REAP); 5934 thread->th.th_next_pool = NULL; 5935 thread->th.th_in_pool = FALSE; 5936 __kmp_reap_thread(thread, 0); 5937 } 5938 __kmp_thread_pool_insert_pt = NULL; 5939 5940 // Reap teams. 5941 while (__kmp_team_pool != NULL) { // Loop thru all the teams in the pool. 5942 // Get the next team from the pool. 5943 kmp_team_t *team = CCAST(kmp_team_t *, __kmp_team_pool); 5944 __kmp_team_pool = team->t.t_next_pool; 5945 // Reap it. 5946 team->t.t_next_pool = NULL; 5947 __kmp_reap_team(team); 5948 } 5949 5950 __kmp_reap_task_teams(); 5951 5952 #if KMP_OS_UNIX 5953 // Threads that are not reaped should not access any resources since they 5954 // are going to be deallocated soon, so the shutdown sequence should wait 5955 // until all threads either exit the final spin-waiting loop or begin 5956 // sleeping after the given blocktime. 5957 for (i = 0; i < __kmp_threads_capacity; i++) { 5958 kmp_info_t *thr = __kmp_threads[i]; 5959 while (thr && KMP_ATOMIC_LD_ACQ(&thr->th.th_blocking)) 5960 KMP_CPU_PAUSE(); 5961 } 5962 #endif 5963 5964 for (i = 0; i < __kmp_threads_capacity; ++i) { 5965 // TBD: Add some checking... 5966 // Something like KMP_DEBUG_ASSERT( __kmp_thread[ i ] == NULL ); 5967 } 5968 5969 /* Make sure all threadprivate destructors get run by joining with all 5970 worker threads before resetting this flag */ 5971 TCW_SYNC_4(__kmp_init_common, FALSE); 5972 5973 KA_TRACE(10, ("__kmp_internal_end: all workers reaped\n")); 5974 KMP_MB(); 5975 5976 #if KMP_USE_MONITOR 5977 // See note above: One of the possible fixes for CQ138434 / CQ140126 5978 // 5979 // FIXME: push both code fragments down and CSE them? 5980 // push them into __kmp_cleanup() ? 5981 __kmp_acquire_bootstrap_lock(&__kmp_monitor_lock); 5982 if (TCR_4(__kmp_init_monitor)) { 5983 __kmp_reap_monitor(&__kmp_monitor); 5984 TCW_4(__kmp_init_monitor, 0); 5985 } 5986 __kmp_release_bootstrap_lock(&__kmp_monitor_lock); 5987 KA_TRACE(10, ("__kmp_internal_end: monitor reaped\n")); 5988 #endif 5989 } /* else !__kmp_global.t_active */ 5990 TCW_4(__kmp_init_gtid, FALSE); 5991 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 5992 5993 __kmp_cleanup(); 5994 #if OMPT_SUPPORT 5995 ompt_fini(); 5996 #endif 5997 } 5998 5999 void __kmp_internal_end_library(int gtid_req) { 6000 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6001 /* this shouldn't be a race condition because __kmp_internal_end() is the 6002 only place to clear __kmp_serial_init */ 6003 /* we'll check this later too, after we get the lock */ 6004 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6005 // redundaant, because the next check will work in any case. 6006 if (__kmp_global.g.g_abort) { 6007 KA_TRACE(11, ("__kmp_internal_end_library: abort, exiting\n")); 6008 /* TODO abort? */ 6009 return; 6010 } 6011 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6012 KA_TRACE(10, ("__kmp_internal_end_library: already finished\n")); 6013 return; 6014 } 6015 6016 KMP_MB(); /* Flush all pending memory write invalidates. */ 6017 6018 /* find out who we are and what we should do */ 6019 { 6020 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6021 KA_TRACE( 6022 10, ("__kmp_internal_end_library: enter T#%d (%d)\n", gtid, gtid_req)); 6023 if (gtid == KMP_GTID_SHUTDOWN) { 6024 KA_TRACE(10, ("__kmp_internal_end_library: !__kmp_init_runtime, system " 6025 "already shutdown\n")); 6026 return; 6027 } else if (gtid == KMP_GTID_MONITOR) { 6028 KA_TRACE(10, ("__kmp_internal_end_library: monitor thread, gtid not " 6029 "registered, or system shutdown\n")); 6030 return; 6031 } else if (gtid == KMP_GTID_DNE) { 6032 KA_TRACE(10, ("__kmp_internal_end_library: gtid not registered or system " 6033 "shutdown\n")); 6034 /* we don't know who we are, but we may still shutdown the library */ 6035 } else if (KMP_UBER_GTID(gtid)) { 6036 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6037 if (__kmp_root[gtid]->r.r_active) { 6038 __kmp_global.g.g_abort = -1; 6039 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6040 KA_TRACE(10, 6041 ("__kmp_internal_end_library: root still active, abort T#%d\n", 6042 gtid)); 6043 return; 6044 } else { 6045 KA_TRACE( 6046 10, 6047 ("__kmp_internal_end_library: unregistering sibling T#%d\n", gtid)); 6048 __kmp_unregister_root_current_thread(gtid); 6049 } 6050 } else { 6051 /* worker threads may call this function through the atexit handler, if they 6052 * call exit() */ 6053 /* For now, skip the usual subsequent processing and just dump the debug buffer. 6054 TODO: do a thorough shutdown instead */ 6055 #ifdef DUMP_DEBUG_ON_EXIT 6056 if (__kmp_debug_buf) 6057 __kmp_dump_debug_buffer(); 6058 #endif 6059 return; 6060 } 6061 } 6062 /* synchronize the termination process */ 6063 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6064 6065 /* have we already finished */ 6066 if (__kmp_global.g.g_abort) { 6067 KA_TRACE(10, ("__kmp_internal_end_library: abort, exiting\n")); 6068 /* TODO abort? */ 6069 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6070 return; 6071 } 6072 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6073 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6074 return; 6075 } 6076 6077 /* We need this lock to enforce mutex between this reading of 6078 __kmp_threads_capacity and the writing by __kmp_register_root. 6079 Alternatively, we can use a counter of roots that is atomically updated by 6080 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6081 __kmp_internal_end_*. 
*/ 6082 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6083 6084 /* now we can safely conduct the actual termination */ 6085 __kmp_internal_end(); 6086 6087 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6088 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6089 6090 KA_TRACE(10, ("__kmp_internal_end_library: exit\n")); 6091 6092 #ifdef DUMP_DEBUG_ON_EXIT 6093 if (__kmp_debug_buf) 6094 __kmp_dump_debug_buffer(); 6095 #endif 6096 6097 #if KMP_OS_WINDOWS 6098 __kmp_close_console(); 6099 #endif 6100 6101 __kmp_fini_allocator(); 6102 6103 } // __kmp_internal_end_library 6104 6105 void __kmp_internal_end_thread(int gtid_req) { 6106 int i; 6107 6108 /* if we have already cleaned up, don't try again, it wouldn't be pretty */ 6109 /* this shouldn't be a race condition because __kmp_internal_end() is the 6110 * only place to clear __kmp_serial_init */ 6111 /* we'll check this later too, after we get the lock */ 6112 // 2009-09-06: We do not set g_abort without setting g_done. This check looks 6113 // redundant, because the next check will work in any case. 6114 if (__kmp_global.g.g_abort) { 6115 KA_TRACE(11, ("__kmp_internal_end_thread: abort, exiting\n")); 6116 /* TODO abort? */ 6117 return; 6118 } 6119 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6120 KA_TRACE(10, ("__kmp_internal_end_thread: already finished\n")); 6121 return; 6122 } 6123 6124 KMP_MB(); /* Flush all pending memory write invalidates. */ 6125 6126 /* find out who we are and what we should do */ 6127 { 6128 int gtid = (gtid_req >= 0) ? gtid_req : __kmp_gtid_get_specific(); 6129 KA_TRACE(10, 6130 ("__kmp_internal_end_thread: enter T#%d (%d)\n", gtid, gtid_req)); 6131 if (gtid == KMP_GTID_SHUTDOWN) { 6132 KA_TRACE(10, ("__kmp_internal_end_thread: !__kmp_init_runtime, system " 6133 "already shutdown\n")); 6134 return; 6135 } else if (gtid == KMP_GTID_MONITOR) { 6136 KA_TRACE(10, ("__kmp_internal_end_thread: monitor thread, gtid not " 6137 "registered, or system shutdown\n")); 6138 return; 6139 } else if (gtid == KMP_GTID_DNE) { 6140 KA_TRACE(10, ("__kmp_internal_end_thread: gtid not registered or system " 6141 "shutdown\n")); 6142 return; 6143 /* we don't know who we are */ 6144 } else if (KMP_UBER_GTID(gtid)) { 6145 /* unregister ourselves as an uber thread. gtid is no longer valid */ 6146 if (__kmp_root[gtid]->r.r_active) { 6147 __kmp_global.g.g_abort = -1; 6148 TCW_SYNC_4(__kmp_global.g.g_done, TRUE); 6149 KA_TRACE(10, 6150 ("__kmp_internal_end_thread: root still active, abort T#%d\n", 6151 gtid)); 6152 return; 6153 } else { 6154 KA_TRACE(10, ("__kmp_internal_end_thread: unregistering sibling T#%d\n", 6155 gtid)); 6156 __kmp_unregister_root_current_thread(gtid); 6157 } 6158 } else { 6159 /* just a worker thread, let's leave */ 6160 KA_TRACE(10, ("__kmp_internal_end_thread: worker thread T#%d\n", gtid)); 6161 6162 if (gtid >= 0) { 6163 __kmp_threads[gtid]->th.th_task_team = NULL; 6164 } 6165 6166 KA_TRACE(10, 6167 ("__kmp_internal_end_thread: worker thread done, exiting T#%d\n", 6168 gtid)); 6169 return; 6170 } 6171 } 6172 #if defined KMP_DYNAMIC_LIB 6173 // AC: lets not shutdown the Linux* OS dynamic library at the exit of uber 6174 // thread, because we will better shutdown later in the library destructor. 6175 // The reason of this change is performance problem when non-openmp thread in 6176 // a loop forks and joins many openmp threads. We can save a lot of time 6177 // keeping worker threads alive until the program shutdown. 
6178 // OM: Removed Linux* OS restriction to fix the crash on OS X* (DPD200239966) 6179 // and Windows(DPD200287443) that occurs when using critical sections from 6180 // foreign threads. 6181 KA_TRACE(10, ("__kmp_internal_end_thread: exiting T#%d\n", gtid_req)); 6182 return; 6183 #endif 6184 /* synchronize the termination process */ 6185 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6186 6187 /* have we already finished */ 6188 if (__kmp_global.g.g_abort) { 6189 KA_TRACE(10, ("__kmp_internal_end_thread: abort, exiting\n")); 6190 /* TODO abort? */ 6191 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6192 return; 6193 } 6194 if (TCR_4(__kmp_global.g.g_done) || !__kmp_init_serial) { 6195 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6196 return; 6197 } 6198 6199 /* We need this lock to enforce mutex between this reading of 6200 __kmp_threads_capacity and the writing by __kmp_register_root. 6201 Alternatively, we can use a counter of roots that is atomically updated by 6202 __kmp_get_global_thread_id_reg, __kmp_do_serial_initialize and 6203 __kmp_internal_end_*. */ 6204 6205 /* should we finish the run-time? are all siblings done? */ 6206 __kmp_acquire_bootstrap_lock(&__kmp_forkjoin_lock); 6207 6208 for (i = 0; i < __kmp_threads_capacity; ++i) { 6209 if (KMP_UBER_GTID(i)) { 6210 KA_TRACE( 6211 10, 6212 ("__kmp_internal_end_thread: remaining sibling task: gtid==%d\n", i)); 6213 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6214 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6215 return; 6216 } 6217 } 6218 6219 /* now we can safely conduct the actual termination */ 6220 6221 __kmp_internal_end(); 6222 6223 __kmp_release_bootstrap_lock(&__kmp_forkjoin_lock); 6224 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6225 6226 KA_TRACE(10, ("__kmp_internal_end_thread: exit T#%d\n", gtid_req)); 6227 6228 #ifdef DUMP_DEBUG_ON_EXIT 6229 if (__kmp_debug_buf) 6230 __kmp_dump_debug_buffer(); 6231 #endif 6232 } // __kmp_internal_end_thread 6233 6234 // ----------------------------------------------------------------------------- 6235 // Library registration stuff. 6236 6237 static long __kmp_registration_flag = 0; 6238 // Random value used to indicate library initialization. 6239 static char *__kmp_registration_str = NULL; 6240 // Value to be saved in env var __KMP_REGISTERED_LIB_<pid>. 6241 6242 static inline char *__kmp_reg_status_name() { 6243 /* On RHEL 3u5 if linked statically, getpid() returns different values in 6244 each thread. If registration and unregistration go in different threads 6245 (omp_misc_other_root_exit.cpp test case), the name of registered_lib_env 6246 env var can not be found, because the name will contain different pid. */ 6247 return __kmp_str_format("__KMP_REGISTERED_LIB_%d", (int)getpid()); 6248 } // __kmp_reg_status_get 6249 6250 void __kmp_register_library_startup(void) { 6251 6252 char *name = __kmp_reg_status_name(); // Name of the environment variable. 
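  // NOTE (illustrative, not upstream code): the value stored in this variable
  // below is formatted as "%p-%lx-%s" -- the address of
  // __kmp_registration_flag, its value, and the library file name -- so a
  // hypothetical entry could look like
  //
  //   __KMP_REGISTERED_LIB_12345=0x7f3a2c0010a0-cafe1234-libomp.so
  //
  // (pid, address and flag value here are made up). A second copy of the
  // runtime in the same process splits this string on '-' and probes the
  // encoded address to decide whether the registering library is still alive.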
  int done = 0;
  union {
    double dtime;
    long ltime;
  } time;
#if KMP_ARCH_X86 || KMP_ARCH_X86_64
  __kmp_initialize_system_tick();
#endif
  __kmp_read_system_time(&time.dtime);
  __kmp_registration_flag = 0xCAFE0000L | (time.ltime & 0x0000FFFFL);
  __kmp_registration_str =
      __kmp_str_format("%p-%lx-%s", &__kmp_registration_flag,
                       __kmp_registration_flag, KMP_LIBRARY_FILE);

  KA_TRACE(50, ("__kmp_register_library_startup: %s=\"%s\"\n", name,
                __kmp_registration_str));

  while (!done) {

    char *value = NULL; // Actual value of the environment variable.

    // Set the environment variable, but do not overwrite it if it already
    // exists.
    __kmp_env_set(name, __kmp_registration_str, 0);
    // Check that the variable was actually written.
    value = __kmp_env_get(name);
    if (value != NULL && strcmp(value, __kmp_registration_str) == 0) {

      done = 1; // Ok, environment variable set successfully, exit the loop.

    } else {

      // Oops. Write failed. Another copy of OpenMP RTL is in memory.
      // Check whether it is alive or dead.
      int neighbor = 0; // 0 -- unknown status, 1 -- alive, 2 -- dead.
      char *tail = value;
      char *flag_addr_str = NULL;
      char *flag_val_str = NULL;
      char const *file_name = NULL;
      __kmp_str_split(tail, '-', &flag_addr_str, &tail);
      __kmp_str_split(tail, '-', &flag_val_str, &tail);
      file_name = tail;
      if (tail != NULL) {
        long *flag_addr = 0;
        long flag_val = 0;
        KMP_SSCANF(flag_addr_str, "%p", RCAST(void **, &flag_addr));
        KMP_SSCANF(flag_val_str, "%lx", &flag_val);
        if (flag_addr != 0 && flag_val != 0 && strcmp(file_name, "") != 0) {
          // First, check whether the environment-encoded address is mapped
          // into the address space.
          // If so, dereference it to see if it still has the right value.
          if (__kmp_is_address_mapped(flag_addr) && *flag_addr == flag_val) {
            neighbor = 1;
          } else {
            // If not, then we know the other copy of the library is no longer
            // running.
            neighbor = 2;
          }
        }
      }
      switch (neighbor) {
      case 0: // Cannot parse environment variable -- neighbor status unknown.
        // Assume it is an incompatible format from a future version of the
        // library. Assume the other library is alive.
        // WARN( ... ); // TODO: Issue a warning.
        file_name = "unknown library";
        // Intentional fall-through to the next case.
      case 1: { // Neighbor is alive.
        // Check whether duplicates are allowed.
        char *duplicate_ok = __kmp_env_get("KMP_DUPLICATE_LIB_OK");
        if (!__kmp_str_match_true(duplicate_ok)) {
          // That's not allowed. Issue fatal error.
          __kmp_fatal(KMP_MSG(DuplicateLibrary, KMP_LIBRARY_FILE, file_name),
                      KMP_HNT(DuplicateLibrary), __kmp_msg_null);
        }
        KMP_INTERNAL_FREE(duplicate_ok);
        __kmp_duplicate_library_ok = 1;
        done = 1; // Exit the loop.
      } break;
      case 2: { // Neighbor is dead.
        // Clear the variable and try to register the library again.
6333 __kmp_env_unset(name); 6334 } break; 6335 default: { KMP_DEBUG_ASSERT(0); } break; 6336 } 6337 } 6338 KMP_INTERNAL_FREE((void *)value); 6339 } 6340 KMP_INTERNAL_FREE((void *)name); 6341 6342 } // func __kmp_register_library_startup 6343 6344 void __kmp_unregister_library(void) { 6345 6346 char *name = __kmp_reg_status_name(); 6347 char *value = __kmp_env_get(name); 6348 6349 KMP_DEBUG_ASSERT(__kmp_registration_flag != 0); 6350 KMP_DEBUG_ASSERT(__kmp_registration_str != NULL); 6351 if (value != NULL && strcmp(value, __kmp_registration_str) == 0) { 6352 // Ok, this is our variable. Delete it. 6353 __kmp_env_unset(name); 6354 } 6355 6356 KMP_INTERNAL_FREE(__kmp_registration_str); 6357 KMP_INTERNAL_FREE(value); 6358 KMP_INTERNAL_FREE(name); 6359 6360 __kmp_registration_flag = 0; 6361 __kmp_registration_str = NULL; 6362 6363 } // __kmp_unregister_library 6364 6365 // End of Library registration stuff. 6366 // ----------------------------------------------------------------------------- 6367 6368 #if KMP_MIC_SUPPORTED 6369 6370 static void __kmp_check_mic_type() { 6371 kmp_cpuid_t cpuid_state = {0}; 6372 kmp_cpuid_t *cs_p = &cpuid_state; 6373 __kmp_x86_cpuid(1, 0, cs_p); 6374 // We don't support mic1 at the moment 6375 if ((cs_p->eax & 0xff0) == 0xB10) { 6376 __kmp_mic_type = mic2; 6377 } else if ((cs_p->eax & 0xf0ff0) == 0x50670) { 6378 __kmp_mic_type = mic3; 6379 } else { 6380 __kmp_mic_type = non_mic; 6381 } 6382 } 6383 6384 #endif /* KMP_MIC_SUPPORTED */ 6385 6386 static void __kmp_do_serial_initialize(void) { 6387 int i, gtid; 6388 int size; 6389 6390 KA_TRACE(10, ("__kmp_do_serial_initialize: enter\n")); 6391 6392 KMP_DEBUG_ASSERT(sizeof(kmp_int32) == 4); 6393 KMP_DEBUG_ASSERT(sizeof(kmp_uint32) == 4); 6394 KMP_DEBUG_ASSERT(sizeof(kmp_int64) == 8); 6395 KMP_DEBUG_ASSERT(sizeof(kmp_uint64) == 8); 6396 KMP_DEBUG_ASSERT(sizeof(kmp_intptr_t) == sizeof(void *)); 6397 6398 #if OMPT_SUPPORT 6399 ompt_pre_init(); 6400 #endif 6401 6402 __kmp_validate_locks(); 6403 6404 /* Initialize internal memory allocator */ 6405 __kmp_init_allocator(); 6406 6407 /* Register the library startup via an environment variable and check to see 6408 whether another copy of the library is already registered. 
*/ 6409 6410 __kmp_register_library_startup(); 6411 6412 /* TODO reinitialization of library */ 6413 if (TCR_4(__kmp_global.g.g_done)) { 6414 KA_TRACE(10, ("__kmp_do_serial_initialize: reinitialization of library\n")); 6415 } 6416 6417 __kmp_global.g.g_abort = 0; 6418 TCW_SYNC_4(__kmp_global.g.g_done, FALSE); 6419 6420 /* initialize the locks */ 6421 #if KMP_USE_ADAPTIVE_LOCKS 6422 #if KMP_DEBUG_ADAPTIVE_LOCKS 6423 __kmp_init_speculative_stats(); 6424 #endif 6425 #endif 6426 #if KMP_STATS_ENABLED 6427 __kmp_stats_init(); 6428 #endif 6429 __kmp_init_lock(&__kmp_global_lock); 6430 __kmp_init_queuing_lock(&__kmp_dispatch_lock); 6431 __kmp_init_lock(&__kmp_debug_lock); 6432 __kmp_init_atomic_lock(&__kmp_atomic_lock); 6433 __kmp_init_atomic_lock(&__kmp_atomic_lock_1i); 6434 __kmp_init_atomic_lock(&__kmp_atomic_lock_2i); 6435 __kmp_init_atomic_lock(&__kmp_atomic_lock_4i); 6436 __kmp_init_atomic_lock(&__kmp_atomic_lock_4r); 6437 __kmp_init_atomic_lock(&__kmp_atomic_lock_8i); 6438 __kmp_init_atomic_lock(&__kmp_atomic_lock_8r); 6439 __kmp_init_atomic_lock(&__kmp_atomic_lock_8c); 6440 __kmp_init_atomic_lock(&__kmp_atomic_lock_10r); 6441 __kmp_init_atomic_lock(&__kmp_atomic_lock_16r); 6442 __kmp_init_atomic_lock(&__kmp_atomic_lock_16c); 6443 __kmp_init_atomic_lock(&__kmp_atomic_lock_20c); 6444 __kmp_init_atomic_lock(&__kmp_atomic_lock_32c); 6445 __kmp_init_bootstrap_lock(&__kmp_forkjoin_lock); 6446 __kmp_init_bootstrap_lock(&__kmp_exit_lock); 6447 #if KMP_USE_MONITOR 6448 __kmp_init_bootstrap_lock(&__kmp_monitor_lock); 6449 #endif 6450 __kmp_init_bootstrap_lock(&__kmp_tp_cached_lock); 6451 6452 /* conduct initialization and initial setup of configuration */ 6453 6454 __kmp_runtime_initialize(); 6455 6456 #if KMP_MIC_SUPPORTED 6457 __kmp_check_mic_type(); 6458 #endif 6459 6460 // Some global variable initialization moved here from kmp_env_initialize() 6461 #ifdef KMP_DEBUG 6462 kmp_diag = 0; 6463 #endif 6464 __kmp_abort_delay = 0; 6465 6466 // From __kmp_init_dflt_team_nth() 6467 /* assume the entire machine will be used */ 6468 __kmp_dflt_team_nth_ub = __kmp_xproc; 6469 if (__kmp_dflt_team_nth_ub < KMP_MIN_NTH) { 6470 __kmp_dflt_team_nth_ub = KMP_MIN_NTH; 6471 } 6472 if (__kmp_dflt_team_nth_ub > __kmp_sys_max_nth) { 6473 __kmp_dflt_team_nth_ub = __kmp_sys_max_nth; 6474 } 6475 __kmp_max_nth = __kmp_sys_max_nth; 6476 __kmp_cg_max_nth = __kmp_sys_max_nth; 6477 __kmp_teams_max_nth = __kmp_xproc; // set a "reasonable" default 6478 if (__kmp_teams_max_nth > __kmp_sys_max_nth) { 6479 __kmp_teams_max_nth = __kmp_sys_max_nth; 6480 } 6481 6482 // Three vars below moved here from __kmp_env_initialize() "KMP_BLOCKTIME" 6483 // part 6484 __kmp_dflt_blocktime = KMP_DEFAULT_BLOCKTIME; 6485 #if KMP_USE_MONITOR 6486 __kmp_monitor_wakeups = 6487 KMP_WAKEUPS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6488 __kmp_bt_intervals = 6489 KMP_INTERVALS_FROM_BLOCKTIME(__kmp_dflt_blocktime, __kmp_monitor_wakeups); 6490 #endif 6491 // From "KMP_LIBRARY" part of __kmp_env_initialize() 6492 __kmp_library = library_throughput; 6493 // From KMP_SCHEDULE initialization 6494 __kmp_static = kmp_sch_static_balanced; 6495 // AC: do not use analytical here, because it is non-monotonous 6496 //__kmp_guided = kmp_sch_guided_iterative_chunked; 6497 //__kmp_auto = kmp_sch_guided_analytical_chunked; // AC: it is the default, no 6498 // need to repeat assignment 6499 // Barrier initialization. 
Moved here from __kmp_env_initialize() Barrier branch 6500 // bit control and barrier method control parts 6501 #if KMP_FAST_REDUCTION_BARRIER 6502 #define kmp_reduction_barrier_gather_bb ((int)1) 6503 #define kmp_reduction_barrier_release_bb ((int)1) 6504 #define kmp_reduction_barrier_gather_pat bp_hyper_bar 6505 #define kmp_reduction_barrier_release_pat bp_hyper_bar 6506 #endif // KMP_FAST_REDUCTION_BARRIER 6507 for (i = bs_plain_barrier; i < bs_last_barrier; i++) { 6508 __kmp_barrier_gather_branch_bits[i] = __kmp_barrier_gather_bb_dflt; 6509 __kmp_barrier_release_branch_bits[i] = __kmp_barrier_release_bb_dflt; 6510 __kmp_barrier_gather_pattern[i] = __kmp_barrier_gather_pat_dflt; 6511 __kmp_barrier_release_pattern[i] = __kmp_barrier_release_pat_dflt; 6512 #if KMP_FAST_REDUCTION_BARRIER 6513 if (i == bs_reduction_barrier) { // tested and confirmed on ALTIX only ( 6514 // lin_64 ): hyper,1 6515 __kmp_barrier_gather_branch_bits[i] = kmp_reduction_barrier_gather_bb; 6516 __kmp_barrier_release_branch_bits[i] = kmp_reduction_barrier_release_bb; 6517 __kmp_barrier_gather_pattern[i] = kmp_reduction_barrier_gather_pat; 6518 __kmp_barrier_release_pattern[i] = kmp_reduction_barrier_release_pat; 6519 } 6520 #endif // KMP_FAST_REDUCTION_BARRIER 6521 } 6522 #if KMP_FAST_REDUCTION_BARRIER 6523 #undef kmp_reduction_barrier_release_pat 6524 #undef kmp_reduction_barrier_gather_pat 6525 #undef kmp_reduction_barrier_release_bb 6526 #undef kmp_reduction_barrier_gather_bb 6527 #endif // KMP_FAST_REDUCTION_BARRIER 6528 #if KMP_MIC_SUPPORTED 6529 if (__kmp_mic_type == mic2) { // KNC 6530 // AC: plane=3,2, forkjoin=2,1 are optimal for 240 threads on KNC 6531 __kmp_barrier_gather_branch_bits[bs_plain_barrier] = 3; // plain gather 6532 __kmp_barrier_release_branch_bits[bs_forkjoin_barrier] = 6533 1; // forkjoin release 6534 __kmp_barrier_gather_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6535 __kmp_barrier_release_pattern[bs_forkjoin_barrier] = bp_hierarchical_bar; 6536 } 6537 #if KMP_FAST_REDUCTION_BARRIER 6538 if (__kmp_mic_type == mic2) { // KNC 6539 __kmp_barrier_gather_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6540 __kmp_barrier_release_pattern[bs_reduction_barrier] = bp_hierarchical_bar; 6541 } 6542 #endif // KMP_FAST_REDUCTION_BARRIER 6543 #endif // KMP_MIC_SUPPORTED 6544 6545 // From KMP_CHECKS initialization 6546 #ifdef KMP_DEBUG 6547 __kmp_env_checks = TRUE; /* development versions have the extra checks */ 6548 #else 6549 __kmp_env_checks = FALSE; /* port versions do not have the extra checks */ 6550 #endif 6551 6552 // From "KMP_FOREIGN_THREADS_THREADPRIVATE" initialization 6553 __kmp_foreign_tp = TRUE; 6554 6555 __kmp_global.g.g_dynamic = FALSE; 6556 __kmp_global.g.g_dynamic_mode = dynamic_default; 6557 6558 __kmp_env_initialize(NULL); 6559 6560 // Print all messages in message catalog for testing purposes. 
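// The gather/release "branch bits" defaulted just above are used by the
// tree-style barrier algorithms as a power-of-two branching factor, so a
// branch-bits value of b yields a fan-out of 2^b per level. A standalone
// illustration of how fan-out and tree depth follow from the branch bits
// (excluded from the build; the local names here are hypothetical examples,
// not runtime state):
#if 0
  {
    int branch_bits = 2;           // e.g. a default of 2 ...
    int fanout = 1 << branch_bits; // ... gives a 4-ary gather/release tree
    int nthreads = 64;
    int depth = 0;
    for (int span = 1; span < nthreads; span *= fanout)
      ++depth; // 64 threads with fan-out 4 need a 3-level tree
  }
#endif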
6561 #ifdef KMP_DEBUG 6562 char const *val = __kmp_env_get("KMP_DUMP_CATALOG"); 6563 if (__kmp_str_match_true(val)) { 6564 kmp_str_buf_t buffer; 6565 __kmp_str_buf_init(&buffer); 6566 __kmp_i18n_dump_catalog(&buffer); 6567 __kmp_printf("%s", buffer.str); 6568 __kmp_str_buf_free(&buffer); 6569 } 6570 __kmp_env_free(&val); 6571 #endif 6572 6573 __kmp_threads_capacity = 6574 __kmp_initial_threads_capacity(__kmp_dflt_team_nth_ub); 6575 // Moved here from __kmp_env_initialize() "KMP_ALL_THREADPRIVATE" part 6576 __kmp_tp_capacity = __kmp_default_tp_capacity( 6577 __kmp_dflt_team_nth_ub, __kmp_max_nth, __kmp_allThreadsSpecified); 6578 6579 // If the library is shut down properly, both pools must be NULL. Just in 6580 // case, set them to NULL -- some memory may leak, but subsequent code will 6581 // work even if pools are not freed. 6582 KMP_DEBUG_ASSERT(__kmp_thread_pool == NULL); 6583 KMP_DEBUG_ASSERT(__kmp_thread_pool_insert_pt == NULL); 6584 KMP_DEBUG_ASSERT(__kmp_team_pool == NULL); 6585 __kmp_thread_pool = NULL; 6586 __kmp_thread_pool_insert_pt = NULL; 6587 __kmp_team_pool = NULL; 6588 6589 /* Allocate all of the variable sized records */ 6590 /* NOTE: __kmp_threads_capacity entries are allocated, but the arrays are 6591 * expandable */ 6592 /* Since allocation is cache-aligned, just add extra padding at the end */ 6593 size = 6594 (sizeof(kmp_info_t *) + sizeof(kmp_root_t *)) * __kmp_threads_capacity + 6595 CACHE_LINE; 6596 __kmp_threads = (kmp_info_t **)__kmp_allocate(size); 6597 __kmp_root = (kmp_root_t **)((char *)__kmp_threads + 6598 sizeof(kmp_info_t *) * __kmp_threads_capacity); 6599 6600 /* init thread counts */ 6601 KMP_DEBUG_ASSERT(__kmp_all_nth == 6602 0); // Asserts fail if the library is reinitializing and 6603 KMP_DEBUG_ASSERT(__kmp_nth == 0); // something was wrong in termination. 6604 __kmp_all_nth = 0; 6605 __kmp_nth = 0; 6606 6607 /* setup the uber master thread and hierarchy */ 6608 gtid = __kmp_register_root(TRUE); 6609 KA_TRACE(10, ("__kmp_do_serial_initialize T#%d\n", gtid)); 6610 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6611 KMP_ASSERT(KMP_INITIAL_GTID(gtid)); 6612 6613 KMP_MB(); /* Flush all pending memory write invalidates. */ 6614 6615 __kmp_common_initialize(); 6616 6617 #if KMP_OS_UNIX 6618 /* invoke the child fork handler */ 6619 __kmp_register_atfork(); 6620 #endif 6621 6622 #if !defined KMP_DYNAMIC_LIB 6623 { 6624 /* Invoke the exit handler when the program finishes, only for static 6625 library. For dynamic library, we already have _fini and DllMain. */ 6626 int rc = atexit(__kmp_internal_end_atexit); 6627 if (rc != 0) { 6628 __kmp_fatal(KMP_MSG(FunctionError, "atexit()"), KMP_ERR(rc), 6629 __kmp_msg_null); 6630 } 6631 } 6632 #endif 6633 6634 #if KMP_HANDLE_SIGNALS 6635 #if KMP_OS_UNIX 6636 /* NOTE: make sure that this is called before the user installs their own 6637 signal handlers so that the user handlers are called first. this way they 6638 can return false, not call our handler, avoid terminating the library, and 6639 continue execution where they left off. 
*/ 6640 __kmp_install_signals(FALSE); 6641 #endif /* KMP_OS_UNIX */ 6642 #if KMP_OS_WINDOWS 6643 __kmp_install_signals(TRUE); 6644 #endif /* KMP_OS_WINDOWS */ 6645 #endif 6646 6647 /* we have finished the serial initialization */ 6648 __kmp_init_counter++; 6649 6650 __kmp_init_serial = TRUE; 6651 6652 if (__kmp_settings) { 6653 __kmp_env_print(); 6654 } 6655 6656 #if OMP_40_ENABLED 6657 if (__kmp_display_env || __kmp_display_env_verbose) { 6658 __kmp_env_print_2(); 6659 } 6660 #endif // OMP_40_ENABLED 6661 6662 #if OMPT_SUPPORT 6663 ompt_post_init(); 6664 #endif 6665 6666 KMP_MB(); 6667 6668 KA_TRACE(10, ("__kmp_do_serial_initialize: exit\n")); 6669 } 6670 6671 void __kmp_serial_initialize(void) { 6672 if (__kmp_init_serial) { 6673 return; 6674 } 6675 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6676 if (__kmp_init_serial) { 6677 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6678 return; 6679 } 6680 __kmp_do_serial_initialize(); 6681 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6682 } 6683 6684 static void __kmp_do_middle_initialize(void) { 6685 int i, j; 6686 int prev_dflt_team_nth; 6687 6688 if (!__kmp_init_serial) { 6689 __kmp_do_serial_initialize(); 6690 } 6691 6692 KA_TRACE(10, ("__kmp_middle_initialize: enter\n")); 6693 6694 // Save the previous value for the __kmp_dflt_team_nth so that 6695 // we can avoid some reinitialization if it hasn't changed. 6696 prev_dflt_team_nth = __kmp_dflt_team_nth; 6697 6698 #if KMP_AFFINITY_SUPPORTED 6699 // __kmp_affinity_initialize() will try to set __kmp_ncores to the 6700 // number of cores on the machine. 6701 __kmp_affinity_initialize(); 6702 6703 // Run through the __kmp_threads array and set the affinity mask 6704 // for each root thread that is currently registered with the RTL. 6705 for (i = 0; i < __kmp_threads_capacity; i++) { 6706 if (TCR_PTR(__kmp_threads[i]) != NULL) { 6707 __kmp_affinity_set_init_mask(i, TRUE); 6708 } 6709 } 6710 #endif /* KMP_AFFINITY_SUPPORTED */ 6711 6712 KMP_ASSERT(__kmp_xproc > 0); 6713 if (__kmp_avail_proc == 0) { 6714 __kmp_avail_proc = __kmp_xproc; 6715 } 6716 6717 // If there were empty places in num_threads list (OMP_NUM_THREADS=,,2,3), 6718 // correct them now 6719 j = 0; 6720 while ((j < __kmp_nested_nth.used) && !__kmp_nested_nth.nth[j]) { 6721 __kmp_nested_nth.nth[j] = __kmp_dflt_team_nth = __kmp_dflt_team_nth_ub = 6722 __kmp_avail_proc; 6723 j++; 6724 } 6725 6726 if (__kmp_dflt_team_nth == 0) { 6727 #ifdef KMP_DFLT_NTH_CORES 6728 // Default #threads = #cores 6729 __kmp_dflt_team_nth = __kmp_ncores; 6730 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6731 "__kmp_ncores (%d)\n", 6732 __kmp_dflt_team_nth)); 6733 #else 6734 // Default #threads = #available OS procs 6735 __kmp_dflt_team_nth = __kmp_avail_proc; 6736 KA_TRACE(20, ("__kmp_middle_initialize: setting __kmp_dflt_team_nth = " 6737 "__kmp_avail_proc(%d)\n", 6738 __kmp_dflt_team_nth)); 6739 #endif /* KMP_DFLT_NTH_CORES */ 6740 } 6741 6742 if (__kmp_dflt_team_nth < KMP_MIN_NTH) { 6743 __kmp_dflt_team_nth = KMP_MIN_NTH; 6744 } 6745 if (__kmp_dflt_team_nth > __kmp_sys_max_nth) { 6746 __kmp_dflt_team_nth = __kmp_sys_max_nth; 6747 } 6748 6749 // There's no harm in continuing if the following check fails, 6750 // but it indicates an error in the previous logic. 
6751 KMP_DEBUG_ASSERT(__kmp_dflt_team_nth <= __kmp_dflt_team_nth_ub); 6752 6753 if (__kmp_dflt_team_nth != prev_dflt_team_nth) { 6754 // Run through the __kmp_threads array and set the num threads icv for each 6755 // root thread that is currently registered with the RTL (which has not 6756 // already explicitly set its nthreads-var with a call to 6757 // omp_set_num_threads()). 6758 for (i = 0; i < __kmp_threads_capacity; i++) { 6759 kmp_info_t *thread = __kmp_threads[i]; 6760 if (thread == NULL) 6761 continue; 6762 if (thread->th.th_current_task->td_icvs.nproc != 0) 6763 continue; 6764 6765 set__nproc(__kmp_threads[i], __kmp_dflt_team_nth); 6766 } 6767 } 6768 KA_TRACE( 6769 20, 6770 ("__kmp_middle_initialize: final value for __kmp_dflt_team_nth = %d\n", 6771 __kmp_dflt_team_nth)); 6772 6773 #ifdef KMP_ADJUST_BLOCKTIME 6774 /* Adjust blocktime to zero if necessary now that __kmp_avail_proc is set */ 6775 if (!__kmp_env_blocktime && (__kmp_avail_proc > 0)) { 6776 KMP_DEBUG_ASSERT(__kmp_avail_proc > 0); 6777 if (__kmp_nth > __kmp_avail_proc) { 6778 __kmp_zero_bt = TRUE; 6779 } 6780 } 6781 #endif /* KMP_ADJUST_BLOCKTIME */ 6782 6783 /* we have finished middle initialization */ 6784 TCW_SYNC_4(__kmp_init_middle, TRUE); 6785 6786 KA_TRACE(10, ("__kmp_do_middle_initialize: exit\n")); 6787 } 6788 6789 void __kmp_middle_initialize(void) { 6790 if (__kmp_init_middle) { 6791 return; 6792 } 6793 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6794 if (__kmp_init_middle) { 6795 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6796 return; 6797 } 6798 __kmp_do_middle_initialize(); 6799 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6800 } 6801 6802 void __kmp_parallel_initialize(void) { 6803 int gtid = __kmp_entry_gtid(); // this might be a new root 6804 6805 /* synchronize parallel initialization (for sibling) */ 6806 if (TCR_4(__kmp_init_parallel)) 6807 return; 6808 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 6809 if (TCR_4(__kmp_init_parallel)) { 6810 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6811 return; 6812 } 6813 6814 /* TODO reinitialization after we have already shut down */ 6815 if (TCR_4(__kmp_global.g.g_done)) { 6816 KA_TRACE( 6817 10, 6818 ("__kmp_parallel_initialize: attempt to init while shutting down\n")); 6819 __kmp_infinite_loop(); 6820 } 6821 6822 /* jc: The lock __kmp_initz_lock is already held, so calling 6823 __kmp_serial_initialize would cause a deadlock. So we call 6824 __kmp_do_serial_initialize directly. */ 6825 if (!__kmp_init_middle) { 6826 __kmp_do_middle_initialize(); 6827 } 6828 6829 /* begin initialization */ 6830 KA_TRACE(10, ("__kmp_parallel_initialize: enter\n")); 6831 KMP_ASSERT(KMP_UBER_GTID(gtid)); 6832 6833 #if KMP_ARCH_X86 || KMP_ARCH_X86_64 6834 // Save the FP control regs. 6835 // Worker threads will set theirs to these values at thread startup. 
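// The two stores below snapshot the initial thread's floating-point control
// state (the x87 control word and the SSE MXCSR register) so that worker
// threads can adopt identical rounding and exception-mask settings when they
// start. As a standalone point of reference, the MXCSR half of that idea can
// be sketched with the standard _mm_getcsr/_mm_setcsr intrinsics (illustration
// only, excluded from the build; it deliberately avoids the runtime's own
// __kmp_store_* helpers):
#if 0
  {
    // A standalone program would #include <xmmintrin.h> for these intrinsics.
    unsigned int saved_mxcsr = _mm_getcsr(); // captured once in the master
    unsigned int worker_mxcsr = saved_mxcsr; // value handed to each worker...
    _mm_setcsr(worker_mxcsr);                // ...which re-applies it on start
  }
#endif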
6836 __kmp_store_x87_fpu_control_word(&__kmp_init_x87_fpu_control_word); 6837 __kmp_store_mxcsr(&__kmp_init_mxcsr); 6838 __kmp_init_mxcsr &= KMP_X86_MXCSR_MASK; 6839 #endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 6840 6841 #if KMP_OS_UNIX 6842 #if KMP_HANDLE_SIGNALS 6843 /* must be after __kmp_serial_initialize */ 6844 __kmp_install_signals(TRUE); 6845 #endif 6846 #endif 6847 6848 __kmp_suspend_initialize(); 6849 6850 #if defined(USE_LOAD_BALANCE) 6851 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6852 __kmp_global.g.g_dynamic_mode = dynamic_load_balance; 6853 } 6854 #else 6855 if (__kmp_global.g.g_dynamic_mode == dynamic_default) { 6856 __kmp_global.g.g_dynamic_mode = dynamic_thread_limit; 6857 } 6858 #endif 6859 6860 if (__kmp_version) { 6861 __kmp_print_version_2(); 6862 } 6863 6864 /* we have finished parallel initialization */ 6865 TCW_SYNC_4(__kmp_init_parallel, TRUE); 6866 6867 KMP_MB(); 6868 KA_TRACE(10, ("__kmp_parallel_initialize: exit\n")); 6869 6870 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 6871 } 6872 6873 /* ------------------------------------------------------------------------ */ 6874 6875 void __kmp_run_before_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6876 kmp_team_t *team) { 6877 kmp_disp_t *dispatch; 6878 6879 KMP_MB(); 6880 6881 /* none of the threads have encountered any constructs, yet. */ 6882 this_thr->th.th_local.this_construct = 0; 6883 #if KMP_CACHE_MANAGE 6884 KMP_CACHE_PREFETCH(&this_thr->th.th_bar[bs_forkjoin_barrier].bb.b_arrived); 6885 #endif /* KMP_CACHE_MANAGE */ 6886 dispatch = (kmp_disp_t *)TCR_PTR(this_thr->th.th_dispatch); 6887 KMP_DEBUG_ASSERT(dispatch); 6888 KMP_DEBUG_ASSERT(team->t.t_dispatch); 6889 // KMP_DEBUG_ASSERT( this_thr->th.th_dispatch == &team->t.t_dispatch[ 6890 // this_thr->th.th_info.ds.ds_tid ] ); 6891 6892 dispatch->th_disp_index = 0; /* reset the dispatch buffer counter */ 6893 #if OMP_45_ENABLED 6894 dispatch->th_doacross_buf_idx = 6895 0; /* reset the doacross dispatch buffer counter */ 6896 #endif 6897 if (__kmp_env_consistency_check) 6898 __kmp_push_parallel(gtid, team->t.t_ident); 6899 6900 KMP_MB(); /* Flush all pending memory write invalidates. 
*/ 6901 } 6902 6903 void __kmp_run_after_invoked_task(int gtid, int tid, kmp_info_t *this_thr, 6904 kmp_team_t *team) { 6905 if (__kmp_env_consistency_check) 6906 __kmp_pop_parallel(gtid, team->t.t_ident); 6907 6908 __kmp_finish_implicit_task(this_thr); 6909 } 6910 6911 int __kmp_invoke_task_func(int gtid) { 6912 int rc; 6913 int tid = __kmp_tid_from_gtid(gtid); 6914 kmp_info_t *this_thr = __kmp_threads[gtid]; 6915 kmp_team_t *team = this_thr->th.th_team; 6916 6917 __kmp_run_before_invoked_task(gtid, tid, this_thr, team); 6918 #if USE_ITT_BUILD 6919 if (__itt_stack_caller_create_ptr) { 6920 __kmp_itt_stack_callee_enter( 6921 (__itt_caller) 6922 team->t.t_stack_id); // inform ittnotify about entering user's code 6923 } 6924 #endif /* USE_ITT_BUILD */ 6925 #if INCLUDE_SSC_MARKS 6926 SSC_MARK_INVOKING(); 6927 #endif 6928 6929 #if OMPT_SUPPORT 6930 void *dummy; 6931 void **exit_runtime_p; 6932 ompt_data_t *my_task_data; 6933 ompt_data_t *my_parallel_data; 6934 int ompt_team_size; 6935 6936 if (ompt_enabled.enabled) { 6937 exit_runtime_p = &( 6938 team->t.t_implicit_task_taskdata[tid].ompt_task_info.frame.exit_frame); 6939 } else { 6940 exit_runtime_p = &dummy; 6941 } 6942 6943 my_task_data = 6944 &(team->t.t_implicit_task_taskdata[tid].ompt_task_info.task_data); 6945 my_parallel_data = &(team->t.ompt_team_info.parallel_data); 6946 if (ompt_enabled.ompt_callback_implicit_task) { 6947 ompt_team_size = team->t.t_nproc; 6948 ompt_callbacks.ompt_callback(ompt_callback_implicit_task)( 6949 ompt_scope_begin, my_parallel_data, my_task_data, ompt_team_size, 6950 __kmp_tid_from_gtid(gtid)); 6951 OMPT_CUR_TASK_INFO(this_thr)->thread_num = __kmp_tid_from_gtid(gtid); 6952 } 6953 #endif 6954 6955 { 6956 KMP_TIME_PARTITIONED_BLOCK(OMP_parallel); 6957 KMP_SET_THREAD_STATE_BLOCK(IMPLICIT_TASK); 6958 rc = 6959 __kmp_invoke_microtask((microtask_t)TCR_SYNC_PTR(team->t.t_pkfn), gtid, 6960 tid, (int)team->t.t_argc, (void **)team->t.t_argv 6961 #if OMPT_SUPPORT 6962 , 6963 exit_runtime_p 6964 #endif 6965 ); 6966 #if OMPT_SUPPORT 6967 *exit_runtime_p = NULL; 6968 #endif 6969 } 6970 6971 #if USE_ITT_BUILD 6972 if (__itt_stack_caller_create_ptr) { 6973 __kmp_itt_stack_callee_leave( 6974 (__itt_caller) 6975 team->t.t_stack_id); // inform ittnotify about leaving user's code 6976 } 6977 #endif /* USE_ITT_BUILD */ 6978 __kmp_run_after_invoked_task(gtid, tid, this_thr, team); 6979 6980 return rc; 6981 } 6982 6983 #if OMP_40_ENABLED 6984 void __kmp_teams_master(int gtid) { 6985 // This routine is called by all master threads in teams construct 6986 kmp_info_t *thr = __kmp_threads[gtid]; 6987 kmp_team_t *team = thr->th.th_team; 6988 ident_t *loc = team->t.t_ident; 6989 thr->th.th_set_nproc = thr->th.th_teams_size.nth; 6990 KMP_DEBUG_ASSERT(thr->th.th_teams_microtask); 6991 KMP_DEBUG_ASSERT(thr->th.th_set_nproc); 6992 KA_TRACE(20, ("__kmp_teams_master: T#%d, Tid %d, microtask %p\n", gtid, 6993 __kmp_tid_from_gtid(gtid), thr->th.th_teams_microtask)); 6994 // Launch league of teams now, but not let workers execute 6995 // (they hang on fork barrier until next parallel) 6996 #if INCLUDE_SSC_MARKS 6997 SSC_MARK_FORKING(); 6998 #endif 6999 __kmp_fork_call(loc, gtid, fork_context_intel, team->t.t_argc, 7000 (microtask_t)thr->th.th_teams_microtask, // "wrapped" task 7001 VOLATILE_CAST(launch_t) __kmp_invoke_task_func, NULL); 7002 #if INCLUDE_SSC_MARKS 7003 SSC_MARK_JOINING(); 7004 #endif 7005 7006 // AC: last parameter "1" eliminates join barrier which won't work because 7007 // worker threads are in a fork barrier waiting for more 
// parallel regions
  __kmp_join_call(loc, gtid
#if OMPT_SUPPORT
                  ,
                  fork_context_intel
#endif
                  ,
                  1);
}

int __kmp_invoke_teams_master(int gtid) {
  kmp_info_t *this_thr = __kmp_threads[gtid];
  kmp_team_t *team = this_thr->th.th_team;
#if KMP_DEBUG
  if (!__kmp_threads[gtid]->th.th_team->t.t_serialized)
    KMP_DEBUG_ASSERT((void *)__kmp_threads[gtid]->th.th_team->t.t_pkfn ==
                     (void *)__kmp_teams_master);
#endif
  __kmp_run_before_invoked_task(gtid, 0, this_thr, team);
  __kmp_teams_master(gtid);
  __kmp_run_after_invoked_task(gtid, 0, this_thr, team);
  return 1;
}
#endif /* OMP_40_ENABLED */

/* Set the requested number of threads for the next parallel region encountered
   by this team. Since this should be enclosed in the fork/join critical
   section, it should avoid race conditions with asymmetric nested
   parallelism. */

void __kmp_push_num_threads(ident_t *id, int gtid, int num_threads) {
  kmp_info_t *thr = __kmp_threads[gtid];

  if (num_threads > 0)
    thr->th.th_set_nproc = num_threads;
}

#if OMP_40_ENABLED

/* Set the requested number of teams for the teams region and/or
   the number of threads for the next parallel region encountered. */
void __kmp_push_num_teams(ident_t *id, int gtid, int num_teams,
                          int num_threads) {
  kmp_info_t *thr = __kmp_threads[gtid];
  KMP_DEBUG_ASSERT(num_teams >= 0);
  KMP_DEBUG_ASSERT(num_threads >= 0);

  if (num_teams == 0)
    num_teams = 1; // default number of teams is 1.
  if (num_teams > __kmp_teams_max_nth) { // if too many teams requested?
    if (!__kmp_reserve_warn) {
      __kmp_reserve_warn = 1;
      __kmp_msg(kmp_ms_warning,
                KMP_MSG(CantFormThrTeam, num_teams, __kmp_teams_max_nth),
                KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
    }
    num_teams = __kmp_teams_max_nth;
  }
  // Set number of teams (number of threads in the outer "parallel" of the
  // teams)
  thr->th.th_set_nproc = thr->th.th_teams_size.nteams = num_teams;

  // Remember the number of threads for inner parallel regions
  if (num_threads == 0) {
    if (!TCR_4(__kmp_init_middle))
      __kmp_middle_initialize(); // get __kmp_avail_proc calculated
    num_threads = __kmp_avail_proc / num_teams;
    if (num_teams * num_threads > __kmp_teams_max_nth) {
      // adjust num_threads w/o warning as it is not a user setting
      num_threads = __kmp_teams_max_nth / num_teams;
    }
  } else {
    if (num_teams * num_threads > __kmp_teams_max_nth) {
      int new_threads = __kmp_teams_max_nth / num_teams;
      if (!__kmp_reserve_warn) { // user asked for too many threads
        __kmp_reserve_warn = 1; // that conflicts with KMP_TEAMS_THREAD_LIMIT
        __kmp_msg(kmp_ms_warning,
                  KMP_MSG(CantFormThrTeam, num_threads, new_threads),
                  KMP_HNT(Unset_ALL_THREADS), __kmp_msg_null);
      }
      num_threads = new_threads;
    }
  }
  thr->th.th_teams_size.nth = num_threads;
}

// Set the proc_bind var to use in the following parallel region.
void __kmp_push_proc_bind(ident_t *id, int gtid, kmp_proc_bind_t proc_bind) {
  kmp_info_t *thr = __kmp_threads[gtid];
  thr->th.th_set_proc_bind = proc_bind;
}

#endif /* OMP_40_ENABLED */

/* Launch the worker threads into the microtask.
*/ 7102 7103 void __kmp_internal_fork(ident_t *id, int gtid, kmp_team_t *team) { 7104 kmp_info_t *this_thr = __kmp_threads[gtid]; 7105 7106 #ifdef KMP_DEBUG 7107 int f; 7108 #endif /* KMP_DEBUG */ 7109 7110 KMP_DEBUG_ASSERT(team); 7111 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7112 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7113 KMP_MB(); /* Flush all pending memory write invalidates. */ 7114 7115 team->t.t_construct = 0; /* no single directives seen yet */ 7116 team->t.t_ordered.dt.t_value = 7117 0; /* thread 0 enters the ordered section first */ 7118 7119 /* Reset the identifiers on the dispatch buffer */ 7120 KMP_DEBUG_ASSERT(team->t.t_disp_buffer); 7121 if (team->t.t_max_nproc > 1) { 7122 int i; 7123 for (i = 0; i < __kmp_dispatch_num_buffers; ++i) { 7124 team->t.t_disp_buffer[i].buffer_index = i; 7125 #if OMP_45_ENABLED 7126 team->t.t_disp_buffer[i].doacross_buf_idx = i; 7127 #endif 7128 } 7129 } else { 7130 team->t.t_disp_buffer[0].buffer_index = 0; 7131 #if OMP_45_ENABLED 7132 team->t.t_disp_buffer[0].doacross_buf_idx = 0; 7133 #endif 7134 } 7135 7136 KMP_MB(); /* Flush all pending memory write invalidates. */ 7137 KMP_ASSERT(this_thr->th.th_team == team); 7138 7139 #ifdef KMP_DEBUG 7140 for (f = 0; f < team->t.t_nproc; f++) { 7141 KMP_DEBUG_ASSERT(team->t.t_threads[f] && 7142 team->t.t_threads[f]->th.th_team_nproc == team->t.t_nproc); 7143 } 7144 #endif /* KMP_DEBUG */ 7145 7146 /* release the worker threads so they may begin working */ 7147 __kmp_fork_barrier(gtid, 0); 7148 } 7149 7150 void __kmp_internal_join(ident_t *id, int gtid, kmp_team_t *team) { 7151 kmp_info_t *this_thr = __kmp_threads[gtid]; 7152 7153 KMP_DEBUG_ASSERT(team); 7154 KMP_DEBUG_ASSERT(this_thr->th.th_team == team); 7155 KMP_ASSERT(KMP_MASTER_GTID(gtid)); 7156 KMP_MB(); /* Flush all pending memory write invalidates. 
*/

  /* Join barrier after fork */

#ifdef KMP_DEBUG
  if (__kmp_threads[gtid] &&
      __kmp_threads[gtid]->th.th_team_nproc != team->t.t_nproc) {
    __kmp_printf("GTID: %d, __kmp_threads[%d]=%p\n", gtid, gtid,
                 __kmp_threads[gtid]);
    __kmp_printf("__kmp_threads[%d]->th.th_team_nproc=%d, TEAM: %p, "
                 "team->t.t_nproc=%d\n",
                 gtid, __kmp_threads[gtid]->th.th_team_nproc, team,
                 team->t.t_nproc);
    __kmp_print_structure();
  }
  KMP_DEBUG_ASSERT(__kmp_threads[gtid] &&
                   __kmp_threads[gtid]->th.th_team_nproc == team->t.t_nproc);
#endif /* KMP_DEBUG */

  __kmp_join_barrier(gtid); /* wait for everyone */
#if OMPT_SUPPORT
  if (ompt_enabled.enabled &&
      this_thr->th.ompt_thread_info.state == omp_state_wait_barrier_implicit) {
    int ds_tid = this_thr->th.th_info.ds.ds_tid;
    ompt_data_t *task_data = OMPT_CUR_TASK_DATA(this_thr);
    this_thr->th.ompt_thread_info.state = omp_state_overhead;
#if OMPT_OPTIONAL
    void *codeptr = NULL;
    if (KMP_MASTER_TID(ds_tid) &&
        (ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait) ||
         ompt_callbacks.ompt_callback(ompt_callback_sync_region)))
      codeptr = OMPT_CUR_TEAM_INFO(this_thr)->master_return_address;

    if (ompt_enabled.ompt_callback_sync_region_wait) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region_wait)(
          ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
    }
    if (ompt_enabled.ompt_callback_sync_region) {
      ompt_callbacks.ompt_callback(ompt_callback_sync_region)(
          ompt_sync_region_barrier, ompt_scope_end, NULL, task_data, codeptr);
    }
#endif
    if (!KMP_MASTER_TID(ds_tid) && ompt_enabled.ompt_callback_implicit_task) {
      ompt_callbacks.ompt_callback(ompt_callback_implicit_task)(
          ompt_scope_end, NULL, task_data, 0, ds_tid);
    }
  }
#endif

  KMP_MB(); /* Flush all pending memory write invalidates. */
  KMP_ASSERT(this_thr->th.th_team == team);
}

/* ------------------------------------------------------------------------ */

#ifdef USE_LOAD_BALANCE

// Return the number of worker threads actively spinning in the hot team, if we
// are at the outermost level of parallelism. Otherwise, return 0.
static int __kmp_active_hot_team_nproc(kmp_root_t *root) {
  int i;
  int retval;
  kmp_team_t *hot_team;

  if (root->r.r_active) {
    return 0;
  }
  hot_team = root->r.r_hot_team;
  if (__kmp_dflt_blocktime == KMP_MAX_BLOCKTIME) {
    return hot_team->t.t_nproc - 1; // Don't count master thread
  }

  // Skip the master thread - it is accounted for elsewhere.
  retval = 0;
  for (i = 1; i < hot_team->t.t_nproc; i++) {
    if (hot_team->t.t_threads[i]->th.th_active) {
      retval++;
    }
  }
  return retval;
}

// Perform an automatic adjustment to the number of threads used by the next
// parallel region; a condensed sketch of the calculation follows, ahead of the
// full implementation.
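// A standalone sketch of the adjustment performed by __kmp_load_balance_nproc()
// below (illustration only, excluded from the build; the parameters are
// hypothetical stand-ins for the runtime globals they mirror): the requested
// team width is reduced by the system load that does not already belong to
// this runtime, then clamped to the caller's request and the minimum.
#if 0
static int example_load_balance_nproc(int set_nproc, int avail_proc,
                                      int system_active, int pool_active,
                                      int hot_team_active, int min_nth) {
  // Pool threads, outer hot-team threads, and the caller itself show up in the
  // system load but will be reused, so they do not count against the new team.
  int team_curr_active = pool_active + hot_team_active + 1;
  if (system_active < team_curr_active)
    system_active = team_curr_active; // load readings may lag slightly
  int retval = avail_proc - system_active + team_curr_active;
  if (retval > set_nproc)
    retval = set_nproc; // never exceed what was asked for
  if (retval < min_nth)
    retval = min_nth;
  return retval; // e.g. 8 procs with 3 foreign active threads -> at most 5
}
#endif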
static int __kmp_load_balance_nproc(kmp_root_t *root, int set_nproc) {
  int retval;
  int pool_active;
  int hot_team_active;
  int team_curr_active;
  int system_active;

  KB_TRACE(20, ("__kmp_load_balance_nproc: called root:%p set_nproc:%d\n", root,
                set_nproc));
  KMP_DEBUG_ASSERT(root);
  KMP_DEBUG_ASSERT(root->r.r_root_team->t.t_threads[0]
                       ->th.th_current_task->td_icvs.dynamic == TRUE);
  KMP_DEBUG_ASSERT(set_nproc > 1);

  if (set_nproc == 1) {
    KB_TRACE(20, ("__kmp_load_balance_nproc: serial execution.\n"));
    return 1;
  }

  // Threads that are active in the thread pool, active in the hot team for
  // this particular root (if we are at the outer par level), and the currently
  // executing thread (to become the master) are available to add to the new
  // team, but are currently contributing to the system load, and must be
  // accounted for.
  pool_active = __kmp_thread_pool_active_nth;
  hot_team_active = __kmp_active_hot_team_nproc(root);
  team_curr_active = pool_active + hot_team_active + 1;

  // Check the system load.
  system_active = __kmp_get_load_balance(__kmp_avail_proc + team_curr_active);
  KB_TRACE(30, ("__kmp_load_balance_nproc: system active = %d pool active = %d "
                "hot team active = %d\n",
                system_active, pool_active, hot_team_active));

  if (system_active < 0) {
    // There was an error reading the necessary info from /proc, so use the
    // thread limit algorithm instead. Once we set __kmp_global.g.g_dynamic_mode
    // = dynamic_thread_limit, we shouldn't wind up getting back here.
    __kmp_global.g.g_dynamic_mode = dynamic_thread_limit;
    KMP_WARNING(CantLoadBalUsing, "KMP_DYNAMIC_MODE=thread limit");

    // Make this call behave like the thread limit algorithm.
    retval = __kmp_avail_proc - __kmp_nth +
             (root->r.r_active ? 1 : root->r.r_hot_team->t.t_nproc);
    if (retval > set_nproc) {
      retval = set_nproc;
    }
    if (retval < KMP_MIN_NTH) {
      retval = KMP_MIN_NTH;
    }

    KB_TRACE(20, ("__kmp_load_balance_nproc: thread limit exit. retval:%d\n",
                  retval));
    return retval;
  }

  // There is a slight delay in the load balance algorithm in detecting new
  // running procs. The real system load at this instant should be at least as
  // large as the number of active OpenMP threads that are available to add to
  // the team.
  if (system_active < team_curr_active) {
    system_active = team_curr_active;
  }
  retval = __kmp_avail_proc - system_active + team_curr_active;
  if (retval > set_nproc) {
    retval = set_nproc;
  }
  if (retval < KMP_MIN_NTH) {
    retval = KMP_MIN_NTH;
  }

  KB_TRACE(20, ("__kmp_load_balance_nproc: exit.
retval:%d\n", retval)); 7311 return retval; 7312 } // __kmp_load_balance_nproc() 7313 7314 #endif /* USE_LOAD_BALANCE */ 7315 7316 /* ------------------------------------------------------------------------ */ 7317 7318 /* NOTE: this is called with the __kmp_init_lock held */ 7319 void __kmp_cleanup(void) { 7320 int f; 7321 7322 KA_TRACE(10, ("__kmp_cleanup: enter\n")); 7323 7324 if (TCR_4(__kmp_init_parallel)) { 7325 #if KMP_HANDLE_SIGNALS 7326 __kmp_remove_signals(); 7327 #endif 7328 TCW_4(__kmp_init_parallel, FALSE); 7329 } 7330 7331 if (TCR_4(__kmp_init_middle)) { 7332 #if KMP_AFFINITY_SUPPORTED 7333 __kmp_affinity_uninitialize(); 7334 #endif /* KMP_AFFINITY_SUPPORTED */ 7335 __kmp_cleanup_hierarchy(); 7336 TCW_4(__kmp_init_middle, FALSE); 7337 } 7338 7339 KA_TRACE(10, ("__kmp_cleanup: go serial cleanup\n")); 7340 7341 if (__kmp_init_serial) { 7342 __kmp_runtime_destroy(); 7343 __kmp_init_serial = FALSE; 7344 } 7345 7346 __kmp_cleanup_threadprivate_caches(); 7347 7348 for (f = 0; f < __kmp_threads_capacity; f++) { 7349 if (__kmp_root[f] != NULL) { 7350 __kmp_free(__kmp_root[f]); 7351 __kmp_root[f] = NULL; 7352 } 7353 } 7354 __kmp_free(__kmp_threads); 7355 // __kmp_threads and __kmp_root were allocated at once, as single block, so 7356 // there is no need in freeing __kmp_root. 7357 __kmp_threads = NULL; 7358 __kmp_root = NULL; 7359 __kmp_threads_capacity = 0; 7360 7361 #if KMP_USE_DYNAMIC_LOCK 7362 __kmp_cleanup_indirect_user_locks(); 7363 #else 7364 __kmp_cleanup_user_locks(); 7365 #endif 7366 7367 #if KMP_AFFINITY_SUPPORTED 7368 KMP_INTERNAL_FREE(CCAST(char *, __kmp_cpuinfo_file)); 7369 __kmp_cpuinfo_file = NULL; 7370 #endif /* KMP_AFFINITY_SUPPORTED */ 7371 7372 #if KMP_USE_ADAPTIVE_LOCKS 7373 #if KMP_DEBUG_ADAPTIVE_LOCKS 7374 __kmp_print_speculative_stats(); 7375 #endif 7376 #endif 7377 KMP_INTERNAL_FREE(__kmp_nested_nth.nth); 7378 __kmp_nested_nth.nth = NULL; 7379 __kmp_nested_nth.size = 0; 7380 __kmp_nested_nth.used = 0; 7381 KMP_INTERNAL_FREE(__kmp_nested_proc_bind.bind_types); 7382 __kmp_nested_proc_bind.bind_types = NULL; 7383 __kmp_nested_proc_bind.size = 0; 7384 __kmp_nested_proc_bind.used = 0; 7385 7386 __kmp_i18n_catclose(); 7387 7388 #if KMP_USE_HIER_SCHED 7389 __kmp_hier_scheds.deallocate(); 7390 #endif 7391 7392 #if KMP_STATS_ENABLED 7393 __kmp_stats_fini(); 7394 #endif 7395 7396 KA_TRACE(10, ("__kmp_cleanup: exit\n")); 7397 } 7398 7399 /* ------------------------------------------------------------------------ */ 7400 7401 int __kmp_ignore_mppbeg(void) { 7402 char *env; 7403 7404 if ((env = getenv("KMP_IGNORE_MPPBEG")) != NULL) { 7405 if (__kmp_str_match_false(env)) 7406 return FALSE; 7407 } 7408 // By default __kmpc_begin() is no-op. 7409 return TRUE; 7410 } 7411 7412 int __kmp_ignore_mppend(void) { 7413 char *env; 7414 7415 if ((env = getenv("KMP_IGNORE_MPPEND")) != NULL) { 7416 if (__kmp_str_match_false(env)) 7417 return FALSE; 7418 } 7419 // By default __kmpc_end() is no-op. 
7420 return TRUE; 7421 } 7422 7423 void __kmp_internal_begin(void) { 7424 int gtid; 7425 kmp_root_t *root; 7426 7427 /* this is a very important step as it will register new sibling threads 7428 and assign these new uber threads a new gtid */ 7429 gtid = __kmp_entry_gtid(); 7430 root = __kmp_threads[gtid]->th.th_root; 7431 KMP_ASSERT(KMP_UBER_GTID(gtid)); 7432 7433 if (root->r.r_begin) 7434 return; 7435 __kmp_acquire_lock(&root->r.r_begin_lock, gtid); 7436 if (root->r.r_begin) { 7437 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7438 return; 7439 } 7440 7441 root->r.r_begin = TRUE; 7442 7443 __kmp_release_lock(&root->r.r_begin_lock, gtid); 7444 } 7445 7446 /* ------------------------------------------------------------------------ */ 7447 7448 void __kmp_user_set_library(enum library_type arg) { 7449 int gtid; 7450 kmp_root_t *root; 7451 kmp_info_t *thread; 7452 7453 /* first, make sure we are initialized so we can get our gtid */ 7454 7455 gtid = __kmp_entry_gtid(); 7456 thread = __kmp_threads[gtid]; 7457 7458 root = thread->th.th_root; 7459 7460 KA_TRACE(20, ("__kmp_user_set_library: enter T#%d, arg: %d, %d\n", gtid, arg, 7461 library_serial)); 7462 if (root->r.r_in_parallel) { /* Must be called in serial section of top-level 7463 thread */ 7464 KMP_WARNING(SetLibraryIncorrectCall); 7465 return; 7466 } 7467 7468 switch (arg) { 7469 case library_serial: 7470 thread->th.th_set_nproc = 0; 7471 set__nproc(thread, 1); 7472 break; 7473 case library_turnaround: 7474 thread->th.th_set_nproc = 0; 7475 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7476 : __kmp_dflt_team_nth_ub); 7477 break; 7478 case library_throughput: 7479 thread->th.th_set_nproc = 0; 7480 set__nproc(thread, __kmp_dflt_team_nth ? __kmp_dflt_team_nth 7481 : __kmp_dflt_team_nth_ub); 7482 break; 7483 default: 7484 KMP_FATAL(UnknownLibraryType, arg); 7485 } 7486 7487 __kmp_aux_set_library(arg); 7488 } 7489 7490 void __kmp_aux_set_stacksize(size_t arg) { 7491 if (!__kmp_init_serial) 7492 __kmp_serial_initialize(); 7493 7494 #if KMP_OS_DARWIN 7495 if (arg & (0x1000 - 1)) { 7496 arg &= ~(0x1000 - 1); 7497 if (arg + 0x1000) /* check for overflow if we round up */ 7498 arg += 0x1000; 7499 } 7500 #endif 7501 __kmp_acquire_bootstrap_lock(&__kmp_initz_lock); 7502 7503 /* only change the default stacksize before the first parallel region */ 7504 if (!TCR_4(__kmp_init_parallel)) { 7505 size_t value = arg; /* argument is in bytes */ 7506 7507 if (value < __kmp_sys_min_stksize) 7508 value = __kmp_sys_min_stksize; 7509 else if (value > KMP_MAX_STKSIZE) 7510 value = KMP_MAX_STKSIZE; 7511 7512 __kmp_stksize = value; 7513 7514 __kmp_env_stksize = TRUE; /* was KMP_STACKSIZE specified? */ 7515 } 7516 7517 __kmp_release_bootstrap_lock(&__kmp_initz_lock); 7518 } 7519 7520 /* set the behaviour of the runtime library */ 7521 /* TODO this can cause some odd behaviour with sibling parallelism... 
*/ 7522 void __kmp_aux_set_library(enum library_type arg) { 7523 __kmp_library = arg; 7524 7525 switch (__kmp_library) { 7526 case library_serial: { 7527 KMP_INFORM(LibraryIsSerial); 7528 (void)__kmp_change_library(TRUE); 7529 } break; 7530 case library_turnaround: 7531 (void)__kmp_change_library(TRUE); 7532 break; 7533 case library_throughput: 7534 (void)__kmp_change_library(FALSE); 7535 break; 7536 default: 7537 KMP_FATAL(UnknownLibraryType, arg); 7538 } 7539 } 7540 7541 /* ------------------------------------------------------------------------ */ 7542 7543 void __kmp_aux_set_blocktime(int arg, kmp_info_t *thread, int tid) { 7544 int blocktime = arg; /* argument is in milliseconds */ 7545 #if KMP_USE_MONITOR 7546 int bt_intervals; 7547 #endif 7548 int bt_set; 7549 7550 __kmp_save_internal_controls(thread); 7551 7552 /* Normalize and set blocktime for the teams */ 7553 if (blocktime < KMP_MIN_BLOCKTIME) 7554 blocktime = KMP_MIN_BLOCKTIME; 7555 else if (blocktime > KMP_MAX_BLOCKTIME) 7556 blocktime = KMP_MAX_BLOCKTIME; 7557 7558 set__blocktime_team(thread->th.th_team, tid, blocktime); 7559 set__blocktime_team(thread->th.th_serial_team, 0, blocktime); 7560 7561 #if KMP_USE_MONITOR 7562 /* Calculate and set blocktime intervals for the teams */ 7563 bt_intervals = KMP_INTERVALS_FROM_BLOCKTIME(blocktime, __kmp_monitor_wakeups); 7564 7565 set__bt_intervals_team(thread->th.th_team, tid, bt_intervals); 7566 set__bt_intervals_team(thread->th.th_serial_team, 0, bt_intervals); 7567 #endif 7568 7569 /* Set whether blocktime has been set to "TRUE" */ 7570 bt_set = TRUE; 7571 7572 set__bt_set_team(thread->th.th_team, tid, bt_set); 7573 set__bt_set_team(thread->th.th_serial_team, 0, bt_set); 7574 #if KMP_USE_MONITOR 7575 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d, " 7576 "bt_intervals=%d, monitor_updates=%d\n", 7577 __kmp_gtid_from_tid(tid, thread->th.th_team), 7578 thread->th.th_team->t.t_id, tid, blocktime, bt_intervals, 7579 __kmp_monitor_wakeups)); 7580 #else 7581 KF_TRACE(10, ("kmp_set_blocktime: T#%d(%d:%d), blocktime=%d\n", 7582 __kmp_gtid_from_tid(tid, thread->th.th_team), 7583 thread->th.th_team->t.t_id, tid, blocktime)); 7584 #endif 7585 } 7586 7587 void __kmp_aux_set_defaults(char const *str, int len) { 7588 if (!__kmp_init_serial) { 7589 __kmp_serial_initialize(); 7590 } 7591 __kmp_env_initialize(str); 7592 7593 if (__kmp_settings 7594 #if OMP_40_ENABLED 7595 || __kmp_display_env || __kmp_display_env_verbose 7596 #endif // OMP_40_ENABLED 7597 ) { 7598 __kmp_env_print(); 7599 } 7600 } // __kmp_aux_set_defaults 7601 7602 /* ------------------------------------------------------------------------ */ 7603 /* internal fast reduction routines */ 7604 7605 PACKED_REDUCTION_METHOD_T 7606 __kmp_determine_reduction_method( 7607 ident_t *loc, kmp_int32 global_tid, kmp_int32 num_vars, size_t reduce_size, 7608 void *reduce_data, void (*reduce_func)(void *lhs_data, void *rhs_data), 7609 kmp_critical_name *lck) { 7610 7611 // Default reduction method: critical construct ( lck != NULL, like in current 7612 // PAROPT ) 7613 // If ( reduce_data!=NULL && reduce_func!=NULL ): the tree-reduction method 7614 // can be selected by RTL 7615 // If loc->flags contains KMP_IDENT_ATOMIC_REDUCE, the atomic reduce method 7616 // can be selected by RTL 7617 // Finally, it's up to OpenMP RTL to make a decision on which method to select 7618 // among generated by PAROPT. 
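// For the 64-bit Linux/Windows/Darwin branch implemented below, the decision
// described above boils down to roughly this ordering (a condensed standalone
// sketch, excluded from the build, using hypothetical example values): small
// teams prefer atomics, larger teams prefer the tree method when the compiler
// generated one, and the critical-section method remains the fallback.
#if 0
  {
    int nthreads = 16, cutoff = 4;  // hypothetical team size and cutoff
    int atomic_ok = 1, tree_ok = 1; // what the compiler generated for us
    PACKED_REDUCTION_METHOD_T method = critical_reduce_block; // fallback
    if (nthreads == 1)
      method = empty_reduce_block; // serialized team: no synchronization
    else if (tree_ok && nthreads > cutoff)
      method = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
    else if (atomic_ok)
      method = atomic_reduce_block;
  }
#endif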

  PACKED_REDUCTION_METHOD_T retval;

  int team_size;

  KMP_DEBUG_ASSERT(loc); // it would be nice to test ( loc != 0 )
  KMP_DEBUG_ASSERT(lck); // it would be nice to test ( lck != 0 )

#define FAST_REDUCTION_ATOMIC_METHOD_GENERATED                                 \
  ((loc->flags & (KMP_IDENT_ATOMIC_REDUCE)) == (KMP_IDENT_ATOMIC_REDUCE))
#define FAST_REDUCTION_TREE_METHOD_GENERATED ((reduce_data) && (reduce_func))

  retval = critical_reduce_block;

  // another choice of getting a team size (with 1 dynamic dereference) is
  // slower
  team_size = __kmp_get_team_num_threads(global_tid);
  if (team_size == 1) {

    retval = empty_reduce_block;

  } else {

    int atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED;

#if KMP_ARCH_X86_64 || KMP_ARCH_PPC64 || KMP_ARCH_AARCH64 || KMP_ARCH_MIPS64

#if KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||       \
    KMP_OS_DARWIN

    int teamsize_cutoff = 4;

#if KMP_MIC_SUPPORTED
    if (__kmp_mic_type != non_mic) {
      teamsize_cutoff = 8;
    }
#endif
    int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED;
    if (tree_available) {
      if (team_size <= teamsize_cutoff) {
        if (atomic_available) {
          retval = atomic_reduce_block;
        }
      } else {
        retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER;
      }
    } else if (atomic_available) {
      retval = atomic_reduce_block;
    }
#else
#error "Unknown or unsupported OS"
#endif // KMP_OS_LINUX || KMP_OS_FREEBSD || KMP_OS_NETBSD || KMP_OS_WINDOWS ||
       // KMP_OS_DARWIN

#elif KMP_ARCH_X86 || KMP_ARCH_ARM || KMP_ARCH_AARCH || KMP_ARCH_MIPS

#if KMP_OS_LINUX || KMP_OS_WINDOWS

    // basic tuning

    if (atomic_available) {
      if (num_vars <= 2) { // && ( team_size <= 8 ) due to false-sharing ???
7680 retval = atomic_reduce_block; 7681 } 7682 } // otherwise: use critical section 7683 7684 #elif KMP_OS_DARWIN 7685 7686 int tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 7687 if (atomic_available && (num_vars <= 3)) { 7688 retval = atomic_reduce_block; 7689 } else if (tree_available) { 7690 if ((reduce_size > (9 * sizeof(kmp_real64))) && 7691 (reduce_size < (2000 * sizeof(kmp_real64)))) { 7692 retval = TREE_REDUCE_BLOCK_WITH_PLAIN_BARRIER; 7693 } 7694 } // otherwise: use critical section 7695 7696 #else 7697 #error "Unknown or unsupported OS" 7698 #endif 7699 7700 #else 7701 #error "Unknown or unsupported architecture" 7702 #endif 7703 } 7704 7705 // KMP_FORCE_REDUCTION 7706 7707 // If the team is serialized (team_size == 1), ignore the forced reduction 7708 // method and stay with the unsynchronized method (empty_reduce_block) 7709 if (__kmp_force_reduction_method != reduction_method_not_defined && 7710 team_size != 1) { 7711 7712 PACKED_REDUCTION_METHOD_T forced_retval = critical_reduce_block; 7713 7714 int atomic_available, tree_available; 7715 7716 switch ((forced_retval = __kmp_force_reduction_method)) { 7717 case critical_reduce_block: 7718 KMP_ASSERT(lck); // lck should be != 0 7719 break; 7720 7721 case atomic_reduce_block: 7722 atomic_available = FAST_REDUCTION_ATOMIC_METHOD_GENERATED; 7723 if (!atomic_available) { 7724 KMP_WARNING(RedMethodNotSupported, "atomic"); 7725 forced_retval = critical_reduce_block; 7726 } 7727 break; 7728 7729 case tree_reduce_block: 7730 tree_available = FAST_REDUCTION_TREE_METHOD_GENERATED; 7731 if (!tree_available) { 7732 KMP_WARNING(RedMethodNotSupported, "tree"); 7733 forced_retval = critical_reduce_block; 7734 } else { 7735 #if KMP_FAST_REDUCTION_BARRIER 7736 forced_retval = TREE_REDUCE_BLOCK_WITH_REDUCTION_BARRIER; 7737 #endif 7738 } 7739 break; 7740 7741 default: 7742 KMP_ASSERT(0); // "unsupported method specified" 7743 } 7744 7745 retval = forced_retval; 7746 } 7747 7748 KA_TRACE(10, ("reduction method selected=%08x\n", retval)); 7749 7750 #undef FAST_REDUCTION_TREE_METHOD_GENERATED 7751 #undef FAST_REDUCTION_ATOMIC_METHOD_GENERATED 7752 7753 return (retval); 7754 } 7755 7756 // this function is for testing set/get/determine reduce method 7757 kmp_int32 __kmp_get_reduce_method(void) { 7758 return ((__kmp_entry_thread()->th.th_local.packed_reduction_method) >> 8); 7759 } 7760
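// __kmp_get_reduce_method() above discards the low byte of the packed value,
// which reflects the convention that the reduction-method tag is stored in the
// bits above bit 7 while the barrier kind occupies the low byte (the
// authoritative encoding lives in kmp.h). A minimal standalone illustration of
// that style of packing, with hypothetical helper names (excluded from the
// build):
#if 0
static kmp_int32 example_pack_reduction(kmp_int32 method_tag,
                                        kmp_int32 barrier_kind) {
  return (method_tag << 8) | (barrier_kind & 0xFF); // tag in the upper bits
}
static kmp_int32 example_unpack_method(kmp_int32 packed) {
  return packed >> 8; // mirrors the shift in __kmp_get_reduce_method()
}
#endif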