/*
 * kmp_affinity.cpp -- affinity management
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"
#include "kmp_affinity.h"

// Store the real or imagined machine hierarchy here
static hierarchy_info machine_hierarchy;

void __kmp_cleanup_hierarchy() {
    machine_hierarchy.fini();
}

void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    kmp_uint32 depth;
    // The test below is true if affinity is available, but set to "none".
    // Need to init on first use of hierarchical barrier.
    if (TCR_1(machine_hierarchy.uninitialized))
        machine_hierarchy.init(NULL, nproc);

    // Adjust the hierarchy in case num threads exceeds original
    if (nproc > machine_hierarchy.base_num_threads)
        machine_hierarchy.resize(nproc);

    depth = machine_hierarchy.depth;
    KMP_DEBUG_ASSERT(depth > 0);

    thr_bar->depth = depth;
    thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1;
    thr_bar->skip_per_level = machine_hierarchy.skipPerLevel;
}

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
#if KMP_USE_HWLOC
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    int num_chars_to_write, num_chars_written;
    char* scan;
    KMP_ASSERT(buf_len >= 40);

    // bufsize of 0 just retrieves the needed buffer size.
    num_chars_to_write = hwloc_bitmap_list_snprintf(buf, 0, (hwloc_bitmap_t)mask);

    // need '{', "xxxxxxxx...xx", '}', '\0' = num_chars_to_write + 3 bytes
    // * num_chars_to_write returned by hwloc_bitmap_list_snprintf does not
    //   take into account the '\0' character.
    if (hwloc_bitmap_iszero((hwloc_bitmap_t)mask)) {
        KMP_SNPRINTF(buf, buf_len, "{<empty>}");
    } else if (num_chars_to_write < buf_len - 3) {
        // no problem fitting the mask into buf_len number of characters
        buf[0] = '{';
        // use buf_len-3 because we have the three characters: '{' '}' '\0'
        // to add to the buffer
        num_chars_written = hwloc_bitmap_list_snprintf(buf+1, buf_len-3, (hwloc_bitmap_t)mask);
        buf[num_chars_written+1] = '}';
        buf[num_chars_written+2] = '\0';
    } else {
        // Need to truncate the affinity mask string and add ellipsis.
        // To do this, we first write out the '{' + str(mask)
        buf[0] = '{';
        hwloc_bitmap_list_snprintf(buf+1, buf_len-7, (hwloc_bitmap_t)mask);
        // then, what we do here is go to the 7th to last character, then go
        // backwards until we are NOT on a digit, then write "...}\0".  This
        // way it is a clean ellipsis addition and we don't overwrite part of
        // an affinity number, i.e., we avoid something like { 45, 67, 8...}
        // and get { 45, 67,...} instead.
        scan = buf + buf_len - 7;
        while (*scan >= '0' && *scan <= '9' && scan >= buf)
            scan--;
        *(scan+1) = '.';
        *(scan+2) = '.';
        *(scan+3) = '.';
        *(scan+4) = '}';
        *(scan+5) = '\0';
    }
    return buf;
}
#else
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end-scan+1, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    KMP_SNPRINTF(scan, end-scan+1, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        KMP_SNPRINTF(scan, end-scan+1, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        KMP_SNPRINTF(scan, end-scan+1, ",...");
        while (*scan != '\0') scan++;
    }
    KMP_SNPRINTF(scan, end-scan+1, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}
#endif // KMP_USE_HWLOC


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_GROUP_AFFINITY

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_GROUP_AFFINITY */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example: suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the
// table for "scatter" affinity, the table will still be sorted 601, 602,
// 603, 604 because we are paying attention to the labels themselves, not the
// ordinal child numbers.  By using the child numbers in the sort, the result
// is {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
static int __kmp_nThreadsPerCore;
#ifndef KMP_DFLT_NTH_CORES
static int __kmp_ncores;
#endif

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
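
//
// Illustrative note (not from the original sources; values are hypothetical):
// the uniformity check above simply compares the product of the per-level
// counts against the number of available procs.  For example, with
// nPackages == 2, nCoresPerPkg == 8 and __kmp_nThreadsPerCore == 2, the
// topology is reported as uniform only when __kmp_avail_proc == 32; if one
// package had a core disabled (__kmp_avail_proc == 30), the check fails and
// "NonUniform" is reported instead.
//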

//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}

#if KMP_USE_HWLOC
static int
__kmp_affinity_create_hwloc_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    unsigned depth = hwloc_topology_get_depth(__kmp_hwloc_topology);
    int threadLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_PU);
    int coreLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_CORE);
    int pkgLevel = hwloc_get_type_depth(__kmp_hwloc_topology, HWLOC_OBJ_SOCKET);
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 0;

    //
    // This makes an assumption about the topology being four levels:
    // machine -> packages -> cores -> hardware threads
    //
    hwloc_obj_t current_level_iterator = hwloc_get_root_obj(__kmp_hwloc_topology);
    hwloc_obj_t child_iterator;
    for (child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
         child_iterator != NULL;
         child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
    {
        nPackages++;
    }
    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, pkgLevel, 0);
    for (child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
         child_iterator != NULL;
         child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
    {
        nCoresPerPkg++;
    }
    current_level_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, coreLevel, 0);
    for (child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, NULL);
         child_iterator != NULL;
         child_iterator = hwloc_get_next_child(__kmp_hwloc_topology, current_level_iterator, child_iterator))
    {
        __kmp_nThreadsPerCore++;
    }

    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
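        // Illustrative note (not from the original sources; values are
        // hypothetical): with __kmp_xproc == 32, __kmp_nThreadsPerCore == 2
        // and nCoresPerPkg == 8, the fallback below reports
        // __kmp_ncores == 16 and nPackages == (32 + 8 - 1) / 8 == 4, i.e.
        // the counts are inferred arithmetically from __kmp_xproc rather
        // than from per-thread binding.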
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)__kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    unsigned num_hardware_threads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
    unsigned i;
    hwloc_obj_t hardware_thread_iterator;
    int nActiveThreads = 0;
    for (i = 0; i < num_hardware_threads; i++) {
        hardware_thread_iterator = hwloc_get_obj_by_depth(__kmp_hwloc_topology, threadLevel, i);
        Address addr(3);
        if (! KMP_CPU_ISSET(i, fullMask)) continue;
        addr.labels[0] = hardware_thread_iterator->parent->parent->logical_index;
        addr.labels[1] = hardware_thread_iterator->parent->logical_index % nCoresPerPkg;
        addr.labels[2] = hardware_thread_iterator->logical_index % __kmp_nThreadsPerCore;
        retval[nActiveThreads] = AddrUnsPair(addr, hardware_thread_iterator->os_index);
        nActiveThreads++;
    }

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nActiveThreads > 0);
    if (nActiveThreads == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel-1];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nActiveThreads, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    __kmp_ncores = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, coreLevel);

    //
    // Check to see if the machine topology is uniform
    //
    unsigned npackages = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, pkgLevel);
    unsigned ncores = __kmp_ncores;
    unsigned nthreads = hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, threadLevel);
    unsigned uniform = (npackages * nCoresPerPkg * __kmp_nThreadsPerCore == nthreads);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", npackages);
        //for (level = 1; level <= pkgLevel; level++) {
        //    __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        //}
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    int level;
    unsigned proc;
    for (level = 1; level < (int)depth; level++) {
        if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, level) == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth-1) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nActiveThreads);
        for (proc = 0; (int)proc < nActiveThreads; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        for (level = 1; level < (int)depth; level++) {
            if ((hwloc_get_nbobjs_by_depth(__kmp_hwloc_topology, level) == 1) && (level != pkgLevel)) {
                if (level == threadLevel) {
                    threadLevel = -1;
                }
                else if ((threadLevel >= 0) && (level < threadLevel)) {
                    threadLevel--;
                }
                if (level == coreLevel) {
                    coreLevel = -1;
                }
                else if ((coreLevel >= 0) && (level < coreLevel)) {
                    coreLevel--;
                }
                if (level < pkgLevel) {
                    pkgLevel--;
                }
                continue;
            }
            for (proc = 0; (int)proc < nActiveThreads; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
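        // Illustrative note (not from the original sources): when the map
        // models package, core and thread levels and the user requested
        // granularity=core, only the thread level lies below the requested
        // granularity, so __kmp_affinity_gran_levels ends up as 1; with
        // granularity=fine or granularity=thread it stays 0, and with
        // granularity=package both the core and thread levels are counted.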
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel-1 >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nActiveThreads, depth-1, pkgLevel-1,
          coreLevel-1, threadLevel-1);
    }

    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    if (depth == 0) return 0;
    else return depth-1;
}
#endif // KMP_USE_HWLOC

//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    KMP_CPU_SET_ITERATE(i, fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_GROUP_AFFINITY

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    KMP_CPU_SET_ITERATE(i, fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity "gran" with group
            // topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_GROUP_AFFINITY */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while ((1<<r) < count)
        ++r;
    return r;
}


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};
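
//
// Illustrative sketch (not part of the library; all values are hypothetical):
// how the pkgId / coreId / threadId fields above are inferred from a legacy
// APIC id using the mask widths computed by __kmp_cpuid_mask_width(), as done
// in __kmp_affinity_create_apicid_map() below.
//
#if 0
static void
__kmp_example_decode_apic_id()
{
    unsigned apicId = 45;            // 0b0101101, cpuid(1).ebx[31:24]
    unsigned maxThreadsPerPkg = 16;  // cpuid(1).ebx[23:16]
    unsigned maxCoresPerPkg = 8;     // cpuid(4).eax[31:26] + 1

    int widthCT = __kmp_cpuid_mask_width(maxThreadsPerPkg);      // 4 bits
    int widthC = __kmp_cpuid_mask_width(maxCoresPerPkg);         // 3 bits
    int widthT = widthCT - widthC;                                // 1 bit

    unsigned pkgId = apicId >> widthCT;                           // 45 >> 4 == 2
    unsigned coreId = (apicId >> widthT) & ((1 << widthC) - 1);   // == 6
    unsigned threadId = apicId & ((1 << widthT) - 1);             // == 1
}
#endif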

static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check if cpuid leaf 4 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 4) {
        *msg_id = kmp_i18n_str_NoLeaf4Support;
        return -1;
    }

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //   has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //   value of this field determines the width of the core# + thread#
    //   fields in the Apic Id.  It is also an upper bound on the number
    //   of threads per package, but it has been verified that situations
    //   happen where it is not exact.  In particular, on certain OS/chip
    //   combinations where Intel(R) Hyper-Threading Technology is supported
    //   by the chip but has been disabled, the value of this field will be 2
    //   (for a single core chip).  On other OS/chip combinations supporting
    //   Intel(R) Hyper-Threading Technology, the value of this field will be
    //   1 when Intel(R) Hyper-Threading Technology is disabled and 2 when it
    //   is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4).  The
    //   value of this field (+1) determines the width of the core# field in
    //   the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //   an upper bound, but the IA-32 architecture manual says that it is
    //   exactly the number of cores per package, and I haven't seen any
    //   case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    KMP_CPU_SET_ITERATE(i, fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
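    //
    // Illustrative walk-through (not from the original sources; values are
    // hypothetical): for a sorted table whose (pkgId, coreId, threadId)
    // triples are
    //     (0,0,0) (0,0,1) (0,1,0) (0,1,1) (1,0,0) (1,0,1) (1,1,0) (1,1,1)
    // the counting pass below ends with nPackages == 2, nCoresPerPkg == 2,
    // __kmp_nThreadsPerCore == 2 and __kmp_ncores == 4, i.e. the radix of
    // each field is recovered from the sorted order alone.
    //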
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;  // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consistency checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
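
    //
    // Illustrative note (not from the original sources): cpuid leaf 11
    // enumerates the topology from the finest domain outward, so on a
    // typical hyper-threaded part sub-leaf 0 describes the SMT level and
    // sub-leaf 1 the core level, after which ebx reads 0 and the package
    // level is inferred.  The enumeration below would then record
    // threadLevel = 0, coreLevel = 1, pkgLevel = 2 and depth = 3, and the
    // inversion that follows it flips these to pkgLevel = 0, coreLevel = 1,
    // threadLevel = 2 to match the label order used in (*address2os).
    //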

    //
    // Find the number of levels in the machine topology.  While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We
    // will try to get more accurate values later by explicitly counting
    // them, but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine
            // topology, but so far, the only machine we have seen which
            // does not exit this loop before iteration 32 has fubar
            // x2APIC settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling __kmp_get_system_affinity() and
    // __kmp_set_system_affinity(), then we need to do something else - use
    // the defaults that we calculated from issuing cpuid without binding
    // to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    KMP_CPU_SET_ITERATE(proc, fullMask) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);
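
    //
    // Illustrative example (not from the original sources; values are
    // hypothetical): with depth == 3 and cpuid(11) reporting shift widths of
    // 1 (SMT sub-leaf) and 5 (core sub-leaf), an x2APIC id of 38 (0b100110)
    // decodes in the loop above as
    //     thread  = 38 & 0x1          == 0
    //     core    = (38 & 0x1f) >> 1  == 3
    //     package = 38 >> 5           == 1
    // giving labels[] = { 1, 3, 0 } for that OS proc.
    //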

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes printing of incorrect topology
                    // information in case the max value for some level
                    // (maxCt[level]) is encountered earlier than some lesser
                    // value while going through the array.  For example, let
                    // pkg0 have 4 cores and pkg1 have 2 cores.  Then
                    // maxCt[1] == 2, whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        int newPkgLevel = -1;
        int newCoreLevel = -1;
        int newThreadLevel = -1;
        int i;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1)
              && (level != pkgLevel)) {
                //
                // Remove this level.  Never remove the package level
                //
                continue;
            }
            if (level == pkgLevel) {
                newPkgLevel = level;
            }
            if (level == coreLevel) {
                newCoreLevel = level;
            }
            if (level == threadLevel) {
                newThreadLevel = level;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
        pkgLevel = newPkgLevel;
        coreLevel = newCoreLevel;
        threadLevel = newThreadLevel;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}


# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */


#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;


static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
};


static int
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
{
    unsigned i;
    const unsigned *aa = *((const unsigned **)a);
    const unsigned *bb = *((const unsigned **)b);
    for (i = maxIndex; ; i--) {
        if (aa[i] < bb[i]) return -1;
        if (aa[i] > bb[i]) return 1;
        if (i == osIdIndex) break;
    }
    return 0;
}


//
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
//
static int
__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
  kmp_i18n_id_t *const msg_id, FILE *f)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Scan the file once, counting the number of "processor" (osId) fields,
    // and finding the highest value of <n> for a node_<n> field.
    //
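    //
    // Illustrative example (not from the original sources): a typical record
    // in /proc/cpuinfo looks like
    //
    //     processor       : 12
    //     physical id     : 1
    //     core id         : 2
    //     ...
    //
    // followed by a blank line.  The parser below stores "processor" under
    // osIdIndex, "physical id" under pkgIdIndex, "core id" under coreIdIndex
    // and "thread id" under threadIdIndex; the values shown are hypothetical.
    //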
1884 // 1885 char buf[256]; 1886 unsigned num_records = 0; 1887 while (! feof(f)) { 1888 buf[sizeof(buf) - 1] = 1; 1889 if (! fgets(buf, sizeof(buf), f)) { 1890 // 1891 // Read errors presumably because of EOF 1892 // 1893 break; 1894 } 1895 1896 char s1[] = "processor"; 1897 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1898 num_records++; 1899 continue; 1900 } 1901 1902 // 1903 // FIXME - this will match "node_<n> <garbage>" 1904 // 1905 unsigned level; 1906 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 1907 if (nodeIdIndex + level >= maxIndex) { 1908 maxIndex = nodeIdIndex + level; 1909 } 1910 continue; 1911 } 1912 } 1913 1914 // 1915 // Check for empty file / no valid processor records, or too many. 1916 // The number of records can't exceed the number of valid bits in the 1917 // affinity mask. 1918 // 1919 if (num_records == 0) { 1920 *line = 0; 1921 *msg_id = kmp_i18n_str_NoProcRecords; 1922 return -1; 1923 } 1924 if (num_records > (unsigned)__kmp_xproc) { 1925 *line = 0; 1926 *msg_id = kmp_i18n_str_TooManyProcRecords; 1927 return -1; 1928 } 1929 1930 // 1931 // Set the file pointer back to the begginning, so that we can scan the 1932 // file again, this time performing a full parse of the data. 1933 // Allocate a vector of ProcCpuInfo object, where we will place the data. 1934 // Adding an extra element at the end allows us to remove a lot of extra 1935 // checks for termination conditions. 1936 // 1937 if (fseek(f, 0, SEEK_SET) != 0) { 1938 *line = 0; 1939 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1940 return -1; 1941 } 1942 1943 // 1944 // Allocate the array of records to store the proc info in. The dummy 1945 // element at the end makes the logic in filling them out easier to code. 1946 // 1947 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1948 * sizeof(unsigned *)); 1949 unsigned i; 1950 for (i = 0; i <= num_records; i++) { 1951 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1952 * sizeof(unsigned)); 1953 } 1954 1955 #define CLEANUP_THREAD_INFO \ 1956 for (i = 0; i <= num_records; i++) { \ 1957 __kmp_free(threadInfo[i]); \ 1958 } \ 1959 __kmp_free(threadInfo); 1960 1961 // 1962 // A value of UINT_MAX means that we didn't find the field 1963 // 1964 unsigned __index; 1965 1966 #define INIT_PROC_INFO(p) \ 1967 for (__index = 0; __index <= maxIndex; __index++) { \ 1968 (p)[__index] = UINT_MAX; \ 1969 } 1970 1971 for (i = 0; i <= num_records; i++) { 1972 INIT_PROC_INFO(threadInfo[i]); 1973 } 1974 1975 unsigned num_avail = 0; 1976 *line = 0; 1977 while (! feof(f)) { 1978 // 1979 // Create an inner scoping level, so that all the goto targets at the 1980 // end of the loop appear in an outer scoping level. This avoids 1981 // warnings about jumping past an initialization to a target in the 1982 // same block. 1983 // 1984 { 1985 buf[sizeof(buf) - 1] = 1; 1986 bool long_line = false; 1987 if (! fgets(buf, sizeof(buf), f)) { 1988 // 1989 // Read errors presumably because of EOF 1990 // 1991 // If there is valid data in threadInfo[num_avail], then fake 1992 // a blank line in ensure that the last address gets parsed. 1993 // 1994 bool valid = false; 1995 for (i = 0; i <= maxIndex; i++) { 1996 if (threadInfo[num_avail][i] != UINT_MAX) { 1997 valid = true; 1998 } 1999 } 2000 if (! valid) { 2001 break; 2002 } 2003 buf[0] = 0; 2004 } else if (!buf[sizeof(buf) - 1]) { 2005 // 2006 // The line is longer than the buffer. Set a flag and don't 2007 // emit an error if we were going to ignore the line, anyway. 
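// (This detection relies on the sentinel written above: buf[sizeof(buf) - 1]
// is primed with a nonzero value before each fgets() call, and fgets() only
// overwrites that last byte with '\0' when it fills the whole buffer, the
// telltale sign of a line longer than the buffer.)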
2008 // 2009 long_line = true; 2010 2011 #define CHECK_LINE \ 2012 if (long_line) { \ 2013 CLEANUP_THREAD_INFO; \ 2014 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 2015 return -1; \ 2016 } 2017 } 2018 (*line)++; 2019 2020 char s1[] = "processor"; 2021 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 2022 CHECK_LINE; 2023 char *p = strchr(buf + sizeof(s1) - 1, ':'); 2024 unsigned val; 2025 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2026 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 2027 threadInfo[num_avail][osIdIndex] = val; 2028 #if KMP_OS_LINUX && USE_SYSFS_INFO 2029 char path[256]; 2030 KMP_SNPRINTF(path, sizeof(path), 2031 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 2032 threadInfo[num_avail][osIdIndex]); 2033 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 2034 2035 KMP_SNPRINTF(path, sizeof(path), 2036 "/sys/devices/system/cpu/cpu%u/topology/core_id", 2037 threadInfo[num_avail][osIdIndex]); 2038 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 2039 continue; 2040 #else 2041 } 2042 char s2[] = "physical id"; 2043 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2044 CHECK_LINE; 2045 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2046 unsigned val; 2047 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2048 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2049 threadInfo[num_avail][pkgIdIndex] = val; 2050 continue; 2051 } 2052 char s3[] = "core id"; 2053 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2054 CHECK_LINE; 2055 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2056 unsigned val; 2057 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2058 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2059 threadInfo[num_avail][coreIdIndex] = val; 2060 continue; 2061 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2062 } 2063 char s4[] = "thread id"; 2064 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2065 CHECK_LINE; 2066 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2067 unsigned val; 2068 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2069 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2070 threadInfo[num_avail][threadIdIndex] = val; 2071 continue; 2072 } 2073 unsigned level; 2074 if (KMP_SSCANF(buf, "node_%d id", &level) == 1) { 2075 CHECK_LINE; 2076 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2077 unsigned val; 2078 if ((p == NULL) || (KMP_SSCANF(p + 1, "%u\n", &val) != 1)) goto no_val; 2079 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2080 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2081 threadInfo[num_avail][nodeIdIndex + level] = val; 2082 continue; 2083 } 2084 2085 // 2086 // We didn't recognize the leading token on the line. 2087 // There are lots of leading tokens that we don't recognize - 2088 // if the line isn't empty, go on to the next line. 2089 // 2090 if ((*buf != 0) && (*buf != '\n')) { 2091 // 2092 // If the line is longer than the buffer, read characters 2093 // until we find a newline. 2094 // 2095 if (long_line) { 2096 int ch; 2097 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2098 } 2099 continue; 2100 } 2101 2102 // 2103 // A newline has signalled the end of the processor record. 2104 // Check that there aren't too many procs specified. 2105 // 2106 if ((int)num_avail == __kmp_xproc) { 2107 CLEANUP_THREAD_INFO; 2108 *msg_id = kmp_i18n_str_TooManyEntries; 2109 return -1; 2110 } 2111 2112 // 2113 // Check for missing fields. 
The osId field must be there, and we 2114 // currently require that the physical id field is specified, also. 2115 // 2116 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2117 CLEANUP_THREAD_INFO; 2118 *msg_id = kmp_i18n_str_MissingProcField; 2119 return -1; 2120 } 2121 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2122 CLEANUP_THREAD_INFO; 2123 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2124 return -1; 2125 } 2126 2127 // 2128 // Skip this proc if it is not included in the machine model. 2129 // 2130 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2131 INIT_PROC_INFO(threadInfo[num_avail]); 2132 continue; 2133 } 2134 2135 // 2136 // We have a successful parse of this proc's info. 2137 // Increment the counter, and prepare for the next proc. 2138 // 2139 num_avail++; 2140 KMP_ASSERT(num_avail <= num_records); 2141 INIT_PROC_INFO(threadInfo[num_avail]); 2142 } 2143 continue; 2144 2145 no_val: 2146 CLEANUP_THREAD_INFO; 2147 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2148 return -1; 2149 2150 dup_field: 2151 CLEANUP_THREAD_INFO; 2152 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2153 return -1; 2154 } 2155 *line = 0; 2156 2157 # if KMP_MIC && REDUCE_TEAM_SIZE 2158 unsigned teamSize = 0; 2159 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2160 2161 // check for num_records == __kmp_xproc ??? 2162 2163 // 2164 // If there's only one thread context to bind to, form an Address object 2165 // with depth 1 and return immediately (or, if affinity is off, set 2166 // address2os to NULL and return). 2167 // 2168 // If it is configured to omit the package level when there is only a 2169 // single package, the logic at the end of this routine won't work if 2170 // there is only a single thread - it would try to form an Address 2171 // object with depth 0. 2172 // 2173 KMP_ASSERT(num_avail > 0); 2174 KMP_ASSERT(num_avail <= num_records); 2175 if (num_avail == 1) { 2176 __kmp_ncores = 1; 2177 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2178 if (__kmp_affinity_verbose) { 2179 if (! 
KMP_AFFINITY_CAPABLE()) { 2180 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2181 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2182 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2183 } 2184 else { 2185 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2186 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2187 fullMask); 2188 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2189 if (__kmp_affinity_respect_mask) { 2190 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2191 } else { 2192 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2193 } 2194 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2195 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2196 } 2197 int index; 2198 kmp_str_buf_t buf; 2199 __kmp_str_buf_init(&buf); 2200 __kmp_str_buf_print(&buf, "1"); 2201 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2202 __kmp_str_buf_print(&buf, " x 1"); 2203 } 2204 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2205 __kmp_str_buf_free(&buf); 2206 } 2207 2208 if (__kmp_affinity_type == affinity_none) { 2209 CLEANUP_THREAD_INFO; 2210 return 0; 2211 } 2212 2213 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2214 Address addr(1); 2215 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2216 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2217 2218 if (__kmp_affinity_gran_levels < 0) { 2219 __kmp_affinity_gran_levels = 0; 2220 } 2221 2222 if (__kmp_affinity_verbose) { 2223 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2224 } 2225 2226 CLEANUP_THREAD_INFO; 2227 return 1; 2228 } 2229 2230 // 2231 // Sort the threadInfo table by physical Id. 2232 // 2233 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2234 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2235 2236 // 2237 // The table is now sorted by pkgId / coreId / threadId, but we really 2238 // don't know the radix of any of the fields. pkgId's may be sparsely 2239 // assigned among the chips on a system. Although coreId's are usually 2240 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2241 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2242 // 2243 // For that matter, we don't know what coresPerPkg and threadsPerCore 2244 // (or the total # packages) are at this point - we want to determine 2245 // that now. We only have an upper bound on the first two figures. 2246 // 2247 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2248 * sizeof(unsigned)); 2249 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2250 * sizeof(unsigned)); 2251 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2252 * sizeof(unsigned)); 2253 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2254 * sizeof(unsigned)); 2255 2256 bool assign_thread_ids = false; 2257 unsigned threadIdCt; 2258 unsigned index; 2259 2260 restart_radix_check: 2261 threadIdCt = 0; 2262 2263 // 2264 // Initialize the counter arrays with data from threadInfo[0]. 2265 // 2266 if (assign_thread_ids) { 2267 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2268 threadInfo[0][threadIdIndex] = threadIdCt++; 2269 } 2270 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2271 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2272 } 2273 } 2274 for (index = 0; index <= maxIndex; index++) { 2275 counts[index] = 1; 2276 maxCt[index] = 1; 2277 totals[index] = 1; 2278 lastId[index] = threadInfo[0][index];; 2279 } 2280 2281 // 2282 // Run through the rest of the OS procs. 
2283 // 2284 for (i = 1; i < num_avail; i++) { 2285 // 2286 // Find the most significant index whose id differs 2287 // from the id for the previous OS proc. 2288 // 2289 for (index = maxIndex; index >= threadIdIndex; index--) { 2290 if (assign_thread_ids && (index == threadIdIndex)) { 2291 // 2292 // Auto-assign the thread id field if it wasn't specified. 2293 // 2294 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2295 threadInfo[i][threadIdIndex] = threadIdCt++; 2296 } 2297 2298 // 2299 // Apparently the thread id field was specified for some 2300 // entries and not others. Start the thread id counter 2301 // off at the next higher thread id. 2302 // 2303 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2304 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2305 } 2306 } 2307 if (threadInfo[i][index] != lastId[index]) { 2308 // 2309 // Run through all indices which are less significant, 2310 // and reset the counts to 1. 2311 // 2312 // At all levels up to and including index, we need to 2313 // increment the totals and record the last id. 2314 // 2315 unsigned index2; 2316 for (index2 = threadIdIndex; index2 < index; index2++) { 2317 totals[index2]++; 2318 if (counts[index2] > maxCt[index2]) { 2319 maxCt[index2] = counts[index2]; 2320 } 2321 counts[index2] = 1; 2322 lastId[index2] = threadInfo[i][index2]; 2323 } 2324 counts[index]++; 2325 totals[index]++; 2326 lastId[index] = threadInfo[i][index]; 2327 2328 if (assign_thread_ids && (index > threadIdIndex)) { 2329 2330 # if KMP_MIC && REDUCE_TEAM_SIZE 2331 // 2332 // The default team size is the total #threads in the machine 2333 // minus 1 thread for every core that has 3 or more threads. 2334 // 2335 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2336 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2337 2338 // 2339 // Restart the thread counter, as we are on a new core. 2340 // 2341 threadIdCt = 0; 2342 2343 // 2344 // Auto-assign the thread id field if it wasn't specified. 2345 // 2346 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2347 threadInfo[i][threadIdIndex] = threadIdCt++; 2348 } 2349 2350 // 2351 // Apparently the thread id field was specified for some 2352 // entries and not others. Start the thread id counter 2353 // off at the next higher thread id. 2354 // 2355 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2356 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2357 } 2358 } 2359 break; 2360 } 2361 } 2362 if (index < threadIdIndex) { 2363 // 2364 // If thread ids were specified, it is an error if they are not 2365 // unique. Also, check that we haven't already restarted the 2366 // loop (to be safe - shouldn't need to). 2367 // 2368 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2369 || assign_thread_ids) { 2370 __kmp_free(lastId); 2371 __kmp_free(totals); 2372 __kmp_free(maxCt); 2373 __kmp_free(counts); 2374 CLEANUP_THREAD_INFO; 2375 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2376 return -1; 2377 } 2378 2379 // 2380 // If the thread ids were not specified and we see entries 2381 // that are duplicates, start the loop over and 2382 // assign the thread ids manually. 2383 // 2384 assign_thread_ids = true; 2385 goto restart_radix_check; 2386 } 2387 } 2388 2389 # if KMP_MIC && REDUCE_TEAM_SIZE 2390 // 2391 // The default team size is the total #threads in the machine 2392 // minus 1 thread for every core that has 3 or more threads. 2393 // 2394 teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2395 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2396 2397 for (index = threadIdIndex; index <= maxIndex; index++) { 2398 if (counts[index] > maxCt[index]) { 2399 maxCt[index] = counts[index]; 2400 } 2401 } 2402 2403 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2404 nCoresPerPkg = maxCt[coreIdIndex]; 2405 nPackages = totals[pkgIdIndex]; 2406 2407 // 2408 // Check to see if the machine topology is uniform 2409 // 2410 unsigned prod = totals[maxIndex]; 2411 for (index = threadIdIndex; index < maxIndex; index++) { 2412 prod *= maxCt[index]; 2413 } 2414 bool uniform = (prod == totals[threadIdIndex]); 2415 2416 // 2417 // When affinity is off, this routine will still be called to set 2418 // __kmp_ncores, as well as __kmp_nThreadsPerCore, 2419 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2420 // correctly, and return now if affinity is not enabled. 2421 // 2422 __kmp_ncores = totals[coreIdIndex]; 2423 2424 if (__kmp_affinity_verbose) { 2425 if (! KMP_AFFINITY_CAPABLE()) { 2426 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2427 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2428 if (uniform) { 2429 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2430 } else { 2431 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2432 } 2433 } 2434 else { 2435 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2436 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2437 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2438 if (__kmp_affinity_respect_mask) { 2439 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2440 } else { 2441 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2442 } 2443 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2444 if (uniform) { 2445 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2446 } else { 2447 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2448 } 2449 } 2450 kmp_str_buf_t buf; 2451 __kmp_str_buf_init(&buf); 2452 2453 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2454 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2455 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2456 } 2457 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2458 maxCt[threadIdIndex], __kmp_ncores); 2459 2460 __kmp_str_buf_free(&buf); 2461 } 2462 2463 # if KMP_MIC && REDUCE_TEAM_SIZE 2464 // 2465 // Set the default team size. 2466 // 2467 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2468 __kmp_dflt_team_nth = teamSize; 2469 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2470 __kmp_dflt_team_nth)); 2471 } 2472 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2473 2474 if (__kmp_affinity_type == affinity_none) { 2475 __kmp_free(lastId); 2476 __kmp_free(totals); 2477 __kmp_free(maxCt); 2478 __kmp_free(counts); 2479 CLEANUP_THREAD_INFO; 2480 return 0; 2481 } 2482 2483 // 2484 // Count the number of levels which have more nodes at that level than 2485 // at the parent's level (with there being an implicit root node of 2486 // the top level). This is equivalent to saying that there is at least 2487 // one node at this level which has a sibling. These levels are in the 2488 // map, and the package level is always in the map. 
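//
// Illustrative example (hypothetical, no node_<n> levels): with
// totals[threadIdIndex] == 8, totals[coreIdIndex] == 8 and
// totals[pkgIdIndex] == 2, the thread level adds no nodes beyond the core
// level, so it is left out of the map and the resulting depth is 2
// (package and core only).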
2489 // 2490 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2491 int level = 0; 2492 for (index = threadIdIndex; index < maxIndex; index++) { 2493 KMP_ASSERT(totals[index] >= totals[index + 1]); 2494 inMap[index] = (totals[index] > totals[index + 1]); 2495 } 2496 inMap[maxIndex] = (totals[maxIndex] > 1); 2497 inMap[pkgIdIndex] = true; 2498 2499 int depth = 0; 2500 for (index = threadIdIndex; index <= maxIndex; index++) { 2501 if (inMap[index]) { 2502 depth++; 2503 } 2504 } 2505 KMP_ASSERT(depth > 0); 2506 2507 // 2508 // Construct the data structure that is to be returned. 2509 // 2510 *address2os = (AddrUnsPair*) 2511 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2512 int pkgLevel = -1; 2513 int coreLevel = -1; 2514 int threadLevel = -1; 2515 2516 for (i = 0; i < num_avail; ++i) { 2517 Address addr(depth); 2518 unsigned os = threadInfo[i][osIdIndex]; 2519 int src_index; 2520 int dst_index = 0; 2521 2522 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2523 if (! inMap[src_index]) { 2524 continue; 2525 } 2526 addr.labels[dst_index] = threadInfo[i][src_index]; 2527 if (src_index == pkgIdIndex) { 2528 pkgLevel = dst_index; 2529 } 2530 else if (src_index == coreIdIndex) { 2531 coreLevel = dst_index; 2532 } 2533 else if (src_index == threadIdIndex) { 2534 threadLevel = dst_index; 2535 } 2536 dst_index++; 2537 } 2538 (*address2os)[i] = AddrUnsPair(addr, os); 2539 } 2540 2541 if (__kmp_affinity_gran_levels < 0) { 2542 // 2543 // Set the granularity level based on what levels are modeled 2544 // in the machine topology map. 2545 // 2546 unsigned src_index; 2547 __kmp_affinity_gran_levels = 0; 2548 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2549 if (! inMap[src_index]) { 2550 continue; 2551 } 2552 switch (src_index) { 2553 case threadIdIndex: 2554 if (__kmp_affinity_gran > affinity_gran_thread) { 2555 __kmp_affinity_gran_levels++; 2556 } 2557 2558 break; 2559 case coreIdIndex: 2560 if (__kmp_affinity_gran > affinity_gran_core) { 2561 __kmp_affinity_gran_levels++; 2562 } 2563 break; 2564 2565 case pkgIdIndex: 2566 if (__kmp_affinity_gran > affinity_gran_package) { 2567 __kmp_affinity_gran_levels++; 2568 } 2569 break; 2570 } 2571 } 2572 } 2573 2574 if (__kmp_affinity_verbose) { 2575 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2576 coreLevel, threadLevel); 2577 } 2578 2579 __kmp_free(inMap); 2580 __kmp_free(lastId); 2581 __kmp_free(totals); 2582 __kmp_free(maxCt); 2583 __kmp_free(counts); 2584 CLEANUP_THREAD_INFO; 2585 return depth; 2586 } 2587 2588 2589 // 2590 // Create and return a table of affinity masks, indexed by OS thread ID. 2591 // This routine handles OR'ing together all the affinity masks of threads 2592 // that are sufficiently close, if granularity > fine. 2593 // 2594 static kmp_affin_mask_t * 2595 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2596 AddrUnsPair *address2os, unsigned numAddrs) 2597 { 2598 // 2599 // First form a table of affinity masks in order of OS thread id. 2600 // 2601 unsigned depth; 2602 unsigned maxOsId; 2603 unsigned i; 2604 2605 KMP_ASSERT(numAddrs > 0); 2606 depth = address2os[0].first.depth; 2607 2608 maxOsId = 0; 2609 for (i = 0; i < numAddrs; i++) { 2610 unsigned osId = address2os[i].second; 2611 if (osId > maxOsId) { 2612 maxOsId = osId; 2613 } 2614 } 2615 kmp_affin_mask_t *osId2Mask; 2616 KMP_CPU_ALLOC_ARRAY(osId2Mask, (maxOsId+1)); 2617 2618 // 2619 // Sort the address2os table according to physical order. 
Doing so 2620 // will put all threads on the same core/package/node in consecutive 2621 // locations. 2622 // 2623 qsort(address2os, numAddrs, sizeof(*address2os), 2624 __kmp_affinity_cmp_Address_labels); 2625 2626 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2627 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2628 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2629 } 2630 if (__kmp_affinity_gran_levels >= (int)depth) { 2631 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2632 && (__kmp_affinity_type != affinity_none))) { 2633 KMP_WARNING(AffThreadsMayMigrate); 2634 } 2635 } 2636 2637 // 2638 // Run through the table, forming the masks for all threads on each 2639 // core. Threads on the same core will have identical "Address" 2640 // objects, not considering the last level, which must be the thread 2641 // id. All threads on a core will appear consecutively. 2642 // 2643 unsigned unique = 0; 2644 unsigned j = 0; // index of 1st thread on core 2645 unsigned leader = 0; 2646 Address *leaderAddr = &(address2os[0].first); 2647 kmp_affin_mask_t *sum; 2648 KMP_CPU_ALLOC_ON_STACK(sum); 2649 KMP_CPU_ZERO(sum); 2650 KMP_CPU_SET(address2os[0].second, sum); 2651 for (i = 1; i < numAddrs; i++) { 2652 // 2653 // If this thread is sufficiently close to the leader (within the 2654 // granularity setting), then set the bit for this os thread in the 2655 // affinity mask for this group, and go on to the next thread. 2656 // 2657 if (leaderAddr->isClose(address2os[i].first, 2658 __kmp_affinity_gran_levels)) { 2659 KMP_CPU_SET(address2os[i].second, sum); 2660 continue; 2661 } 2662 2663 // 2664 // For every thread in this group, copy the mask to the thread's 2665 // entry in the osId2Mask table. Mark the first address as a 2666 // leader. 2667 // 2668 for (; j < i; j++) { 2669 unsigned osId = address2os[j].second; 2670 KMP_DEBUG_ASSERT(osId <= maxOsId); 2671 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2672 KMP_CPU_COPY(mask, sum); 2673 address2os[j].first.leader = (j == leader); 2674 } 2675 unique++; 2676 2677 // 2678 // Start a new mask. 2679 // 2680 leader = i; 2681 leaderAddr = &(address2os[i].first); 2682 KMP_CPU_ZERO(sum); 2683 KMP_CPU_SET(address2os[i].second, sum); 2684 } 2685 2686 // 2687 // For every thread in last group, copy the mask to the thread's 2688 // entry in the osId2Mask table. 2689 // 2690 for (; j < i; j++) { 2691 unsigned osId = address2os[j].second; 2692 KMP_DEBUG_ASSERT(osId <= maxOsId); 2693 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2694 KMP_CPU_COPY(mask, sum); 2695 address2os[j].first.leader = (j == leader); 2696 } 2697 unique++; 2698 KMP_CPU_FREE_FROM_STACK(sum); 2699 2700 *maxIndex = maxOsId; 2701 *numUnique = unique; 2702 return osId2Mask; 2703 } 2704 2705 2706 // 2707 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2708 // as file-static than to try and pass them through the calling sequence of 2709 // the recursive-descent OMP_PLACES parser. 
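//
// (For illustration, a KMP_AFFINITY proclist such as "0,2-6:2,{8,9}" --
// hypothetical -- is expanded by the parser below into one mask per single
// proc and per range element, i.e. {0}, {2}, {4} and {6}, plus a single
// mask {8,9} for the braced set.)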
2710 // 2711 static kmp_affin_mask_t *newMasks; 2712 static int numNewMasks; 2713 static int nextNewMask; 2714 2715 #define ADD_MASK(_mask) \ 2716 { \ 2717 if (nextNewMask >= numNewMasks) { \ 2718 int i; \ 2719 numNewMasks *= 2; \ 2720 kmp_affin_mask_t* temp; \ 2721 KMP_CPU_INTERNAL_ALLOC_ARRAY(temp, numNewMasks); \ 2722 for(i=0;i<numNewMasks/2;i++) { \ 2723 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); \ 2724 kmp_affin_mask_t* dest = KMP_CPU_INDEX(temp, i); \ 2725 KMP_CPU_COPY(dest, src); \ 2726 } \ 2727 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks/2); \ 2728 newMasks = temp; \ 2729 } \ 2730 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2731 nextNewMask++; \ 2732 } 2733 2734 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2735 { \ 2736 if (((_osId) > _maxOsId) || \ 2737 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2738 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2739 && (__kmp_affinity_type != affinity_none))) { \ 2740 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2741 } \ 2742 } \ 2743 else { \ 2744 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2745 } \ 2746 } 2747 2748 2749 // 2750 // Re-parse the proclist (for the explicit affinity type), and form the list 2751 // of affinity newMasks indexed by gtid. 2752 // 2753 static void 2754 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2755 unsigned int *out_numMasks, const char *proclist, 2756 kmp_affin_mask_t *osId2Mask, int maxOsId) 2757 { 2758 int i; 2759 const char *scan = proclist; 2760 const char *next = proclist; 2761 2762 // 2763 // We use malloc() for the temporary mask vector, 2764 // so that we can use realloc() to extend it. 2765 // 2766 numNewMasks = 2; 2767 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 2768 nextNewMask = 0; 2769 kmp_affin_mask_t *sumMask; 2770 KMP_CPU_ALLOC(sumMask); 2771 int setSize = 0; 2772 2773 for (;;) { 2774 int start, end, stride; 2775 2776 SKIP_WS(scan); 2777 next = scan; 2778 if (*next == '\0') { 2779 break; 2780 } 2781 2782 if (*next == '{') { 2783 int num; 2784 setSize = 0; 2785 next++; // skip '{' 2786 SKIP_WS(next); 2787 scan = next; 2788 2789 // 2790 // Read the first integer in the set. 2791 // 2792 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2793 "bad proclist"); 2794 SKIP_DIGITS(next); 2795 num = __kmp_str_to_int(scan, *next); 2796 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2797 2798 // 2799 // Copy the mask for that osId to the sum (union) mask. 2800 // 2801 if ((num > maxOsId) || 2802 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2803 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2804 && (__kmp_affinity_type != affinity_none))) { 2805 KMP_WARNING(AffIgnoreInvalidProcID, num); 2806 } 2807 KMP_CPU_ZERO(sumMask); 2808 } 2809 else { 2810 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2811 setSize = 1; 2812 } 2813 2814 for (;;) { 2815 // 2816 // Check for end of set. 2817 // 2818 SKIP_WS(next); 2819 if (*next == '}') { 2820 next++; // skip '}' 2821 break; 2822 } 2823 2824 // 2825 // Skip optional comma. 2826 // 2827 if (*next == ',') { 2828 next++; 2829 } 2830 SKIP_WS(next); 2831 2832 // 2833 // Read the next integer in the set. 2834 // 2835 scan = next; 2836 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2837 "bad explicit proc list"); 2838 2839 SKIP_DIGITS(next); 2840 num = __kmp_str_to_int(scan, *next); 2841 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2842 2843 // 2844 // Add the mask for that osId to the sum mask. 2845 // 2846 if ((num > maxOsId) || 2847 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2848 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2849 && (__kmp_affinity_type != affinity_none))) { 2850 KMP_WARNING(AffIgnoreInvalidProcID, num); 2851 } 2852 } 2853 else { 2854 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2855 setSize++; 2856 } 2857 } 2858 if (setSize > 0) { 2859 ADD_MASK(sumMask); 2860 } 2861 2862 SKIP_WS(next); 2863 if (*next == ',') { 2864 next++; 2865 } 2866 scan = next; 2867 continue; 2868 } 2869 2870 // 2871 // Read the first integer. 2872 // 2873 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2874 SKIP_DIGITS(next); 2875 start = __kmp_str_to_int(scan, *next); 2876 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2877 SKIP_WS(next); 2878 2879 // 2880 // If this isn't a range, then add a mask to the list and go on. 2881 // 2882 if (*next != '-') { 2883 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2884 2885 // 2886 // Skip optional comma. 2887 // 2888 if (*next == ',') { 2889 next++; 2890 } 2891 scan = next; 2892 continue; 2893 } 2894 2895 // 2896 // This is a range. Skip over the '-' and read in the 2nd int. 2897 // 2898 next++; // skip '-' 2899 SKIP_WS(next); 2900 scan = next; 2901 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2902 SKIP_DIGITS(next); 2903 end = __kmp_str_to_int(scan, *next); 2904 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2905 2906 // 2907 // Check for a stride parameter 2908 // 2909 stride = 1; 2910 SKIP_WS(next); 2911 if (*next == ':') { 2912 // 2913 // A stride is specified. Skip over the ':" and read the 3rd int. 2914 // 2915 int sign = +1; 2916 next++; // skip ':' 2917 SKIP_WS(next); 2918 scan = next; 2919 if (*next == '-') { 2920 sign = -1; 2921 next++; 2922 SKIP_WS(next); 2923 scan = next; 2924 } 2925 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2926 "bad explicit proc list"); 2927 SKIP_DIGITS(next); 2928 stride = __kmp_str_to_int(scan, *next); 2929 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2930 stride *= sign; 2931 } 2932 2933 // 2934 // Do some range checks. 2935 // 2936 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2937 if (stride > 0) { 2938 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2939 } 2940 else { 2941 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2942 } 2943 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2944 2945 // 2946 // Add the mask for each OS proc # to the list. 2947 // 2948 if (stride > 0) { 2949 do { 2950 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2951 start += stride; 2952 } while (start <= end); 2953 } 2954 else { 2955 do { 2956 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2957 start += stride; 2958 } while (start >= end); 2959 } 2960 2961 // 2962 // Skip optional comma. 
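// (Illustration of the range expansion above: a hypothetical entry "3-11:4"
// adds three separate masks, for OS procs 3, 7 and 11; with a negative
// stride such as "11-3:-4" the same procs are added in reverse order.)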
2963 // 2964 SKIP_WS(next); 2965 if (*next == ',') { 2966 next++; 2967 } 2968 scan = next; 2969 } 2970 2971 *out_numMasks = nextNewMask; 2972 if (nextNewMask == 0) { 2973 *out_masks = NULL; 2974 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2975 return; 2976 } 2977 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 2978 for(i = 0; i < nextNewMask; i++) { 2979 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 2980 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 2981 KMP_CPU_COPY(dest, src); 2982 } 2983 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 2984 KMP_CPU_FREE(sumMask); 2985 } 2986 2987 2988 # if OMP_40_ENABLED 2989 2990 /*----------------------------------------------------------------------------- 2991 2992 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2993 places. Again, Here is the grammar: 2994 2995 place_list := place 2996 place_list := place , place_list 2997 place := num 2998 place := place : num 2999 place := place : num : signed 3000 place := { subplacelist } 3001 place := ! place // (lowest priority) 3002 subplace_list := subplace 3003 subplace_list := subplace , subplace_list 3004 subplace := num 3005 subplace := num : num 3006 subplace := num : num : signed 3007 signed := num 3008 signed := + signed 3009 signed := - signed 3010 3011 -----------------------------------------------------------------------------*/ 3012 3013 static void 3014 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 3015 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3016 { 3017 const char *next; 3018 3019 for (;;) { 3020 int start, count, stride, i; 3021 3022 // 3023 // Read in the starting proc id 3024 // 3025 SKIP_WS(*scan); 3026 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3027 "bad explicit places list"); 3028 next = *scan; 3029 SKIP_DIGITS(next); 3030 start = __kmp_str_to_int(*scan, *next); 3031 KMP_ASSERT(start >= 0); 3032 *scan = next; 3033 3034 // 3035 // valid follow sets are ',' ':' and '}' 3036 // 3037 SKIP_WS(*scan); 3038 if (**scan == '}' || **scan == ',') { 3039 if ((start > maxOsId) || 3040 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3041 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3042 && (__kmp_affinity_type != affinity_none))) { 3043 KMP_WARNING(AffIgnoreInvalidProcID, start); 3044 } 3045 } 3046 else { 3047 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3048 (*setSize)++; 3049 } 3050 if (**scan == '}') { 3051 break; 3052 } 3053 (*scan)++; // skip ',' 3054 continue; 3055 } 3056 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3057 (*scan)++; // skip ':' 3058 3059 // 3060 // Read count parameter 3061 // 3062 SKIP_WS(*scan); 3063 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3064 "bad explicit places list"); 3065 next = *scan; 3066 SKIP_DIGITS(next); 3067 count = __kmp_str_to_int(*scan, *next); 3068 KMP_ASSERT(count >= 0); 3069 *scan = next; 3070 3071 // 3072 // valid follow sets are ',' ':' and '}' 3073 // 3074 SKIP_WS(*scan); 3075 if (**scan == '}' || **scan == ',') { 3076 for (i = 0; i < count; i++) { 3077 if ((start > maxOsId) || 3078 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3079 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3080 && (__kmp_affinity_type != affinity_none))) { 3081 KMP_WARNING(AffIgnoreInvalidProcID, start); 3082 } 3083 break; // don't proliferate warnings for large count 3084 } 3085 else { 3086 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3087 start++; 3088 (*setSize)++; 3089 } 3090 } 3091 if (**scan == '}') { 3092 break; 3093 } 3094 (*scan)++; // skip ',' 3095 continue; 3096 } 3097 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3098 (*scan)++; // skip ':' 3099 3100 // 3101 // Read stride parameter 3102 // 3103 int sign = +1; 3104 for (;;) { 3105 SKIP_WS(*scan); 3106 if (**scan == '+') { 3107 (*scan)++; // skip '+' 3108 continue; 3109 } 3110 if (**scan == '-') { 3111 sign *= -1; 3112 (*scan)++; // skip '-' 3113 continue; 3114 } 3115 break; 3116 } 3117 SKIP_WS(*scan); 3118 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3119 "bad explicit places list"); 3120 next = *scan; 3121 SKIP_DIGITS(next); 3122 stride = __kmp_str_to_int(*scan, *next); 3123 KMP_ASSERT(stride >= 0); 3124 *scan = next; 3125 stride *= sign; 3126 3127 // 3128 // valid follow sets are ',' and '}' 3129 // 3130 SKIP_WS(*scan); 3131 if (**scan == '}' || **scan == ',') { 3132 for (i = 0; i < count; i++) { 3133 if ((start > maxOsId) || 3134 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3135 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3136 && (__kmp_affinity_type != affinity_none))) { 3137 KMP_WARNING(AffIgnoreInvalidProcID, start); 3138 } 3139 break; // don't proliferate warnings for large count 3140 } 3141 else { 3142 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3143 start += stride; 3144 (*setSize)++; 3145 } 3146 } 3147 if (**scan == '}') { 3148 break; 3149 } 3150 (*scan)++; // skip ',' 3151 continue; 3152 } 3153 3154 KMP_ASSERT2(0, "bad explicit places list"); 3155 } 3156 } 3157 3158 3159 static void 3160 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3161 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3162 { 3163 const char *next; 3164 3165 // 3166 // valid follow sets are '{' '!' and num 3167 // 3168 SKIP_WS(*scan); 3169 if (**scan == '{') { 3170 (*scan)++; // skip '{' 3171 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3172 setSize); 3173 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3174 (*scan)++; // skip '}' 3175 } 3176 else if (**scan == '!') { 3177 (*scan)++; // skip '!' 3178 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3179 KMP_CPU_COMPLEMENT(maxOsId, tempMask); 3180 } 3181 else if ((**scan >= '0') && (**scan <= '9')) { 3182 next = *scan; 3183 SKIP_DIGITS(next); 3184 int num = __kmp_str_to_int(*scan, *next); 3185 KMP_ASSERT(num >= 0); 3186 if ((num > maxOsId) || 3187 (! 
KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3188 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3189 && (__kmp_affinity_type != affinity_none))) { 3190 KMP_WARNING(AffIgnoreInvalidProcID, num); 3191 } 3192 } 3193 else { 3194 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3195 (*setSize)++; 3196 } 3197 *scan = next; // skip num 3198 } 3199 else { 3200 KMP_ASSERT2(0, "bad explicit places list"); 3201 } 3202 } 3203 3204 3205 //static void 3206 void 3207 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3208 unsigned int *out_numMasks, const char *placelist, 3209 kmp_affin_mask_t *osId2Mask, int maxOsId) 3210 { 3211 int i,j,count,stride,sign; 3212 const char *scan = placelist; 3213 const char *next = placelist; 3214 3215 numNewMasks = 2; 3216 KMP_CPU_INTERNAL_ALLOC_ARRAY(newMasks, numNewMasks); 3217 nextNewMask = 0; 3218 3219 // tempMask is modified based on the previous or initial 3220 // place to form the current place 3221 // previousMask contains the previous place 3222 kmp_affin_mask_t *tempMask; 3223 kmp_affin_mask_t *previousMask; 3224 KMP_CPU_ALLOC(tempMask); 3225 KMP_CPU_ZERO(tempMask); 3226 KMP_CPU_ALLOC(previousMask); 3227 KMP_CPU_ZERO(previousMask); 3228 int setSize = 0; 3229 3230 for (;;) { 3231 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3232 3233 // 3234 // valid follow sets are ',' ':' and EOL 3235 // 3236 SKIP_WS(scan); 3237 if (*scan == '\0' || *scan == ',') { 3238 if (setSize > 0) { 3239 ADD_MASK(tempMask); 3240 } 3241 KMP_CPU_ZERO(tempMask); 3242 setSize = 0; 3243 if (*scan == '\0') { 3244 break; 3245 } 3246 scan++; // skip ',' 3247 continue; 3248 } 3249 3250 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3251 scan++; // skip ':' 3252 3253 // 3254 // Read count parameter 3255 // 3256 SKIP_WS(scan); 3257 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3258 "bad explicit places list"); 3259 next = scan; 3260 SKIP_DIGITS(next); 3261 count = __kmp_str_to_int(scan, *next); 3262 KMP_ASSERT(count >= 0); 3263 scan = next; 3264 3265 // 3266 // valid follow sets are ',' ':' and EOL 3267 // 3268 SKIP_WS(scan); 3269 if (*scan == '\0' || *scan == ',') { 3270 stride = +1; 3271 } 3272 else { 3273 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3274 scan++; // skip ':' 3275 3276 // 3277 // Read stride parameter 3278 // 3279 sign = +1; 3280 for (;;) { 3281 SKIP_WS(scan); 3282 if (*scan == '+') { 3283 scan++; // skip '+' 3284 continue; 3285 } 3286 if (*scan == '-') { 3287 sign *= -1; 3288 scan++; // skip '-' 3289 continue; 3290 } 3291 break; 3292 } 3293 SKIP_WS(scan); 3294 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3295 "bad explicit places list"); 3296 next = scan; 3297 SKIP_DIGITS(next); 3298 stride = __kmp_str_to_int(scan, *next); 3299 KMP_DEBUG_ASSERT(stride >= 0); 3300 scan = next; 3301 stride *= sign; 3302 } 3303 3304 // Add places determined by initial_place : count : stride 3305 for (i = 0; i < count; i++) { 3306 if (setSize == 0) { 3307 break; 3308 } 3309 // Add the current place, then build the next place (tempMask) from that 3310 KMP_CPU_COPY(previousMask, tempMask); 3311 ADD_MASK(previousMask); 3312 KMP_CPU_ZERO(tempMask); 3313 setSize = 0; 3314 KMP_CPU_SET_ITERATE(j, previousMask) { 3315 if (! KMP_CPU_ISSET(j, previousMask)) { 3316 continue; 3317 } 3318 else if ((j+stride > maxOsId) || (j+stride < 0) || 3319 (! 
KMP_CPU_ISSET(j+stride, KMP_CPU_INDEX(osId2Mask, j+stride)))) { 3320 if ((__kmp_affinity_verbose || (__kmp_affinity_warnings 3321 && (__kmp_affinity_type != affinity_none))) && i < count - 1) { 3322 KMP_WARNING(AffIgnoreInvalidProcID, j+stride); 3323 } 3324 } 3325 else { 3326 KMP_CPU_SET(j+stride, tempMask); 3327 setSize++; 3328 } 3329 } 3330 } 3331 KMP_CPU_ZERO(tempMask); 3332 setSize = 0; 3333 3334 // 3335 // valid follow sets are ',' and EOL 3336 // 3337 SKIP_WS(scan); 3338 if (*scan == '\0') { 3339 break; 3340 } 3341 if (*scan == ',') { 3342 scan++; // skip ',' 3343 continue; 3344 } 3345 3346 KMP_ASSERT2(0, "bad explicit places list"); 3347 } 3348 3349 *out_numMasks = nextNewMask; 3350 if (nextNewMask == 0) { 3351 *out_masks = NULL; 3352 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3353 return; 3354 } 3355 KMP_CPU_ALLOC_ARRAY((*out_masks), nextNewMask); 3356 KMP_CPU_FREE(tempMask); 3357 KMP_CPU_FREE(previousMask); 3358 for(i = 0; i < nextNewMask; i++) { 3359 kmp_affin_mask_t* src = KMP_CPU_INDEX(newMasks, i); 3360 kmp_affin_mask_t* dest = KMP_CPU_INDEX((*out_masks), i); 3361 KMP_CPU_COPY(dest, src); 3362 } 3363 KMP_CPU_INTERNAL_FREE_ARRAY(newMasks, numNewMasks); 3364 } 3365 3366 # endif /* OMP_40_ENABLED */ 3367 3368 #undef ADD_MASK 3369 #undef ADD_MASK_OSID 3370 3371 static void 3372 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3373 { 3374 if (__kmp_place_num_sockets == 0 && 3375 __kmp_place_num_cores == 0 && 3376 __kmp_place_num_threads_per_core == 0 ) 3377 return; // no topology limiting actions requested, exit 3378 if (__kmp_place_num_sockets == 0) 3379 __kmp_place_num_sockets = nPackages; // use all available sockets 3380 if (__kmp_place_num_cores == 0) 3381 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3382 if (__kmp_place_num_threads_per_core == 0 || 3383 __kmp_place_num_threads_per_core > __kmp_nThreadsPerCore) 3384 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3385 3386 if ( !__kmp_affinity_uniform_topology() ) { 3387 KMP_WARNING( AffThrPlaceNonUniform ); 3388 return; // don't support non-uniform topology 3389 } 3390 if ( depth != 3 ) { 3391 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3392 return; // don't support not-3-level topology 3393 } 3394 if (__kmp_place_socket_offset + __kmp_place_num_sockets > nPackages) { 3395 KMP_WARNING(AffThrPlaceManySockets); 3396 return; 3397 } 3398 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3399 KMP_WARNING( AffThrPlaceManyCores ); 3400 return; 3401 } 3402 3403 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3404 __kmp_place_num_sockets * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3405 3406 int i, j, k, n_old = 0, n_new = 0; 3407 for (i = 0; i < nPackages; ++i) 3408 if (i < __kmp_place_socket_offset || 3409 i >= __kmp_place_socket_offset + __kmp_place_num_sockets) 3410 n_old += nCoresPerPkg * __kmp_nThreadsPerCore; // skip not-requested socket 3411 else 3412 for (j = 0; j < nCoresPerPkg; ++j) // walk through requested socket 3413 if (j < __kmp_place_core_offset || 3414 j >= __kmp_place_core_offset + __kmp_place_num_cores) 3415 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3416 else 3417 for (k = 0; k < __kmp_nThreadsPerCore; ++k) { // walk through requested core 3418 if (k < __kmp_place_num_threads_per_core) { 3419 newAddr[n_new] = (*pAddr)[n_old]; // collect requested thread's data 3420 n_new++; 3421 } 3422 n_old++; 3423 } 3424 KMP_DEBUG_ASSERT(n_old == nPackages * nCoresPerPkg * 
__kmp_nThreadsPerCore); 3425 KMP_DEBUG_ASSERT(n_new == __kmp_place_num_sockets * __kmp_place_num_cores * 3426 __kmp_place_num_threads_per_core); 3427 3428 nPackages = __kmp_place_num_sockets; // correct nPackages 3429 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3430 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3431 __kmp_avail_proc = n_new; // correct avail_proc 3432 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3433 3434 __kmp_free( *pAddr ); 3435 *pAddr = newAddr; // replace old topology with new one 3436 } 3437 3438 3439 static AddrUnsPair *address2os = NULL; 3440 static int * procarr = NULL; 3441 static int __kmp_aff_depth = 0; 3442 3443 static void 3444 __kmp_aux_affinity_initialize(void) 3445 { 3446 if (__kmp_affinity_masks != NULL) { 3447 KMP_ASSERT(fullMask != NULL); 3448 return; 3449 } 3450 3451 // 3452 // Create the "full" mask - this defines all of the processors that we 3453 // consider to be in the machine model. If respect is set, then it is 3454 // the initialization thread's affinity mask. Otherwise, it is all 3455 // processors that we know about on the machine. 3456 // 3457 if (fullMask == NULL) { 3458 KMP_CPU_ALLOC(fullMask); 3459 } 3460 if (KMP_AFFINITY_CAPABLE()) { 3461 if (__kmp_affinity_respect_mask) { 3462 __kmp_get_system_affinity(fullMask, TRUE); 3463 3464 // 3465 // Count the number of available processors. 3466 // 3467 unsigned i; 3468 __kmp_avail_proc = 0; 3469 KMP_CPU_SET_ITERATE(i, fullMask) { 3470 if (! KMP_CPU_ISSET(i, fullMask)) { 3471 continue; 3472 } 3473 __kmp_avail_proc++; 3474 } 3475 if (__kmp_avail_proc > __kmp_xproc) { 3476 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3477 && (__kmp_affinity_type != affinity_none))) { 3478 KMP_WARNING(ErrorInitializeAffinity); 3479 } 3480 __kmp_affinity_type = affinity_none; 3481 KMP_AFFINITY_DISABLE(); 3482 return; 3483 } 3484 } 3485 else { 3486 __kmp_affinity_entire_machine_mask(fullMask); 3487 __kmp_avail_proc = __kmp_xproc; 3488 } 3489 } 3490 3491 int depth = -1; 3492 kmp_i18n_id_t msg_id = kmp_i18n_null; 3493 3494 // 3495 // For backward compatibility, setting KMP_CPUINFO_FILE => 3496 // KMP_TOPOLOGY_METHOD=cpuinfo 3497 // 3498 if ((__kmp_cpuinfo_file != NULL) && 3499 (__kmp_affinity_top_method == affinity_top_method_all)) { 3500 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3501 } 3502 3503 if (__kmp_affinity_top_method == affinity_top_method_all) { 3504 // 3505 // In the default code path, errors are not fatal - we just try using 3506 // another method. We only emit a warning message if affinity is on, 3507 // or the verbose flag is set, and the nowarnings flag was not set.
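// In this default path the discovery methods below are tried in order --
// hwloc (when KMP_USE_HWLOC is enabled), the x2APIC leaf 11 decoder and the
// legacy APIC decoder on x86, /proc/cpuinfo parsing on Linux, Windows
// processor groups, and finally the flat OS-proc map -- falling through to
// the next method whenever one is unavailable or returns a negative depth.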
3508 // 3509 const char *file_name = NULL; 3510 int line = 0; 3511 # if KMP_USE_HWLOC 3512 if (depth < 0) { 3513 if (__kmp_affinity_verbose) { 3514 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 3515 } 3516 if(!__kmp_hwloc_error) { 3517 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 3518 if (depth == 0) { 3519 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3520 KMP_ASSERT(address2os == NULL); 3521 return; 3522 } else if(depth < 0 && __kmp_affinity_verbose) { 3523 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3524 } 3525 } else if(__kmp_affinity_verbose) { 3526 KMP_INFORM(AffIgnoringHwloc, "KMP_AFFINITY"); 3527 } 3528 } 3529 # endif 3530 3531 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3532 3533 if (depth < 0) { 3534 if (__kmp_affinity_verbose) { 3535 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3536 } 3537 3538 file_name = NULL; 3539 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3540 if (depth == 0) { 3541 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3542 KMP_ASSERT(address2os == NULL); 3543 return; 3544 } 3545 3546 if (depth < 0) { 3547 if (__kmp_affinity_verbose) { 3548 if (msg_id != kmp_i18n_null) { 3549 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3550 KMP_I18N_STR(DecodingLegacyAPIC)); 3551 } 3552 else { 3553 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3554 } 3555 } 3556 3557 file_name = NULL; 3558 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3559 if (depth == 0) { 3560 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3561 KMP_ASSERT(address2os == NULL); 3562 return; 3563 } 3564 } 3565 } 3566 3567 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3568 3569 # if KMP_OS_LINUX 3570 3571 if (depth < 0) { 3572 if (__kmp_affinity_verbose) { 3573 if (msg_id != kmp_i18n_null) { 3574 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3575 } 3576 else { 3577 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3578 } 3579 } 3580 3581 FILE *f = fopen("/proc/cpuinfo", "r"); 3582 if (f == NULL) { 3583 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3584 } 3585 else { 3586 file_name = "/proc/cpuinfo"; 3587 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3588 fclose(f); 3589 if (depth == 0) { 3590 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3591 KMP_ASSERT(address2os == NULL); 3592 return; 3593 } 3594 } 3595 } 3596 3597 # endif /* KMP_OS_LINUX */ 3598 3599 # if KMP_GROUP_AFFINITY 3600 3601 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3602 if (__kmp_affinity_verbose) { 3603 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3604 } 3605 3606 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3607 KMP_ASSERT(depth != 0); 3608 } 3609 3610 # endif /* KMP_GROUP_AFFINITY */ 3611 3612 if (depth < 0) { 3613 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3614 if (file_name == NULL) { 3615 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3616 } 3617 else if (line == 0) { 3618 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3619 } 3620 else { 3621 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3622 } 3623 } 3624 // FIXME - print msg if msg_id = kmp_i18n_null ??? 
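// As the last resort, fall back to the flat map, which models the machine as
// a single level of OS procs; the asserts below only allow it to return 0
// when affinity is "none".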
3625 3626 file_name = ""; 3627 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3628 if (depth == 0) { 3629 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3630 KMP_ASSERT(address2os == NULL); 3631 return; 3632 } 3633 KMP_ASSERT(depth > 0); 3634 KMP_ASSERT(address2os != NULL); 3635 } 3636 } 3637 3638 // 3639 // If the user has specified that a paricular topology discovery method 3640 // is to be used, then we abort if that method fails. The exception is 3641 // group affinity, which might have been implicitly set. 3642 // 3643 3644 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3645 3646 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3647 if (__kmp_affinity_verbose) { 3648 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3649 KMP_I18N_STR(Decodingx2APIC)); 3650 } 3651 3652 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3653 if (depth == 0) { 3654 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3655 KMP_ASSERT(address2os == NULL); 3656 return; 3657 } 3658 if (depth < 0) { 3659 KMP_ASSERT(msg_id != kmp_i18n_null); 3660 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3661 } 3662 } 3663 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3664 if (__kmp_affinity_verbose) { 3665 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3666 KMP_I18N_STR(DecodingLegacyAPIC)); 3667 } 3668 3669 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3670 if (depth == 0) { 3671 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3672 KMP_ASSERT(address2os == NULL); 3673 return; 3674 } 3675 if (depth < 0) { 3676 KMP_ASSERT(msg_id != kmp_i18n_null); 3677 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3678 } 3679 } 3680 3681 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3682 3683 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3684 const char *filename; 3685 if (__kmp_cpuinfo_file != NULL) { 3686 filename = __kmp_cpuinfo_file; 3687 } 3688 else { 3689 filename = "/proc/cpuinfo"; 3690 } 3691 3692 if (__kmp_affinity_verbose) { 3693 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3694 } 3695 3696 FILE *f = fopen(filename, "r"); 3697 if (f == NULL) { 3698 int code = errno; 3699 if (__kmp_cpuinfo_file != NULL) { 3700 __kmp_msg( 3701 kmp_ms_fatal, 3702 KMP_MSG(CantOpenFileForReading, filename), 3703 KMP_ERR(code), 3704 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3705 __kmp_msg_null 3706 ); 3707 } 3708 else { 3709 __kmp_msg( 3710 kmp_ms_fatal, 3711 KMP_MSG(CantOpenFileForReading, filename), 3712 KMP_ERR(code), 3713 __kmp_msg_null 3714 ); 3715 } 3716 } 3717 int line = 0; 3718 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3719 fclose(f); 3720 if (depth < 0) { 3721 KMP_ASSERT(msg_id != kmp_i18n_null); 3722 if (line > 0) { 3723 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3724 } 3725 else { 3726 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3727 } 3728 } 3729 if (__kmp_affinity_type == affinity_none) { 3730 KMP_ASSERT(depth == 0); 3731 KMP_ASSERT(address2os == NULL); 3732 return; 3733 } 3734 } 3735 3736 # if KMP_GROUP_AFFINITY 3737 3738 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3739 if (__kmp_affinity_verbose) { 3740 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3741 } 3742 3743 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3744 KMP_ASSERT(depth != 0); 3745 if (depth < 0) { 3746 KMP_ASSERT(msg_id != kmp_i18n_null); 3747 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3748 } 3749 } 3750 3751 # endif /* 
KMP_GROUP_AFFINITY */ 3752 3753 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3754 if (__kmp_affinity_verbose) { 3755 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3756 } 3757 3758 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3759 if (depth == 0) { 3760 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3761 KMP_ASSERT(address2os == NULL); 3762 return; 3763 } 3764 // should not fail 3765 KMP_ASSERT(depth > 0); 3766 KMP_ASSERT(address2os != NULL); 3767 } 3768 3769 # if KMP_USE_HWLOC 3770 else if (__kmp_affinity_top_method == affinity_top_method_hwloc) { 3771 if (__kmp_affinity_verbose) { 3772 KMP_INFORM(AffUsingHwloc, "KMP_AFFINITY"); 3773 } 3774 depth = __kmp_affinity_create_hwloc_map(&address2os, &msg_id); 3775 if (depth == 0) { 3776 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3777 KMP_ASSERT(address2os == NULL); 3778 return; 3779 } 3780 # if KMP_DEBUG 3781 AddrUnsPair *otheraddress2os = NULL; 3782 int otherdepth = -1; 3783 # if KMP_MIC 3784 otherdepth = __kmp_affinity_create_apicid_map(&otheraddress2os, &msg_id); 3785 # else 3786 otherdepth = __kmp_affinity_create_x2apicid_map(&otheraddress2os, &msg_id); 3787 # endif 3788 if(otheraddress2os != NULL && address2os != NULL) { 3789 int i; 3790 unsigned arent_equal_flag = 0; 3791 for(i=0;i<__kmp_avail_proc;i++) { 3792 if(otheraddress2os[i] != address2os[i]) arent_equal_flag = 1; 3793 } 3794 if(arent_equal_flag) { 3795 KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are different from APICID\n")); 3796 KA_TRACE(10, ("__kmp_aux_affinity_initialize: APICID Table:\n")); 3797 for(i=0;i<__kmp_avail_proc;i++) { 3798 otheraddress2os[i].print(); __kmp_printf("\n"); 3799 } 3800 KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc Table:\n")); 3801 for(i=0;i<__kmp_avail_proc;i++) { 3802 address2os[i].print(); __kmp_printf("\n"); 3803 } 3804 } 3805 else { 3806 KA_TRACE(10, ("__kmp_aux_affinity_initialize: Hwloc affinity places are same as APICID\n")); 3807 } 3808 } 3809 # endif // KMP_DEBUG 3810 } 3811 # endif // KMP_USE_HWLOC 3812 3813 if (address2os == NULL) { 3814 if (KMP_AFFINITY_CAPABLE() 3815 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3816 && (__kmp_affinity_type != affinity_none)))) { 3817 KMP_WARNING(ErrorInitializeAffinity); 3818 } 3819 __kmp_affinity_type = affinity_none; 3820 KMP_AFFINITY_DISABLE(); 3821 return; 3822 } 3823 3824 __kmp_apply_thread_places(&address2os, depth); 3825 3826 // 3827 // Create the table of masks, indexed by thread Id. 3828 // 3829 unsigned maxIndex; 3830 unsigned numUnique; 3831 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3832 address2os, __kmp_avail_proc); 3833 if (__kmp_affinity_gran_levels == 0) { 3834 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3835 } 3836 3837 // 3838 // Set the childNums vector in all Address objects. This must be done 3839 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3840 // which takes into account the setting of __kmp_affinity_compact. 
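//
// Illustrative example (hypothetical depth-3 map): KMP_AFFINITY=compact uses
// the requested permutation as given (clamped to depth - 1), whereas scatter
// maps it to depth - 1 - __kmp_affinity_compact, so a request of 0 becomes 2
// and the sort spreads consecutive threads across packages first.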

    //
    // Set the childNums vector in all Address objects.  This must be done
    // before we can sort using __kmp_affinity_cmp_Address_child_num(),
    // which takes into account the setting of __kmp_affinity_compact.
    //
    __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc);

    switch (__kmp_affinity_type) {

    case affinity_explicit:
        KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL);
# if OMP_40_ENABLED
        if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
        {
            __kmp_affinity_process_proclist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# if OMP_40_ENABLED
        else {
            __kmp_affinity_process_placelist(&__kmp_affinity_masks,
              &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask,
              maxIndex);
        }
# endif
        if (__kmp_affinity_num_masks == 0) {
            if (__kmp_affinity_verbose || (__kmp_affinity_warnings
              && (__kmp_affinity_type != affinity_none))) {
                KMP_WARNING(AffNoValidProcID);
            }
            __kmp_affinity_type = affinity_none;
            return;
        }
        break;

    //
    // The other affinity types rely on sorting the Addresses according
    // to some permutation of the machine topology tree.  Set
    // __kmp_affinity_compact and __kmp_affinity_offset appropriately,
    // then jump to a common code fragment to do the sort and create
    // the array of affinity masks.
    //

    case affinity_logical:
        __kmp_affinity_compact = 0;
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

    case affinity_physical:
        if (__kmp_nThreadsPerCore > 1) {
            __kmp_affinity_compact = 1;
            if (__kmp_affinity_compact >= depth) {
                __kmp_affinity_compact = 0;
            }
        } else {
            __kmp_affinity_compact = 0;
        }
        if (__kmp_affinity_offset) {
            __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset
              % __kmp_avail_proc;
        }
        goto sortAddresses;

    case affinity_scatter:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = 0;
        }
        else {
            __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact;
        }
        goto sortAddresses;

    case affinity_compact:
        if (__kmp_affinity_compact >= depth) {
            __kmp_affinity_compact = depth - 1;
        }
        goto sortAddresses;

    case affinity_balanced:
        // Balanced works only for the case of a single package
        if( nPackages > 1 ) {
            if( __kmp_affinity_verbose || __kmp_affinity_warnings ) {
                KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" );
            }
            __kmp_affinity_type = affinity_none;
            return;
        } else if( __kmp_affinity_uniform_topology() ) {
            break;
        } else { // Non-uniform topology

            // Save the depth for further usage
            __kmp_aff_depth = depth;

            // Number of hyper threads per core in HT machine
            int nth_per_core = __kmp_nThreadsPerCore;

            int core_level;
            if( nth_per_core > 1 ) {
                core_level = depth - 2;
            } else {
                core_level = depth - 1;
            }
            int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;
            int nproc = nth_per_core * ncores;

            procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                procarr[ i ] = -1;
            }

            for( int i = 0; i < __kmp_avail_proc; i++ ) {
                int proc = address2os[ i ].second;
                // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread.
                // If there is only one thread per core then depth == 2: level 0 - package,
                // level 1 - core.
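                // Illustrative example (assuming depth == 3 and nth_per_core == 2):
                // a proc with labels {0, 1, 1} (package 0, core 1, thread 1) yields
                // thread = 1 % 2 = 1 and core = 1 below, so its OS id is stored in
                // procarr[ 1 * nth_per_core + 1 ].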
                int level = depth - 1;

                // __kmp_nth_per_core == 1
                int thread = 0;
                int core = address2os[ i ].first.labels[ level ];
                // If the thread level exists, that is we have more than one thread context per core
                if( nth_per_core > 1 ) {
                    thread = address2os[ i ].first.labels[ level ] % nth_per_core;
                    core = address2os[ i ].first.labels[ level - 1 ];
                }
                procarr[ core * nth_per_core + thread ] = proc;
            }

            break;
        }

    sortAddresses:
        //
        // Allocate the gtid->affinity mask table.
        //
        if (__kmp_affinity_dups) {
            __kmp_affinity_num_masks = __kmp_avail_proc;
        }
        else {
            __kmp_affinity_num_masks = numUnique;
        }

# if OMP_40_ENABLED
        if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel )
          && ( __kmp_affinity_num_places > 0 )
          && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) {
            __kmp_affinity_num_masks = __kmp_affinity_num_places;
        }
# endif

        KMP_CPU_ALLOC_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);

        //
        // Sort the address2os table according to the current setting of
        // __kmp_affinity_compact, then fill out __kmp_affinity_masks.
        //
        qsort(address2os, __kmp_avail_proc, sizeof(*address2os),
          __kmp_affinity_cmp_Address_child_num);
        {
            int i;
            unsigned j;
            for (i = 0, j = 0; i < __kmp_avail_proc; i++) {
                if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) {
                    continue;
                }
                unsigned osId = address2os[i].second;
                kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId);
                kmp_affin_mask_t *dest
                  = KMP_CPU_INDEX(__kmp_affinity_masks, j);
                KMP_ASSERT(KMP_CPU_ISSET(osId, src));
                KMP_CPU_COPY(dest, src);
                if (++j >= __kmp_affinity_num_masks) {
                    break;
                }
            }
            KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks);
        }
        break;

    default:
        KMP_ASSERT2(0, "Unexpected affinity setting");
    }

    __kmp_free(osId2Mask);
    machine_hierarchy.init(address2os, __kmp_avail_proc);
}
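
// At this point __kmp_affinity_masks holds __kmp_affinity_num_masks entries.
// The routines below consume them: __kmp_affinity_set_init_mask() picks the
// initial mask for each thread, and (with OMP 4.0 place support enabled)
// __kmp_affinity_set_place() rebinds a thread when its place changes.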


void
__kmp_affinity_initialize(void)
{
    //
    // Much of the code above was written assuming that if a machine was not
    // affinity capable, then __kmp_affinity_type == affinity_none.  We now
    // explicitly represent this as __kmp_affinity_type == affinity_disabled.
    //
    // There are too many checks for __kmp_affinity_type == affinity_none
    // in this code.  Instead of trying to change them all, check if
    // __kmp_affinity_type == affinity_disabled, and if so, slam it with
    // affinity_none, call the real initialization routine, then restore
    // __kmp_affinity_type to affinity_disabled.
    //
    int disabled = (__kmp_affinity_type == affinity_disabled);
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(disabled);
    }
    if (disabled) {
        __kmp_affinity_type = affinity_none;
    }
    __kmp_aux_affinity_initialize();
    if (disabled) {
        __kmp_affinity_type = affinity_disabled;
    }
}


void
__kmp_affinity_uninitialize(void)
{
    if (__kmp_affinity_masks != NULL) {
        KMP_CPU_FREE_ARRAY(__kmp_affinity_masks, __kmp_affinity_num_masks);
        __kmp_affinity_masks = NULL;
    }
    if (fullMask != NULL) {
        KMP_CPU_FREE(fullMask);
        fullMask = NULL;
    }
    __kmp_affinity_num_masks = 0;
# if OMP_40_ENABLED
    __kmp_affinity_num_places = 0;
# endif
    if (__kmp_affinity_proclist != NULL) {
        __kmp_free(__kmp_affinity_proclist);
        __kmp_affinity_proclist = NULL;
    }
    if( address2os != NULL ) {
        __kmp_free( address2os );
        address2os = NULL;
    }
    if( procarr != NULL ) {
        __kmp_free( procarr );
        procarr = NULL;
    }
}


void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none) || (__kmp_affinity_type == affinity_balanced)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_GROUP_AFFINITY
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
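            // For example (illustrative values): with __kmp_affinity_num_masks == 4
            // and __kmp_affinity_offset == 0, gtids 0..3 get places 0..3 and gtid 4
            // wraps around to place 0 again.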
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_ASSERT(th->th.th_new_place >= 0);
    KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(),
          gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */
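

//
// __kmp_aux_set_affinity() and the routines that follow back the user-level
// kmp_*_affinity entry points (the error messages below use those names).
// A sketch of the intended usage from user code, assuming the declarations
// exported in omp.h:
//
//     kmp_affinity_mask_t mask;
//     kmp_create_affinity_mask(&mask);
//     kmp_set_affinity_mask_proc(0, &mask);      // request OS proc 0
//     if (kmp_set_affinity(&mask) != 0) {
//         // the mask was rejected; the thread's affinity is unchanged
//     }
//     kmp_destroy_affinity_mask(&mask);
//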

int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            KMP_CPU_SET_ITERATE(proc, ((kmp_affin_mask_t*)(*mask))) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_GROUP_AFFINITY
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_GROUP_AFFINITY */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}
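
//
// The set/unset helpers below return -1 for an out-of-range proc (or when
// affinity is not capable), -2 when the proc is valid but not part of the
// initial full mask, and 0 on success; the "get" variant instead reports
// whether the proc is set in the supplied mask.
//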

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0)
# if !KMP_USE_HWLOC
      || ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
      ) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0)
# if !KMP_USE_HWLOC
      || ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
      ) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0)
# if !KMP_USE_HWLOC
      || ((unsigned)proc >= KMP_CPU_SETSIZE)
# endif
      ) {
        return -1;
    }
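    // A proc that is valid but outside the initial full mask is reported as
    // "not set" (0) here rather than as an error, unlike the set/unset
    // variants above, which return -2 in that case.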
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}


// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to it - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask;
        KMP_CPU_ALLOC_ON_STACK(mask);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
        KMP_CPU_FREE_FROM_STACK(mask);
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask;
        KMP_CPU_ALLOC_ON_STACK(mask);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // We'll count the osIDs found for the current core; there can be no more
                // than nth_per_core of them, and since address2os is sorted we can break
                // out of the loop when cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int* nproc_at_core = (int*)KMP_ALLOCA(sizeof(int)*ncores);
            // Array to save the number of cores with "x" available processors
            int* ncores_with_x_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));
            // Array to save the number of cores with # procs from x to nth_per_core
            int* ncores_with_x_to_max_procs = (int*)KMP_ALLOCA(sizeof(int)*(nth_per_core+1));

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // Array to keep the number of threads assigned to each hardware context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip the core with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ] ++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
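            // newarr[] now holds, for every hardware context in procarr[], how many
            // of the nthreads threads the loop above assigned to it.  The prefix-sum
            // scan below locates the context that this tid falls into and builds the
            // mask from it.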
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
        KMP_CPU_FREE_FROM_STACK(mask);
    }
}

#if KMP_OS_LINUX
// We don't need this entry point on Windows* OS because
// the GetProcessAffinityMask() API already exists there.
//
// The intended usage is indicated by these steps:
// 1) The user gets the current affinity mask
// 2) Then sets the affinity by calling this function
// 3) Error check the return value
// 4) Use non-OpenMP parallelization
// 5) Reset the affinity to what was stored in step 1)
#ifdef __cplusplus
extern "C"
#endif
int
kmp_set_thread_affinity_mask_initial()
// the function returns 0 on success,
//   -1 if we cannot bind the thread
//   >0 (errno) if an error happened during binding
{
    int gtid = __kmp_get_gtid();
    if (gtid < 0) {
        // Do not touch non-omp threads
        KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
            "non-omp thread, returning\n"));
        return -1;
    }
    if (!KMP_AFFINITY_CAPABLE() || !__kmp_init_middle) {
        KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
            "affinity not initialized, returning\n"));
        return -1;
    }
    KA_TRACE(30, ( "kmp_set_thread_affinity_mask_initial: "
        "set full mask for thread %d\n", gtid));
    KMP_DEBUG_ASSERT(fullMask != NULL);
    return __kmp_set_system_affinity(fullMask, FALSE);
}
#endif

#endif // KMP_AFFINITY_SUPPORTED