/*
 * kmp_affinity.cpp -- affinity management
 * $Revision: 43473 $
 * $Date: 2014-09-26 15:02:57 -0500 (Fri, 26 Sep 2014) $
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"
#include "kmp_wrapper_getpid.h"

#if KMP_AFFINITY_SUPPORTED

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        sprintf(scan, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    sprintf(scan, "{%ld", (long)i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        sprintf(scan, ",%-ld", (long)i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        sprintf(scan, ",...");
        while (*scan != '\0') scan++;
    }
    sprintf(scan, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

    if (__kmp_num_proc_groups > 1) {
        int group;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}


//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings.  I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libiomp.
129 // 130 // Work around the problem by avoiding inline constructors in such builds. 131 // We do this for all platforms, not just Linux* OS - non-inline functions are 132 // more debuggable and provide better coverage into than inline functions. 133 // Use inline functions in shipping libs, for performance. 134 // 135 136 # if !defined(KMP_DEBUG) && !defined(COVER) 137 138 class Address { 139 public: 140 static const unsigned maxDepth = 32; 141 unsigned labels[maxDepth]; 142 unsigned childNums[maxDepth]; 143 unsigned depth; 144 unsigned leader; 145 Address(unsigned _depth) 146 : depth(_depth), leader(FALSE) { 147 } 148 Address &operator=(const Address &b) { 149 depth = b.depth; 150 for (unsigned i = 0; i < depth; i++) { 151 labels[i] = b.labels[i]; 152 childNums[i] = b.childNums[i]; 153 } 154 leader = FALSE; 155 return *this; 156 } 157 bool operator==(const Address &b) const { 158 if (depth != b.depth) 159 return false; 160 for (unsigned i = 0; i < depth; i++) 161 if(labels[i] != b.labels[i]) 162 return false; 163 return true; 164 } 165 bool isClose(const Address &b, int level) const { 166 if (depth != b.depth) 167 return false; 168 if ((unsigned)level >= depth) 169 return true; 170 for (unsigned i = 0; i < (depth - level); i++) 171 if(labels[i] != b.labels[i]) 172 return false; 173 return true; 174 } 175 bool operator!=(const Address &b) const { 176 return !operator==(b); 177 } 178 }; 179 180 class AddrUnsPair { 181 public: 182 Address first; 183 unsigned second; 184 AddrUnsPair(Address _first, unsigned _second) 185 : first(_first), second(_second) { 186 } 187 AddrUnsPair &operator=(const AddrUnsPair &b) 188 { 189 first = b.first; 190 second = b.second; 191 return *this; 192 } 193 }; 194 195 # else 196 197 class Address { 198 public: 199 static const unsigned maxDepth = 32; 200 unsigned labels[maxDepth]; 201 unsigned childNums[maxDepth]; 202 unsigned depth; 203 unsigned leader; 204 Address(unsigned _depth); 205 Address &operator=(const Address &b); 206 bool operator==(const Address &b) const; 207 bool isClose(const Address &b, int level) const; 208 bool operator!=(const Address &b) const; 209 }; 210 211 Address::Address(unsigned _depth) 212 { 213 depth = _depth; 214 leader = FALSE; 215 } 216 217 Address &Address::operator=(const Address &b) { 218 depth = b.depth; 219 for (unsigned i = 0; i < depth; i++) { 220 labels[i] = b.labels[i]; 221 childNums[i] = b.childNums[i]; 222 } 223 leader = FALSE; 224 return *this; 225 } 226 227 bool Address::operator==(const Address &b) const { 228 if (depth != b.depth) 229 return false; 230 for (unsigned i = 0; i < depth; i++) 231 if(labels[i] != b.labels[i]) 232 return false; 233 return true; 234 } 235 236 bool Address::isClose(const Address &b, int level) const { 237 if (depth != b.depth) 238 return false; 239 if ((unsigned)level >= depth) 240 return true; 241 for (unsigned i = 0; i < (depth - level); i++) 242 if(labels[i] != b.labels[i]) 243 return false; 244 return true; 245 } 246 247 bool Address::operator!=(const Address &b) const { 248 return !operator==(b); 249 } 250 251 class AddrUnsPair { 252 public: 253 Address first; 254 unsigned second; 255 AddrUnsPair(Address _first, unsigned _second); 256 AddrUnsPair &operator=(const AddrUnsPair &b); 257 }; 258 259 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second) 260 : first(_first), second(_second) 261 { 262 } 263 264 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b) 265 { 266 first = b.first; 267 second = b.second; 268 return *this; 269 } 270 271 # endif /* !defined(KMP_DEBUG) && 
!defined(COVER) */ 272 273 274 static int 275 __kmp_affinity_cmp_Address_labels(const void *a, const void *b) 276 { 277 const Address *aa = (const Address *)&(((AddrUnsPair *)a) 278 ->first); 279 const Address *bb = (const Address *)&(((AddrUnsPair *)b) 280 ->first); 281 unsigned depth = aa->depth; 282 unsigned i; 283 KMP_DEBUG_ASSERT(depth == bb->depth); 284 for (i = 0; i < depth; i++) { 285 if (aa->labels[i] < bb->labels[i]) return -1; 286 if (aa->labels[i] > bb->labels[i]) return 1; 287 } 288 return 0; 289 } 290 291 292 static int 293 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) 294 { 295 const Address *aa = (const Address *)&(((AddrUnsPair *)a) 296 ->first); 297 const Address *bb = (const Address *)&(((AddrUnsPair *)b) 298 ->first); 299 unsigned depth = aa->depth; 300 unsigned i; 301 KMP_DEBUG_ASSERT(depth == bb->depth); 302 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 303 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 304 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 305 int j = depth - i - 1; 306 if (aa->childNums[j] < bb->childNums[j]) return -1; 307 if (aa->childNums[j] > bb->childNums[j]) return 1; 308 } 309 for (; i < depth; i++) { 310 int j = i - __kmp_affinity_compact; 311 if (aa->childNums[j] < bb->childNums[j]) return -1; 312 if (aa->childNums[j] > bb->childNums[j]) return 1; 313 } 314 return 0; 315 } 316 317 /** A structure for holding machine-specific hierarchy info to be computed once at init. */ 318 class hierarchy_info { 319 public: 320 /** Typical levels are threads/core, cores/package or socket, packages/node, nodes/machine, 321 etc. We don't want to get specific with nomenclature */ 322 static const kmp_uint32 maxLevels=7; 323 324 /** This is specifically the depth of the machine configuration hierarchy, in terms of the 325 number of levels along the longest path from root to any leaf. It corresponds to the 326 number of entries in numPerLevel if we exclude all but one trailing 1. */ 327 kmp_uint32 depth; 328 kmp_uint32 base_depth; 329 kmp_uint32 base_num_threads; 330 bool uninitialized; 331 332 /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children the parent of a 333 node at level i has. For example, if we have a machine with 4 packages, 4 cores/package 334 and 2 HT per core, then numPerLevel = {2, 4, 4, 1, 1}. All empty levels are set to 1. 
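        As a worked illustration for that same machine (assuming init() does not
        rebalance numPerLevel): skipPerLevel is derived as skipPerLevel[0] = 1 and
        skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1], giving {1, 2, 8, 32}
        for the populated levels; roughly, a node at level i covers skipPerLevel[i]
        leaf threads.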
*/ 335 kmp_uint32 numPerLevel[maxLevels]; 336 kmp_uint32 skipPerLevel[maxLevels]; 337 338 void deriveLevels(AddrUnsPair *adr2os, int num_addrs) { 339 int hier_depth = adr2os[0].first.depth; 340 int level = 0; 341 for (int i=hier_depth-1; i>=0; --i) { 342 int max = -1; 343 for (int j=0; j<num_addrs; ++j) { 344 int next = adr2os[j].first.childNums[i]; 345 if (next > max) max = next; 346 } 347 numPerLevel[level] = max+1; 348 ++level; 349 } 350 } 351 352 hierarchy_info() : depth(1), uninitialized(true) {} 353 void init(AddrUnsPair *adr2os, int num_addrs) 354 { 355 uninitialized = false; 356 for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level 357 numPerLevel[i] = 1; 358 skipPerLevel[i] = 1; 359 } 360 361 // Sort table by physical ID 362 if (adr2os) { 363 qsort(adr2os, num_addrs, sizeof(*adr2os), __kmp_affinity_cmp_Address_labels); 364 deriveLevels(adr2os, num_addrs); 365 } 366 else { 367 numPerLevel[0] = 4; 368 numPerLevel[1] = num_addrs/4; 369 if (num_addrs%4) numPerLevel[1]++; 370 } 371 372 base_num_threads = num_addrs; 373 for (int i=maxLevels-1; i>=0; --i) // count non-empty levels to get depth 374 if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1' 375 depth++; 376 377 kmp_uint32 branch = 4; 378 if (numPerLevel[0] == 1) branch = num_addrs/4; 379 if (branch<4) branch=4; 380 for (kmp_uint32 d=0; d<depth-1; ++d) { // optimize hierarchy width 381 while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0! 382 if (numPerLevel[d] & 1) numPerLevel[d]++; 383 numPerLevel[d] = numPerLevel[d] >> 1; 384 if (numPerLevel[d+1] == 1) depth++; 385 numPerLevel[d+1] = numPerLevel[d+1] << 1; 386 } 387 if(numPerLevel[0] == 1) { 388 branch = branch >> 1; 389 if (branch<4) branch = 4; 390 } 391 } 392 393 for (kmp_uint32 i=1; i<depth; ++i) 394 skipPerLevel[i] = numPerLevel[i-1] * skipPerLevel[i-1]; 395 396 base_depth = depth; 397 } 398 }; 399 400 static hierarchy_info machine_hierarchy; 401 402 void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) { 403 if (machine_hierarchy.uninitialized) 404 machine_hierarchy.init(NULL, nproc); 405 406 if (nproc <= machine_hierarchy.base_num_threads) 407 machine_hierarchy.depth = machine_hierarchy.base_depth; 408 KMP_DEBUG_ASSERT(machine_hierarchy.depth > 0); 409 while (nproc > machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1]) { 410 machine_hierarchy.depth++; 411 machine_hierarchy.skipPerLevel[machine_hierarchy.depth-1] = 2*machine_hierarchy.skipPerLevel[machine_hierarchy.depth-2]; 412 } 413 thr_bar->depth = machine_hierarchy.depth; 414 thr_bar->base_leaf_kids = (kmp_uint8)machine_hierarchy.numPerLevel[0]-1; 415 thr_bar->skip_per_level = machine_hierarchy.skipPerLevel; 416 } 417 418 // 419 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be 420 // called to renumber the labels from [0..n] and place them into the child_num 421 // vector of the address object. This is done in case the labels used for 422 // the children at one node of the hierarchy differ from those used for 423 // another node at the same level. Example: suppose the machine has 2 nodes 424 // with 2 packages each. The first node contains packages 601 and 602, and 425 // second node contains packages 603 and 604. If we try to sort the table 426 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 427 // because we are paying attention to the labels themselves, not the ordinal 428 // child numbers. 
By using the child numbers in the sort, the result is 429 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. 430 // 431 static void 432 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, 433 int numAddrs) 434 { 435 KMP_DEBUG_ASSERT(numAddrs > 0); 436 int depth = address2os->first.depth; 437 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 438 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth 439 * sizeof(unsigned)); 440 int labCt; 441 for (labCt = 0; labCt < depth; labCt++) { 442 address2os[0].first.childNums[labCt] = counts[labCt] = 0; 443 lastLabel[labCt] = address2os[0].first.labels[labCt]; 444 } 445 int i; 446 for (i = 1; i < numAddrs; i++) { 447 for (labCt = 0; labCt < depth; labCt++) { 448 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { 449 int labCt2; 450 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { 451 counts[labCt2] = 0; 452 lastLabel[labCt2] = address2os[i].first.labels[labCt2]; 453 } 454 counts[labCt]++; 455 lastLabel[labCt] = address2os[i].first.labels[labCt]; 456 break; 457 } 458 } 459 for (labCt = 0; labCt < depth; labCt++) { 460 address2os[i].first.childNums[labCt] = counts[labCt]; 461 } 462 for (; labCt < (int)Address::maxDepth; labCt++) { 463 address2os[i].first.childNums[labCt] = 0; 464 } 465 } 466 } 467 468 469 // 470 // All of the __kmp_affinity_create_*_map() routines should set 471 // __kmp_affinity_masks to a vector of affinity mask objects of length 472 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and 473 // return the number of levels in the machine topology tree (zero if 474 // __kmp_affinity_type == affinity_none). 475 // 476 // All of the __kmp_affinity_create_*_map() routines should set *fullMask 477 // to the affinity mask for the initialization thread. They need to save and 478 // restore the mask, and it could be needed later, so saving it is just an 479 // optimization to avoid calling kmp_get_system_affinity() again. 480 // 481 static kmp_affin_mask_t *fullMask = NULL; 482 483 kmp_affin_mask_t * 484 __kmp_affinity_get_fullMask() { return fullMask; } 485 486 487 static int nCoresPerPkg, nPackages; 488 int __kmp_nThreadsPerCore; 489 490 // 491 // __kmp_affinity_uniform_topology() doesn't work when called from 492 // places which support arbitrarily many levels in the machine topology 493 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() 494 // __kmp_affinity_create_x2apicid_map(). 495 // 496 inline static bool 497 __kmp_affinity_uniform_topology() 498 { 499 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages); 500 } 501 502 503 // 504 // Print out the detailed machine topology map, i.e. the physical locations 505 // of each OS proc. 
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    __kmp_ht_enabled = FALSE;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
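    //
    // For illustration (hypothetical 4-proc machine): if the full mask
    // contains OS procs 0..3, the loop below produces four depth-1 Address
    // objects, and the returned table is {0}->0, {1}->1, {2}->2, {3}->3,
    // i.e. one package-level label per OS proc.
    //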
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (!
KMP_CPU_ISSET(i, fullMask)) { 682 continue; 683 } 684 685 Address addr(2); 686 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); 687 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); 688 (*address2os)[avail_ct++] = AddrUnsPair(addr,i); 689 690 if (__kmp_affinity_verbose) { 691 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], 692 addr.labels[1]); 693 } 694 } 695 696 if (__kmp_affinity_gran_levels < 0) { 697 if (__kmp_affinity_gran == affinity_gran_group) { 698 __kmp_affinity_gran_levels = 1; 699 } 700 else if ((__kmp_affinity_gran == affinity_gran_fine) 701 || (__kmp_affinity_gran == affinity_gran_thread)) { 702 __kmp_affinity_gran_levels = 0; 703 } 704 else { 705 const char *gran_str = NULL; 706 if (__kmp_affinity_gran == affinity_gran_core) { 707 gran_str = "core"; 708 } 709 else if (__kmp_affinity_gran == affinity_gran_package) { 710 gran_str = "package"; 711 } 712 else if (__kmp_affinity_gran == affinity_gran_node) { 713 gran_str = "node"; 714 } 715 else { 716 KMP_ASSERT(0); 717 } 718 719 // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread" 720 __kmp_affinity_gran_levels = 0; 721 } 722 } 723 return 2; 724 } 725 726 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ 727 728 729 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 730 731 static int 732 __kmp_cpuid_mask_width(int count) { 733 int r = 0; 734 735 while((1<<r) < count) 736 ++r; 737 return r; 738 } 739 740 741 class apicThreadInfo { 742 public: 743 unsigned osId; // param to __kmp_affinity_bind_thread 744 unsigned apicId; // from cpuid after binding 745 unsigned maxCoresPerPkg; // "" 746 unsigned maxThreadsPerPkg; // "" 747 unsigned pkgId; // inferred from above values 748 unsigned coreId; // "" 749 unsigned threadId; // "" 750 }; 751 752 753 static int 754 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b) 755 { 756 const apicThreadInfo *aa = (const apicThreadInfo *)a; 757 const apicThreadInfo *bb = (const apicThreadInfo *)b; 758 if (aa->osId < bb->osId) return -1; 759 if (aa->osId > bb->osId) return 1; 760 return 0; 761 } 762 763 764 static int 765 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b) 766 { 767 const apicThreadInfo *aa = (const apicThreadInfo *)a; 768 const apicThreadInfo *bb = (const apicThreadInfo *)b; 769 if (aa->pkgId < bb->pkgId) return -1; 770 if (aa->pkgId > bb->pkgId) return 1; 771 if (aa->coreId < bb->coreId) return -1; 772 if (aa->coreId > bb->coreId) return 1; 773 if (aa->threadId < bb->threadId) return -1; 774 if (aa->threadId > bb->threadId) return 1; 775 return 0; 776 } 777 778 779 // 780 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 781 // an algorithm which cycles through the available os threads, setting 782 // the current thread's affinity mask to that thread, and then retrieves 783 // the Apic Id for each thread context using the cpuid instruction. 784 // 785 static int 786 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 787 kmp_i18n_id_t *const msg_id) 788 { 789 int rc; 790 *address2os = NULL; 791 *msg_id = kmp_i18n_null; 792 793 # if KMP_MIC 794 { 795 // The code below will use cpuid(4). 796 // Check if cpuid(4) is supported. 797 // FIXME? - this really doesn't need to be specific to MIC. 
        kmp_cpuid buf;
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax < 4) {
            *msg_id = kmp_i18n_str_NoLeaf4Support;
            return -1;
        }
    }
# endif // KMP_MIC

    //
    // Even if __kmp_affinity_type == affinity_none, this routine is still
    // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(),
    // then we need to do something else.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
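        //
        // Worked illustration with hypothetical values: if cpuid(4) reports
        // nCoresPerPkg = 8 and __kmp_xproc = 32, the code below sets
        // __kmp_ncores = 32, nPackages = (32 + 8 - 1) / 8 = 4, and assumes
        // 1 thread per core with HT off.
        //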
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip).  On other OS/chip combinations
    //    supporting Intel(R) Hyper-Threading Technology, the value of this
    //    field will be 1 when Intel(R) Hyper-Threading Technology is
    //    disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        if (!
(buf.edx >> 9) & 1) { 965 __kmp_set_system_affinity(oldMask, TRUE); 966 __kmp_free(threadInfo); 967 KMP_CPU_FREE(oldMask); 968 *msg_id = kmp_i18n_str_ApicNotPresent; 969 return -1; 970 } 971 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 972 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 973 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 974 threadInfo[nApics].maxThreadsPerPkg = 1; 975 } 976 977 // 978 // Max cores per pkg comes from cpuid(4). 979 // 1 must be added to the encoded value. 980 // 981 // First, we need to check if cpuid(4) is supported on this chip. 982 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 983 // has the value n or greater. 984 // 985 __kmp_x86_cpuid(0, 0, &buf); 986 if (buf.eax >= 4) { 987 __kmp_x86_cpuid(4, 0, &buf); 988 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 989 } 990 else { 991 threadInfo[nApics].maxCoresPerPkg = 1; 992 } 993 994 // 995 // Infer the pkgId / coreId / threadId using only the info 996 // obtained locally. 997 // 998 int widthCT = __kmp_cpuid_mask_width( 999 threadInfo[nApics].maxThreadsPerPkg); 1000 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 1001 1002 int widthC = __kmp_cpuid_mask_width( 1003 threadInfo[nApics].maxCoresPerPkg); 1004 int widthT = widthCT - widthC; 1005 if (widthT < 0) { 1006 // 1007 // I've never seen this one happen, but I suppose it could, if 1008 // the cpuid instruction on a chip was really screwed up. 1009 // Make sure to restore the affinity mask before the tail call. 1010 // 1011 __kmp_set_system_affinity(oldMask, TRUE); 1012 __kmp_free(threadInfo); 1013 KMP_CPU_FREE(oldMask); 1014 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1015 return -1; 1016 } 1017 1018 int maskC = (1 << widthC) - 1; 1019 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) 1020 &maskC; 1021 1022 int maskT = (1 << widthT) - 1; 1023 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT; 1024 1025 nApics++; 1026 } 1027 1028 // 1029 // We've collected all the info we need. 1030 // Restore the old affinity mask for this thread. 1031 // 1032 __kmp_set_system_affinity(oldMask, TRUE); 1033 1034 // 1035 // If there's only one thread context to bind to, form an Address object 1036 // with depth 1 and return immediately (or, if affinity is off, set 1037 // address2os to NULL and return). 1038 // 1039 // If it is configured to omit the package level when there is only a 1040 // single package, the logic at the end of this routine won't work if 1041 // there is only a single thread - it would try to form an Address 1042 // object with depth 0. 
1043 // 1044 KMP_ASSERT(nApics > 0); 1045 if (nApics == 1) { 1046 __kmp_ncores = nPackages = 1; 1047 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1048 __kmp_ht_enabled = FALSE; 1049 if (__kmp_affinity_verbose) { 1050 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1051 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1052 1053 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1054 if (__kmp_affinity_respect_mask) { 1055 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1056 } else { 1057 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1058 } 1059 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1060 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1061 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1062 __kmp_nThreadsPerCore, __kmp_ncores); 1063 } 1064 1065 if (__kmp_affinity_type == affinity_none) { 1066 __kmp_free(threadInfo); 1067 KMP_CPU_FREE(oldMask); 1068 return 0; 1069 } 1070 1071 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 1072 Address addr(1); 1073 addr.labels[0] = threadInfo[0].pkgId; 1074 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 1075 1076 if (__kmp_affinity_gran_levels < 0) { 1077 __kmp_affinity_gran_levels = 0; 1078 } 1079 1080 if (__kmp_affinity_verbose) { 1081 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 1082 } 1083 1084 __kmp_free(threadInfo); 1085 KMP_CPU_FREE(oldMask); 1086 return 1; 1087 } 1088 1089 // 1090 // Sort the threadInfo table by physical Id. 1091 // 1092 qsort(threadInfo, nApics, sizeof(*threadInfo), 1093 __kmp_affinity_cmp_apicThreadInfo_phys_id); 1094 1095 // 1096 // The table is now sorted by pkgId / coreId / threadId, but we really 1097 // don't know the radix of any of the fields. pkgId's may be sparsely 1098 // assigned among the chips on a system. Although coreId's are usually 1099 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1100 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1101 // 1102 // For that matter, we don't know what coresPerPkg and threadsPerCore 1103 // (or the total # packages) are at this point - we want to determine 1104 // that now. We only have an upper bound on the first two figures. 1105 // 1106 // We also perform a consistency check at this point: the values returned 1107 // by the cpuid instruction for any thread bound to a given package had 1108 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1109 // 1110 nPackages = 1; 1111 nCoresPerPkg = 1; 1112 __kmp_nThreadsPerCore = 1; 1113 unsigned nCores = 1; 1114 1115 unsigned pkgCt = 1; // to determine radii 1116 unsigned lastPkgId = threadInfo[0].pkgId; 1117 unsigned coreCt = 1; 1118 unsigned lastCoreId = threadInfo[0].coreId; 1119 unsigned threadCt = 1; 1120 unsigned lastThreadId = threadInfo[0].threadId; 1121 1122 // intra-pkg consist checks 1123 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1124 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1125 1126 for (i = 1; i < nApics; i++) { 1127 if (threadInfo[i].pkgId != lastPkgId) { 1128 nCores++; 1129 pkgCt++; 1130 lastPkgId = threadInfo[i].pkgId; 1131 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1132 coreCt = 1; 1133 lastCoreId = threadInfo[i].coreId; 1134 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1135 threadCt = 1; 1136 lastThreadId = threadInfo[i].threadId; 1137 1138 // 1139 // This is a different package, so go on to the next iteration 1140 // without doing any consistency checks. 
Reset the consistency 1141 // check vars, though. 1142 // 1143 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1144 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1145 continue; 1146 } 1147 1148 if (threadInfo[i].coreId != lastCoreId) { 1149 nCores++; 1150 coreCt++; 1151 lastCoreId = threadInfo[i].coreId; 1152 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1153 threadCt = 1; 1154 lastThreadId = threadInfo[i].threadId; 1155 } 1156 else if (threadInfo[i].threadId != lastThreadId) { 1157 threadCt++; 1158 lastThreadId = threadInfo[i].threadId; 1159 } 1160 else { 1161 __kmp_free(threadInfo); 1162 KMP_CPU_FREE(oldMask); 1163 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1164 return -1; 1165 } 1166 1167 // 1168 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1169 // fields agree between all the threads bounds to a given package. 1170 // 1171 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) 1172 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1173 __kmp_free(threadInfo); 1174 KMP_CPU_FREE(oldMask); 1175 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1176 return -1; 1177 } 1178 } 1179 nPackages = pkgCt; 1180 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1181 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1182 1183 // 1184 // When affinity is off, this routine will still be called to set 1185 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore, 1186 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1187 // correctly, and return now if affinity is not enabled. 1188 // 1189 __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1); 1190 __kmp_ncores = nCores; 1191 if (__kmp_affinity_verbose) { 1192 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1193 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1194 1195 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1196 if (__kmp_affinity_respect_mask) { 1197 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1198 } else { 1199 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1200 } 1201 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1202 if (__kmp_affinity_uniform_topology()) { 1203 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1204 } else { 1205 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1206 } 1207 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1208 __kmp_nThreadsPerCore, __kmp_ncores); 1209 1210 } 1211 1212 if (__kmp_affinity_type == affinity_none) { 1213 __kmp_free(threadInfo); 1214 KMP_CPU_FREE(oldMask); 1215 return 0; 1216 } 1217 1218 // 1219 // Now that we've determined the number of packages, the number of cores 1220 // per package, and the number of threads per core, we can construct the 1221 // data structure that is to be returned. 1222 // 1223 int pkgLevel = 0; 1224 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1225 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 1226 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1227 1228 KMP_ASSERT(depth > 0); 1229 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1230 1231 for (i = 0; i < nApics; ++i) { 1232 Address addr(depth); 1233 unsigned os = threadInfo[i].osId; 1234 int d = 0; 1235 1236 if (pkgLevel >= 0) { 1237 addr.labels[d++] = threadInfo[i].pkgId; 1238 } 1239 if (coreLevel >= 0) { 1240 addr.labels[d++] = threadInfo[i].coreId; 1241 } 1242 if (threadLevel >= 0) { 1243 addr.labels[d++] = threadInfo[i].threadId; 1244 } 1245 (*address2os)[i] = AddrUnsPair(addr, os); 1246 } 1247 1248 if (__kmp_affinity_gran_levels < 0) { 1249 // 1250 // Set the granularity level based on what levels are modeled 1251 // in the machine topology map. 1252 // 1253 __kmp_affinity_gran_levels = 0; 1254 if ((threadLevel >= 0) 1255 && (__kmp_affinity_gran > affinity_gran_thread)) { 1256 __kmp_affinity_gran_levels++; 1257 } 1258 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1259 __kmp_affinity_gran_levels++; 1260 } 1261 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1262 __kmp_affinity_gran_levels++; 1263 } 1264 } 1265 1266 if (__kmp_affinity_verbose) { 1267 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1268 coreLevel, threadLevel); 1269 } 1270 1271 __kmp_free(threadInfo); 1272 KMP_CPU_FREE(oldMask); 1273 return depth; 1274 } 1275 1276 1277 // 1278 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1279 // architectures support a newer interface for specifying the x2APIC Ids, 1280 // based on cpuid leaf 11. 1281 // 1282 static int 1283 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1284 kmp_i18n_id_t *const msg_id) 1285 { 1286 kmp_cpuid buf; 1287 1288 *address2os = NULL; 1289 *msg_id = kmp_i18n_null; 1290 1291 // 1292 // Check to see if cpuid leaf 11 is supported. 1293 // 1294 __kmp_x86_cpuid(0, 0, &buf); 1295 if (buf.eax < 11) { 1296 *msg_id = kmp_i18n_str_NoLeaf11Support; 1297 return -1; 1298 } 1299 __kmp_x86_cpuid(11, 0, &buf); 1300 if (buf.ebx == 0) { 1301 *msg_id = kmp_i18n_str_NoLeaf11Support; 1302 return -1; 1303 } 1304 1305 // 1306 // Find the number of levels in the machine topology. While we're at it, 1307 // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will 1308 // try to get more accurate values later by explicitly counting them, 1309 // but get reasonable defaults now, in case we return early. 1310 // 1311 int level; 1312 int threadLevel = -1; 1313 int coreLevel = -1; 1314 int pkgLevel = -1; 1315 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1316 1317 for (level = 0;; level++) { 1318 if (level > 31) { 1319 // 1320 // FIXME: Hack for DPD200163180 1321 // 1322 // If level is big then something went wrong -> exiting 1323 // 1324 // There could actually be 32 valid levels in the machine topology, 1325 // but so far, the only machine we have seen which does not exit 1326 // this loop before iteration 32 has fubar x2APIC settings. 1327 // 1328 // For now, just reject this case based upon loop trip count. 
1329 // 1330 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1331 return -1; 1332 } 1333 __kmp_x86_cpuid(11, level, &buf); 1334 if (buf.ebx == 0) { 1335 if (pkgLevel < 0) { 1336 // 1337 // Will infer nPackages from __kmp_xproc 1338 // 1339 pkgLevel = level; 1340 level++; 1341 } 1342 break; 1343 } 1344 int kind = (buf.ecx >> 8) & 0xff; 1345 if (kind == 1) { 1346 // 1347 // SMT level 1348 // 1349 threadLevel = level; 1350 coreLevel = -1; 1351 pkgLevel = -1; 1352 __kmp_nThreadsPerCore = buf.ebx & 0xff; 1353 if (__kmp_nThreadsPerCore == 0) { 1354 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1355 return -1; 1356 } 1357 } 1358 else if (kind == 2) { 1359 // 1360 // core level 1361 // 1362 coreLevel = level; 1363 pkgLevel = -1; 1364 nCoresPerPkg = buf.ebx & 0xff; 1365 if (nCoresPerPkg == 0) { 1366 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1367 return -1; 1368 } 1369 } 1370 else { 1371 if (level <= 0) { 1372 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1373 return -1; 1374 } 1375 if (pkgLevel >= 0) { 1376 continue; 1377 } 1378 pkgLevel = level; 1379 nPackages = buf.ebx & 0xff; 1380 if (nPackages == 0) { 1381 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1382 return -1; 1383 } 1384 } 1385 } 1386 int depth = level; 1387 1388 // 1389 // In the above loop, "level" was counted from the finest level (usually 1390 // thread) to the coarsest. The caller expects that we will place the 1391 // labels in (*address2os)[].first.labels[] in the inverse order, so 1392 // we need to invert the vars saying which level means what. 1393 // 1394 if (threadLevel >= 0) { 1395 threadLevel = depth - threadLevel - 1; 1396 } 1397 if (coreLevel >= 0) { 1398 coreLevel = depth - coreLevel - 1; 1399 } 1400 KMP_DEBUG_ASSERT(pkgLevel >= 0); 1401 pkgLevel = depth - pkgLevel - 1; 1402 1403 // 1404 // The algorithm used starts by setting the affinity to each available 1405 // thread and retrieving info from the cpuid instruction, so if we are not 1406 // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(), 1407 // then we need to do something else - use the defaults that we calculated 1408 // from issuing cpuid without binding to each proc. 1409 // 1410 if (! KMP_AFFINITY_CAPABLE()) 1411 { 1412 // 1413 // Hack to try and infer the machine topology using only the data 1414 // available from cpuid on the current thread, and __kmp_xproc. 1415 // 1416 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1417 1418 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1419 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1420 __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1); 1421 if (__kmp_affinity_verbose) { 1422 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 1423 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1424 if (__kmp_affinity_uniform_topology()) { 1425 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1426 } else { 1427 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1428 } 1429 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1430 __kmp_nThreadsPerCore, __kmp_ncores); 1431 } 1432 return 0; 1433 } 1434 1435 // 1436 // 1437 // From here on, we can assume that it is safe to call 1438 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), 1439 // even if __kmp_affinity_type = affinity_none. 1440 // 1441 1442 // 1443 // Save the affinity mask for the current thread. 1444 // 1445 kmp_affin_mask_t *oldMask; 1446 KMP_CPU_ALLOC(oldMask); 1447 __kmp_get_system_affinity(oldMask, TRUE); 1448 1449 // 1450 // Allocate the data structure to be returned. 
1451 // 1452 AddrUnsPair *retval = (AddrUnsPair *) 1453 __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1454 1455 // 1456 // Run through each of the available contexts, binding the current thread 1457 // to it, and obtaining the pertinent information using the cpuid instr. 1458 // 1459 unsigned int proc; 1460 int nApics = 0; 1461 for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) { 1462 // 1463 // Skip this proc if it is not included in the machine model. 1464 // 1465 if (! KMP_CPU_ISSET(proc, fullMask)) { 1466 continue; 1467 } 1468 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1469 1470 __kmp_affinity_bind_thread(proc); 1471 1472 // 1473 // Extrach the labels for each level in the machine topology map 1474 // from the Apic ID. 1475 // 1476 Address addr(depth); 1477 int prev_shift = 0; 1478 1479 for (level = 0; level < depth; level++) { 1480 __kmp_x86_cpuid(11, level, &buf); 1481 unsigned apicId = buf.edx; 1482 if (buf.ebx == 0) { 1483 if (level != depth - 1) { 1484 KMP_CPU_FREE(oldMask); 1485 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1486 return -1; 1487 } 1488 addr.labels[depth - level - 1] = apicId >> prev_shift; 1489 level++; 1490 break; 1491 } 1492 int shift = buf.eax & 0x1f; 1493 int mask = (1 << shift) - 1; 1494 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1495 prev_shift = shift; 1496 } 1497 if (level != depth) { 1498 KMP_CPU_FREE(oldMask); 1499 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1500 return -1; 1501 } 1502 1503 retval[nApics] = AddrUnsPair(addr, proc); 1504 nApics++; 1505 } 1506 1507 // 1508 // We've collected all the info we need. 1509 // Restore the old affinity mask for this thread. 1510 // 1511 __kmp_set_system_affinity(oldMask, TRUE); 1512 1513 // 1514 // If there's only one thread context to bind to, return now. 1515 // 1516 KMP_ASSERT(nApics > 0); 1517 if (nApics == 1) { 1518 __kmp_ncores = nPackages = 1; 1519 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1520 __kmp_ht_enabled = FALSE; 1521 if (__kmp_affinity_verbose) { 1522 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1523 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1524 1525 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1526 if (__kmp_affinity_respect_mask) { 1527 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1528 } else { 1529 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1530 } 1531 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1532 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1533 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1534 __kmp_nThreadsPerCore, __kmp_ncores); 1535 } 1536 1537 if (__kmp_affinity_type == affinity_none) { 1538 __kmp_free(retval); 1539 KMP_CPU_FREE(oldMask); 1540 return 0; 1541 } 1542 1543 // 1544 // Form an Address object which only includes the package level. 1545 // 1546 Address addr(1); 1547 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1548 retval[0].first = addr; 1549 1550 if (__kmp_affinity_gran_levels < 0) { 1551 __kmp_affinity_gran_levels = 0; 1552 } 1553 1554 if (__kmp_affinity_verbose) { 1555 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1556 } 1557 1558 *address2os = retval; 1559 KMP_CPU_FREE(oldMask); 1560 return 1; 1561 } 1562 1563 // 1564 // Sort the table by physical Id. 1565 // 1566 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1567 1568 // 1569 // Find the radix at each of the levels. 
1570 // 1571 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1572 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1573 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1574 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1575 for (level = 0; level < depth; level++) { 1576 totals[level] = 1; 1577 maxCt[level] = 1; 1578 counts[level] = 1; 1579 last[level] = retval[0].first.labels[level]; 1580 } 1581 1582 // 1583 // From here on, the iteration variable "level" runs from the finest 1584 // level to the coarsest, i.e. we iterate forward through 1585 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1586 // backwards. 1587 // 1588 for (proc = 1; (int)proc < nApics; proc++) { 1589 int level; 1590 for (level = 0; level < depth; level++) { 1591 if (retval[proc].first.labels[level] != last[level]) { 1592 int j; 1593 for (j = level + 1; j < depth; j++) { 1594 totals[j]++; 1595 counts[j] = 1; 1596 // The line below causes printing incorrect topology information 1597 // in case the max value for some level (maxCt[level]) is encountered earlier than 1598 // some less value while going through the array. 1599 // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2 1600 // whereas it must be 4. 1601 // TODO!!! Check if it can be commented safely 1602 //maxCt[j] = 1; 1603 last[j] = retval[proc].first.labels[j]; 1604 } 1605 totals[level]++; 1606 counts[level]++; 1607 if (counts[level] > maxCt[level]) { 1608 maxCt[level] = counts[level]; 1609 } 1610 last[level] = retval[proc].first.labels[level]; 1611 break; 1612 } 1613 else if (level == depth - 1) { 1614 __kmp_free(last); 1615 __kmp_free(maxCt); 1616 __kmp_free(counts); 1617 __kmp_free(totals); 1618 __kmp_free(retval); 1619 KMP_CPU_FREE(oldMask); 1620 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1621 return -1; 1622 } 1623 } 1624 } 1625 1626 // 1627 // When affinity is off, this routine will still be called to set 1628 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore, 1629 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1630 // correctly, and return if affinity is not enabled. 1631 // 1632 if (threadLevel >= 0) { 1633 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1634 } 1635 else { 1636 __kmp_nThreadsPerCore = 1; 1637 } 1638 __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1); 1639 1640 nPackages = totals[pkgLevel]; 1641 1642 if (coreLevel >= 0) { 1643 __kmp_ncores = totals[coreLevel]; 1644 nCoresPerPkg = maxCt[coreLevel]; 1645 } 1646 else { 1647 __kmp_ncores = nPackages; 1648 nCoresPerPkg = 1; 1649 } 1650 1651 // 1652 // Check to see if the machine topology is uniform 1653 // 1654 unsigned prod = maxCt[0]; 1655 for (level = 1; level < depth; level++) { 1656 prod *= maxCt[level]; 1657 } 1658 bool uniform = (prod == totals[level - 1]); 1659 1660 // 1661 // Print the machine topology summary. 
1662 // 1663 if (__kmp_affinity_verbose) { 1664 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1665 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1666 1667 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1668 if (__kmp_affinity_respect_mask) { 1669 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1670 } else { 1671 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1672 } 1673 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1674 if (uniform) { 1675 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1676 } else { 1677 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1678 } 1679 1680 kmp_str_buf_t buf; 1681 __kmp_str_buf_init(&buf); 1682 1683 __kmp_str_buf_print(&buf, "%d", totals[0]); 1684 for (level = 1; level <= pkgLevel; level++) { 1685 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1686 } 1687 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1688 __kmp_nThreadsPerCore, __kmp_ncores); 1689 1690 __kmp_str_buf_free(&buf); 1691 } 1692 1693 if (__kmp_affinity_type == affinity_none) { 1694 __kmp_free(last); 1695 __kmp_free(maxCt); 1696 __kmp_free(counts); 1697 __kmp_free(totals); 1698 __kmp_free(retval); 1699 KMP_CPU_FREE(oldMask); 1700 return 0; 1701 } 1702 1703 // 1704 // Find any levels with radiix 1, and remove them from the map 1705 // (except for the package level). 1706 // 1707 int new_depth = 0; 1708 for (level = 0; level < depth; level++) { 1709 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1710 continue; 1711 } 1712 new_depth++; 1713 } 1714 1715 // 1716 // If we are removing any levels, allocate a new vector to return, 1717 // and copy the relevant information to it. 1718 // 1719 if (new_depth != depth) { 1720 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1721 sizeof(AddrUnsPair) * nApics); 1722 for (proc = 0; (int)proc < nApics; proc++) { 1723 Address addr(new_depth); 1724 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1725 } 1726 int new_level = 0; 1727 for (level = 0; level < depth; level++) { 1728 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1729 if (level == threadLevel) { 1730 threadLevel = -1; 1731 } 1732 else if ((threadLevel >= 0) && (level < threadLevel)) { 1733 threadLevel--; 1734 } 1735 if (level == coreLevel) { 1736 coreLevel = -1; 1737 } 1738 else if ((coreLevel >= 0) && (level < coreLevel)) { 1739 coreLevel--; 1740 } 1741 if (level < pkgLevel) { 1742 pkgLevel--; 1743 } 1744 continue; 1745 } 1746 for (proc = 0; (int)proc < nApics; proc++) { 1747 new_retval[proc].first.labels[new_level] 1748 = retval[proc].first.labels[level]; 1749 } 1750 new_level++; 1751 } 1752 1753 __kmp_free(retval); 1754 retval = new_retval; 1755 depth = new_depth; 1756 } 1757 1758 if (__kmp_affinity_gran_levels < 0) { 1759 // 1760 // Set the granularity level based on what levels are modeled 1761 // in the machine topology map. 
1762 // 1763 __kmp_affinity_gran_levels = 0; 1764 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1765 __kmp_affinity_gran_levels++; 1766 } 1767 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1768 __kmp_affinity_gran_levels++; 1769 } 1770 if (__kmp_affinity_gran > affinity_gran_package) { 1771 __kmp_affinity_gran_levels++; 1772 } 1773 } 1774 1775 if (__kmp_affinity_verbose) { 1776 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1777 coreLevel, threadLevel); 1778 } 1779 1780 __kmp_free(last); 1781 __kmp_free(maxCt); 1782 __kmp_free(counts); 1783 __kmp_free(totals); 1784 KMP_CPU_FREE(oldMask); 1785 *address2os = retval; 1786 return depth; 1787 } 1788 1789 1790 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1791 1792 1793 #define osIdIndex 0 1794 #define threadIdIndex 1 1795 #define coreIdIndex 2 1796 #define pkgIdIndex 3 1797 #define nodeIdIndex 4 1798 1799 typedef unsigned *ProcCpuInfo; 1800 static unsigned maxIndex = pkgIdIndex; 1801 1802 1803 static int 1804 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1805 { 1806 const unsigned *aa = (const unsigned *)a; 1807 const unsigned *bb = (const unsigned *)b; 1808 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1809 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1810 return 0; 1811 }; 1812 1813 1814 static int 1815 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1816 { 1817 unsigned i; 1818 const unsigned *aa = *((const unsigned **)a); 1819 const unsigned *bb = *((const unsigned **)b); 1820 for (i = maxIndex; ; i--) { 1821 if (aa[i] < bb[i]) return -1; 1822 if (aa[i] > bb[i]) return 1; 1823 if (i == osIdIndex) break; 1824 } 1825 return 0; 1826 } 1827 1828 1829 // 1830 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1831 // affinity map. 1832 // 1833 static int 1834 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1835 kmp_i18n_id_t *const msg_id, FILE *f) 1836 { 1837 *address2os = NULL; 1838 *msg_id = kmp_i18n_null; 1839 1840 // 1841 // Scan of the file, and count the number of "processor" (osId) fields, 1842 // and find the highest value of <n> for a node_<n> field. 1843 // 1844 char buf[256]; 1845 unsigned num_records = 0; 1846 while (! feof(f)) { 1847 buf[sizeof(buf) - 1] = 1; 1848 if (! fgets(buf, sizeof(buf), f)) { 1849 // 1850 // Read errors presumably because of EOF 1851 // 1852 break; 1853 } 1854 1855 char s1[] = "processor"; 1856 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1857 num_records++; 1858 continue; 1859 } 1860 1861 // 1862 // FIXME - this will match "node_<n> <garbage>" 1863 // 1864 unsigned level; 1865 if (sscanf(buf, "node_%d id", &level) == 1) { 1866 if (nodeIdIndex + level >= maxIndex) { 1867 maxIndex = nodeIdIndex + level; 1868 } 1869 continue; 1870 } 1871 } 1872 1873 // 1874 // Check for empty file / no valid processor records, or too many. 1875 // The number of records can't exceed the number of valid bits in the 1876 // affinity mask. 1877 // 1878 if (num_records == 0) { 1879 *line = 0; 1880 *msg_id = kmp_i18n_str_NoProcRecords; 1881 return -1; 1882 } 1883 if (num_records > (unsigned)__kmp_xproc) { 1884 *line = 0; 1885 *msg_id = kmp_i18n_str_TooManyProcRecords; 1886 return -1; 1887 } 1888 1889 // 1890 // Set the file pointer back to the begginning, so that we can scan the 1891 // file again, this time performing a full parse of the data. 1892 // Allocate a vector of ProcCpuInfo object, where we will place the data. 
1893 // Adding an extra element at the end allows us to remove a lot of extra 1894 // checks for termination conditions. 1895 // 1896 if (fseek(f, 0, SEEK_SET) != 0) { 1897 *line = 0; 1898 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1899 return -1; 1900 } 1901 1902 // 1903 // Allocate the array of records to store the proc info in. The dummy 1904 // element at the end makes the logic in filling them out easier to code. 1905 // 1906 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1907 * sizeof(unsigned *)); 1908 unsigned i; 1909 for (i = 0; i <= num_records; i++) { 1910 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1911 * sizeof(unsigned)); 1912 } 1913 1914 #define CLEANUP_THREAD_INFO \ 1915 for (i = 0; i <= num_records; i++) { \ 1916 __kmp_free(threadInfo[i]); \ 1917 } \ 1918 __kmp_free(threadInfo); 1919 1920 // 1921 // A value of UINT_MAX means that we didn't find the field 1922 // 1923 unsigned __index; 1924 1925 #define INIT_PROC_INFO(p) \ 1926 for (__index = 0; __index <= maxIndex; __index++) { \ 1927 (p)[__index] = UINT_MAX; \ 1928 } 1929 1930 for (i = 0; i <= num_records; i++) { 1931 INIT_PROC_INFO(threadInfo[i]); 1932 } 1933 1934 unsigned num_avail = 0; 1935 *line = 0; 1936 while (! feof(f)) { 1937 // 1938 // Create an inner scoping level, so that all the goto targets at the 1939 // end of the loop appear in an outer scoping level. This avoids 1940 // warnings about jumping past an initialization to a target in the 1941 // same block. 1942 // 1943 { 1944 buf[sizeof(buf) - 1] = 1; 1945 bool long_line = false; 1946 if (! fgets(buf, sizeof(buf), f)) { 1947 // 1948 // Read error, presumably because of EOF 1949 // 1950 // If there is valid data in threadInfo[num_avail], then fake 1951 // a blank line to ensure that the last address gets parsed. 1952 // 1953 bool valid = false; 1954 for (i = 0; i <= maxIndex; i++) { 1955 if (threadInfo[num_avail][i] != UINT_MAX) { 1956 valid = true; 1957 } 1958 } 1959 if (! valid) { 1960 break; 1961 } 1962 buf[0] = 0; 1963 } else if (!buf[sizeof(buf) - 1]) { 1964 // 1965 // The line is longer than the buffer. Set a flag and don't 1966 // emit an error if we were going to ignore the line, anyway.
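// (For example, on many systems the "flags : ..." line of /proc/cpuinfo runs
// to several hundred characters and overflows this 256-byte buffer; it is one
// of the lines we ignore anyway, so no error should be reported for it.)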
1967 // 1968 long_line = true; 1969 1970 #define CHECK_LINE \ 1971 if (long_line) { \ 1972 CLEANUP_THREAD_INFO; \ 1973 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1974 return -1; \ 1975 } 1976 } 1977 (*line)++; 1978 1979 char s1[] = "processor"; 1980 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1981 CHECK_LINE; 1982 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1983 unsigned val; 1984 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1985 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 1986 threadInfo[num_avail][osIdIndex] = val; 1987 #if KMP_OS_LINUX && USE_SYSFS_INFO 1988 char path[256]; 1989 snprintf(path, sizeof(path), 1990 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1991 threadInfo[num_avail][osIdIndex]); 1992 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1993 1994 snprintf(path, sizeof(path), 1995 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1996 threadInfo[num_avail][osIdIndex]); 1997 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1998 continue; 1999 #else 2000 } 2001 char s2[] = "physical id"; 2002 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 2003 CHECK_LINE; 2004 char *p = strchr(buf + sizeof(s2) - 1, ':'); 2005 unsigned val; 2006 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2007 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 2008 threadInfo[num_avail][pkgIdIndex] = val; 2009 continue; 2010 } 2011 char s3[] = "core id"; 2012 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 2013 CHECK_LINE; 2014 char *p = strchr(buf + sizeof(s3) - 1, ':'); 2015 unsigned val; 2016 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2017 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 2018 threadInfo[num_avail][coreIdIndex] = val; 2019 continue; 2020 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 2021 } 2022 char s4[] = "thread id"; 2023 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 2024 CHECK_LINE; 2025 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2026 unsigned val; 2027 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2028 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 2029 threadInfo[num_avail][threadIdIndex] = val; 2030 continue; 2031 } 2032 unsigned level; 2033 if (sscanf(buf, "node_%d id", &level) == 1) { 2034 CHECK_LINE; 2035 char *p = strchr(buf + sizeof(s4) - 1, ':'); 2036 unsigned val; 2037 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 2038 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 2039 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 2040 threadInfo[num_avail][nodeIdIndex + level] = val; 2041 continue; 2042 } 2043 2044 // 2045 // We didn't recognize the leading token on the line. 2046 // There are lots of leading tokens that we don't recognize - 2047 // if the line isn't empty, go on to the next line. 2048 // 2049 if ((*buf != 0) && (*buf != '\n')) { 2050 // 2051 // If the line is longer than the buffer, read characters 2052 // until we find a newline. 2053 // 2054 if (long_line) { 2055 int ch; 2056 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 2057 } 2058 continue; 2059 } 2060 2061 // 2062 // A newline has signalled the end of the processor record. 2063 // Check that there aren't too many procs specified. 2064 // 2065 if ((int)num_avail == __kmp_xproc) { 2066 CLEANUP_THREAD_INFO; 2067 *msg_id = kmp_i18n_str_TooManyEntries; 2068 return -1; 2069 } 2070 2071 // 2072 // Check for missing fields. 
The osId field must be there, and we 2073 // currently require that the physical id field is specified, also. 2074 // 2075 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 2076 CLEANUP_THREAD_INFO; 2077 *msg_id = kmp_i18n_str_MissingProcField; 2078 return -1; 2079 } 2080 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 2081 CLEANUP_THREAD_INFO; 2082 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 2083 return -1; 2084 } 2085 2086 // 2087 // Skip this proc if it is not included in the machine model. 2088 // 2089 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 2090 INIT_PROC_INFO(threadInfo[num_avail]); 2091 continue; 2092 } 2093 2094 // 2095 // We have a successful parse of this proc's info. 2096 // Increment the counter, and prepare for the next proc. 2097 // 2098 num_avail++; 2099 KMP_ASSERT(num_avail <= num_records); 2100 INIT_PROC_INFO(threadInfo[num_avail]); 2101 } 2102 continue; 2103 2104 no_val: 2105 CLEANUP_THREAD_INFO; 2106 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2107 return -1; 2108 2109 dup_field: 2110 CLEANUP_THREAD_INFO; 2111 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2112 return -1; 2113 } 2114 *line = 0; 2115 2116 # if KMP_MIC && REDUCE_TEAM_SIZE 2117 unsigned teamSize = 0; 2118 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2119 2120 // check for num_records == __kmp_xproc ??? 2121 2122 // 2123 // If there's only one thread context to bind to, form an Address object 2124 // with depth 1 and return immediately (or, if affinity is off, set 2125 // address2os to NULL and return). 2126 // 2127 // If it is configured to omit the package level when there is only a 2128 // single package, the logic at the end of this routine won't work if 2129 // there is only a single thread - it would try to form an Address 2130 // object with depth 0. 2131 // 2132 KMP_ASSERT(num_avail > 0); 2133 KMP_ASSERT(num_avail <= num_records); 2134 if (num_avail == 1) { 2135 __kmp_ncores = 1; 2136 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2137 __kmp_ht_enabled = FALSE; 2138 if (__kmp_affinity_verbose) { 2139 if (! 
KMP_AFFINITY_CAPABLE()) { 2140 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2141 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2142 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2143 } 2144 else { 2145 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2146 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2147 fullMask); 2148 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2149 if (__kmp_affinity_respect_mask) { 2150 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2151 } else { 2152 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2153 } 2154 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2155 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2156 } 2157 int index; 2158 kmp_str_buf_t buf; 2159 __kmp_str_buf_init(&buf); 2160 __kmp_str_buf_print(&buf, "1"); 2161 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2162 __kmp_str_buf_print(&buf, " x 1"); 2163 } 2164 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2165 __kmp_str_buf_free(&buf); 2166 } 2167 2168 if (__kmp_affinity_type == affinity_none) { 2169 CLEANUP_THREAD_INFO; 2170 return 0; 2171 } 2172 2173 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2174 Address addr(1); 2175 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2176 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2177 2178 if (__kmp_affinity_gran_levels < 0) { 2179 __kmp_affinity_gran_levels = 0; 2180 } 2181 2182 if (__kmp_affinity_verbose) { 2183 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2184 } 2185 2186 CLEANUP_THREAD_INFO; 2187 return 1; 2188 } 2189 2190 // 2191 // Sort the threadInfo table by physical Id. 2192 // 2193 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2194 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2195 2196 // 2197 // The table is now sorted by pkgId / coreId / threadId, but we really 2198 // don't know the radix of any of the fields. pkgId's may be sparsely 2199 // assigned among the chips on a system. Although coreId's are usually 2200 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2201 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2202 // 2203 // For that matter, we don't know what coresPerPkg and threadsPerCore 2204 // (or the total # packages) are at this point - we want to determine 2205 // that now. We only have an upper bound on the first two figures. 2206 // 2207 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2208 * sizeof(unsigned)); 2209 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2210 * sizeof(unsigned)); 2211 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2212 * sizeof(unsigned)); 2213 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2214 * sizeof(unsigned)); 2215 2216 bool assign_thread_ids = false; 2217 unsigned threadIdCt; 2218 unsigned index; 2219 2220 restart_radix_check: 2221 threadIdCt = 0; 2222 2223 // 2224 // Initialize the counter arrays with data from threadInfo[0]. 2225 // 2226 if (assign_thread_ids) { 2227 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2228 threadInfo[0][threadIdIndex] = threadIdCt++; 2229 } 2230 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2231 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2232 } 2233 } 2234 for (index = 0; index <= maxIndex; index++) { 2235 counts[index] = 1; 2236 maxCt[index] = 1; 2237 totals[index] = 1; 2238 lastId[index] = threadInfo[0][index];; 2239 } 2240 2241 // 2242 // Run through the rest of the OS procs. 
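// Illustrative walk-through with made-up numbers: for a machine with
// 2 packages x 2 cores per package x 2 threads per core, listed in sorted
// order, the loop below finishes with totals[threadIdIndex] == 8,
// totals[coreIdIndex] == 4, totals[pkgIdIndex] == 2, and
// maxCt[threadIdIndex] == maxCt[coreIdIndex] == 2, from which we later set
// __kmp_nThreadsPerCore = 2, nCoresPerPkg = 2, nPackages = 2, and
// __kmp_ncores = 4; the uniformity check (2 x 2 x 2 == 8) then passes.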
2243 // 2244 for (i = 1; i < num_avail; i++) { 2245 // 2246 // Find the most significant index whose id differs 2247 // from the id for the previous OS proc. 2248 // 2249 for (index = maxIndex; index >= threadIdIndex; index--) { 2250 if (assign_thread_ids && (index == threadIdIndex)) { 2251 // 2252 // Auto-assign the thread id field if it wasn't specified. 2253 // 2254 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2255 threadInfo[i][threadIdIndex] = threadIdCt++; 2256 } 2257 2258 // 2259 // Apparently the thread id field was specified for some 2260 // entries and not others. Start the thread id counter 2261 // off at the next higher thread id. 2262 // 2263 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2264 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2265 } 2266 } 2267 if (threadInfo[i][index] != lastId[index]) { 2268 // 2269 // Run through all indices which are less significant, 2270 // and reset the counts to 1. 2271 // 2272 // At all levels up to and including index, we need to 2273 // increment the totals and record the last id. 2274 // 2275 unsigned index2; 2276 for (index2 = threadIdIndex; index2 < index; index2++) { 2277 totals[index2]++; 2278 if (counts[index2] > maxCt[index2]) { 2279 maxCt[index2] = counts[index2]; 2280 } 2281 counts[index2] = 1; 2282 lastId[index2] = threadInfo[i][index2]; 2283 } 2284 counts[index]++; 2285 totals[index]++; 2286 lastId[index] = threadInfo[i][index]; 2287 2288 if (assign_thread_ids && (index > threadIdIndex)) { 2289 2290 # if KMP_MIC && REDUCE_TEAM_SIZE 2291 // 2292 // The default team size is the total #threads in the machine 2293 // minus 1 thread for every core that has 3 or more threads. 2294 // 2295 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2296 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2297 2298 // 2299 // Restart the thread counter, as we are on a new core. 2300 // 2301 threadIdCt = 0; 2302 2303 // 2304 // Auto-assign the thread id field if it wasn't specified. 2305 // 2306 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2307 threadInfo[i][threadIdIndex] = threadIdCt++; 2308 } 2309 2310 // 2311 // Apparently the thread id field was specified for some 2312 // entries and not others. Start the thread id counter 2313 // off at the next higher thread id. 2314 // 2315 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2316 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2317 } 2318 } 2319 break; 2320 } 2321 } 2322 if (index < threadIdIndex) { 2323 // 2324 // If thread ids were specified, it is an error if they are not 2325 // unique. Also, check that we haven't already restarted the 2326 // loop (to be safe - shouldn't need to). 2327 // 2328 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2329 || assign_thread_ids) { 2330 __kmp_free(lastId); 2331 __kmp_free(totals); 2332 __kmp_free(maxCt); 2333 __kmp_free(counts); 2334 CLEANUP_THREAD_INFO; 2335 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2336 return -1; 2337 } 2338 2339 // 2340 // If the thread ids were not specified and we see entries 2341 // that are duplicates, start the loop over and 2342 // assign the thread ids manually. 2343 // 2344 assign_thread_ids = true; 2345 goto restart_radix_check; 2346 } 2347 } 2348 2349 # if KMP_MIC && REDUCE_TEAM_SIZE 2350 // 2351 // The default team size is the total #threads in the machine 2352 // minus 1 thread for every core that has 3 or more threads. 2353 // 2354 teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2355 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2356 2357 for (index = threadIdIndex; index <= maxIndex; index++) { 2358 if (counts[index] > maxCt[index]) { 2359 maxCt[index] = counts[index]; 2360 } 2361 } 2362 2363 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2364 nCoresPerPkg = maxCt[coreIdIndex]; 2365 nPackages = totals[pkgIdIndex]; 2366 2367 // 2368 // Check to see if the machine topology is uniform 2369 // 2370 unsigned prod = totals[maxIndex]; 2371 for (index = threadIdIndex; index < maxIndex; index++) { 2372 prod *= maxCt[index]; 2373 } 2374 bool uniform = (prod == totals[threadIdIndex]); 2375 2376 // 2377 // When affinity is off, this routine will still be called to set 2378 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore, 2379 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2380 // correctly, and return now if affinity is not enabled. 2381 // 2382 __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1 2383 __kmp_ncores = totals[coreIdIndex]; 2384 2385 if (__kmp_affinity_verbose) { 2386 if (! KMP_AFFINITY_CAPABLE()) { 2387 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2388 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2389 if (uniform) { 2390 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2391 } else { 2392 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2393 } 2394 } 2395 else { 2396 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2397 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2398 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2399 if (__kmp_affinity_respect_mask) { 2400 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2401 } else { 2402 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2403 } 2404 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2405 if (uniform) { 2406 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2407 } else { 2408 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2409 } 2410 } 2411 kmp_str_buf_t buf; 2412 __kmp_str_buf_init(&buf); 2413 2414 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2415 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2416 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2417 } 2418 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2419 maxCt[threadIdIndex], __kmp_ncores); 2420 2421 __kmp_str_buf_free(&buf); 2422 } 2423 2424 # if KMP_MIC && REDUCE_TEAM_SIZE 2425 // 2426 // Set the default team size. 2427 // 2428 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2429 __kmp_dflt_team_nth = teamSize; 2430 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2431 __kmp_dflt_team_nth)); 2432 } 2433 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2434 2435 if (__kmp_affinity_type == affinity_none) { 2436 __kmp_free(lastId); 2437 __kmp_free(totals); 2438 __kmp_free(maxCt); 2439 __kmp_free(counts); 2440 CLEANUP_THREAD_INFO; 2441 return 0; 2442 } 2443 2444 // 2445 // Count the number of levels which have more nodes at that level than 2446 // at the parent's level (with there being an implicit root node of 2447 // the top level). This is equivalent to saying that there is at least 2448 // one node at this level which has a sibling. These levels are in the 2449 // map, and the package level is always in the map. 
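// For instance (hypothetical machine): with 1 package x 4 cores x 1 thread
// per core and no node level, the totals are {4, 4, 1} at the thread, core,
// and package indices. Only the core level has more nodes than its parent,
// so inMap keeps just the core level plus the always-kept package level,
// and the resulting depth is 2.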
2450 // 2451 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2452 int level = 0; 2453 for (index = threadIdIndex; index < maxIndex; index++) { 2454 KMP_ASSERT(totals[index] >= totals[index + 1]); 2455 inMap[index] = (totals[index] > totals[index + 1]); 2456 } 2457 inMap[maxIndex] = (totals[maxIndex] > 1); 2458 inMap[pkgIdIndex] = true; 2459 2460 int depth = 0; 2461 for (index = threadIdIndex; index <= maxIndex; index++) { 2462 if (inMap[index]) { 2463 depth++; 2464 } 2465 } 2466 KMP_ASSERT(depth > 0); 2467 2468 // 2469 // Construct the data structure that is to be returned. 2470 // 2471 *address2os = (AddrUnsPair*) 2472 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2473 int pkgLevel = -1; 2474 int coreLevel = -1; 2475 int threadLevel = -1; 2476 2477 for (i = 0; i < num_avail; ++i) { 2478 Address addr(depth); 2479 unsigned os = threadInfo[i][osIdIndex]; 2480 int src_index; 2481 int dst_index = 0; 2482 2483 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2484 if (! inMap[src_index]) { 2485 continue; 2486 } 2487 addr.labels[dst_index] = threadInfo[i][src_index]; 2488 if (src_index == pkgIdIndex) { 2489 pkgLevel = dst_index; 2490 } 2491 else if (src_index == coreIdIndex) { 2492 coreLevel = dst_index; 2493 } 2494 else if (src_index == threadIdIndex) { 2495 threadLevel = dst_index; 2496 } 2497 dst_index++; 2498 } 2499 (*address2os)[i] = AddrUnsPair(addr, os); 2500 } 2501 2502 if (__kmp_affinity_gran_levels < 0) { 2503 // 2504 // Set the granularity level based on what levels are modeled 2505 // in the machine topology map. 2506 // 2507 unsigned src_index; 2508 __kmp_affinity_gran_levels = 0; 2509 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2510 if (! inMap[src_index]) { 2511 continue; 2512 } 2513 switch (src_index) { 2514 case threadIdIndex: 2515 if (__kmp_affinity_gran > affinity_gran_thread) { 2516 __kmp_affinity_gran_levels++; 2517 } 2518 2519 break; 2520 case coreIdIndex: 2521 if (__kmp_affinity_gran > affinity_gran_core) { 2522 __kmp_affinity_gran_levels++; 2523 } 2524 break; 2525 2526 case pkgIdIndex: 2527 if (__kmp_affinity_gran > affinity_gran_package) { 2528 __kmp_affinity_gran_levels++; 2529 } 2530 break; 2531 } 2532 } 2533 } 2534 2535 if (__kmp_affinity_verbose) { 2536 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2537 coreLevel, threadLevel); 2538 } 2539 2540 __kmp_free(inMap); 2541 __kmp_free(lastId); 2542 __kmp_free(totals); 2543 __kmp_free(maxCt); 2544 __kmp_free(counts); 2545 CLEANUP_THREAD_INFO; 2546 return depth; 2547 } 2548 2549 2550 // 2551 // Create and return a table of affinity masks, indexed by OS thread ID. 2552 // This routine handles OR'ing together all the affinity masks of threads 2553 // that are sufficiently close, if granularity > fine. 2554 // 2555 static kmp_affin_mask_t * 2556 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2557 AddrUnsPair *address2os, unsigned numAddrs) 2558 { 2559 // 2560 // First form a table of affinity masks in order of OS thread id. 2561 // 2562 unsigned depth; 2563 unsigned maxOsId; 2564 unsigned i; 2565 2566 KMP_ASSERT(numAddrs > 0); 2567 depth = address2os[0].first.depth; 2568 2569 maxOsId = 0; 2570 for (i = 0; i < numAddrs; i++) { 2571 unsigned osId = address2os[i].second; 2572 if (osId > maxOsId) { 2573 maxOsId = osId; 2574 } 2575 } 2576 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2577 (maxOsId + 1) * __kmp_affin_mask_size); 2578 2579 // 2580 // Sort the address2os table according to physical order. 
Doing so 2581 // will put all threads on the same core/package/node in consecutive 2582 // locations. 2583 // 2584 qsort(address2os, numAddrs, sizeof(*address2os), 2585 __kmp_affinity_cmp_Address_labels); 2586 2587 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2588 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2589 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2590 } 2591 if (__kmp_affinity_gran_levels >= (int)depth) { 2592 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2593 && (__kmp_affinity_type != affinity_none))) { 2594 KMP_WARNING(AffThreadsMayMigrate); 2595 } 2596 } 2597 2598 // 2599 // Run through the table, forming the masks for all threads on each 2600 // core. Threads on the same core will have identical "Address" 2601 // objects, not considering the last level, which must be the thread 2602 // id. All threads on a core will appear consecutively. 2603 // 2604 unsigned unique = 0; 2605 unsigned j = 0; // index of 1st thread on core 2606 unsigned leader = 0; 2607 Address *leaderAddr = &(address2os[0].first); 2608 kmp_affin_mask_t *sum 2609 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size); 2610 KMP_CPU_ZERO(sum); 2611 KMP_CPU_SET(address2os[0].second, sum); 2612 for (i = 1; i < numAddrs; i++) { 2613 // 2614 // If this thread is sufficiently close to the leader (within the 2615 // granularity setting), then set the bit for this os thread in the 2616 // affinity mask for this group, and go on to the next thread. 2617 // 2618 if (leaderAddr->isClose(address2os[i].first, 2619 __kmp_affinity_gran_levels)) { 2620 KMP_CPU_SET(address2os[i].second, sum); 2621 continue; 2622 } 2623 2624 // 2625 // For every thread in this group, copy the mask to the thread's 2626 // entry in the osId2Mask table. Mark the first address as a 2627 // leader. 2628 // 2629 for (; j < i; j++) { 2630 unsigned osId = address2os[j].second; 2631 KMP_DEBUG_ASSERT(osId <= maxOsId); 2632 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2633 KMP_CPU_COPY(mask, sum); 2634 address2os[j].first.leader = (j == leader); 2635 } 2636 unique++; 2637 2638 // 2639 // Start a new mask. 2640 // 2641 leader = i; 2642 leaderAddr = &(address2os[i].first); 2643 KMP_CPU_ZERO(sum); 2644 KMP_CPU_SET(address2os[i].second, sum); 2645 } 2646 2647 // 2648 // For every thread in last group, copy the mask to the thread's 2649 // entry in the osId2Mask table. 2650 // 2651 for (; j < i; j++) { 2652 unsigned osId = address2os[j].second; 2653 KMP_DEBUG_ASSERT(osId <= maxOsId); 2654 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2655 KMP_CPU_COPY(mask, sum); 2656 address2os[j].first.leader = (j == leader); 2657 } 2658 unique++; 2659 2660 *maxIndex = maxOsId; 2661 *numUnique = unique; 2662 return osId2Mask; 2663 } 2664 2665 2666 // 2667 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2668 // as file-static than to try and pass them through the calling sequence of 2669 // the recursive-descent OMP_PLACES parser. 2670 // 2671 static kmp_affin_mask_t *newMasks; 2672 static int numNewMasks; 2673 static int nextNewMask; 2674 2675 #define ADD_MASK(_mask) \ 2676 { \ 2677 if (nextNewMask >= numNewMasks) { \ 2678 numNewMasks *= 2; \ 2679 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2680 numNewMasks * __kmp_affin_mask_size); \ 2681 } \ 2682 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2683 nextNewMask++; \ 2684 } 2685 2686 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2687 { \ 2688 if (((_osId) > _maxOsId) || \ 2689 (! 
KMP_CPU_ISSET((_osId), KMP_CPU_INDEX((_osId2Mask), (_osId))))) { \ 2690 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2691 && (__kmp_affinity_type != affinity_none))) { \ 2692 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2693 } \ 2694 } \ 2695 else { \ 2696 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2697 } \ 2698 } 2699 2700 2701 // 2702 // Re-parse the proclist (for the explicit affinity type), and form the list 2703 // of affinity newMasks indexed by gtid. 2704 // 2705 static void 2706 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2707 unsigned int *out_numMasks, const char *proclist, 2708 kmp_affin_mask_t *osId2Mask, int maxOsId) 2709 { 2710 const char *scan = proclist; 2711 const char *next = proclist; 2712 2713 // 2714 // We use malloc() for the temporary mask vector, 2715 // so that we can use realloc() to extend it. 2716 // 2717 numNewMasks = 2; 2718 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2719 * __kmp_affin_mask_size); 2720 nextNewMask = 0; 2721 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2722 __kmp_affin_mask_size); 2723 int setSize = 0; 2724 2725 for (;;) { 2726 int start, end, stride; 2727 2728 SKIP_WS(scan); 2729 next = scan; 2730 if (*next == '\0') { 2731 break; 2732 } 2733 2734 if (*next == '{') { 2735 int num; 2736 setSize = 0; 2737 next++; // skip '{' 2738 SKIP_WS(next); 2739 scan = next; 2740 2741 // 2742 // Read the first integer in the set. 2743 // 2744 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2745 "bad proclist"); 2746 SKIP_DIGITS(next); 2747 num = __kmp_str_to_int(scan, *next); 2748 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2749 2750 // 2751 // Copy the mask for that osId to the sum (union) mask. 2752 // 2753 if ((num > maxOsId) || 2754 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2755 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2756 && (__kmp_affinity_type != affinity_none))) { 2757 KMP_WARNING(AffIgnoreInvalidProcID, num); 2758 } 2759 KMP_CPU_ZERO(sumMask); 2760 } 2761 else { 2762 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2763 setSize = 1; 2764 } 2765 2766 for (;;) { 2767 // 2768 // Check for end of set. 2769 // 2770 SKIP_WS(next); 2771 if (*next == '}') { 2772 next++; // skip '}' 2773 break; 2774 } 2775 2776 // 2777 // Skip optional comma. 2778 // 2779 if (*next == ',') { 2780 next++; 2781 } 2782 SKIP_WS(next); 2783 2784 // 2785 // Read the next integer in the set. 2786 // 2787 scan = next; 2788 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2789 "bad explicit proc list"); 2790 2791 SKIP_DIGITS(next); 2792 num = __kmp_str_to_int(scan, *next); 2793 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2794 2795 // 2796 // Add the mask for that osId to the sum mask. 2797 // 2798 if ((num > maxOsId) || 2799 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2800 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2801 && (__kmp_affinity_type != affinity_none))) { 2802 KMP_WARNING(AffIgnoreInvalidProcID, num); 2803 } 2804 } 2805 else { 2806 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2807 setSize++; 2808 } 2809 } 2810 if (setSize > 0) { 2811 ADD_MASK(sumMask); 2812 } 2813 2814 SKIP_WS(next); 2815 if (*next == ',') { 2816 next++; 2817 } 2818 scan = next; 2819 continue; 2820 } 2821 2822 // 2823 // Read the first integer. 
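// (Illustrative: for a proclist such as "0-7:2,9" this reads the leading
// "0"; the range and stride handling below then expands "0-7:2" into masks
// for OS procs 0, 2, 4, and 6, after which "9" is handled as a separate
// entry on the next iteration.)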
2824 // 2825 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2826 SKIP_DIGITS(next); 2827 start = __kmp_str_to_int(scan, *next); 2828 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2829 SKIP_WS(next); 2830 2831 // 2832 // If this isn't a range, then add a mask to the list and go on. 2833 // 2834 if (*next != '-') { 2835 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2836 2837 // 2838 // Skip optional comma. 2839 // 2840 if (*next == ',') { 2841 next++; 2842 } 2843 scan = next; 2844 continue; 2845 } 2846 2847 // 2848 // This is a range. Skip over the '-' and read in the 2nd int. 2849 // 2850 next++; // skip '-' 2851 SKIP_WS(next); 2852 scan = next; 2853 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2854 SKIP_DIGITS(next); 2855 end = __kmp_str_to_int(scan, *next); 2856 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2857 2858 // 2859 // Check for a stride parameter 2860 // 2861 stride = 1; 2862 SKIP_WS(next); 2863 if (*next == ':') { 2864 // 2865 // A stride is specified. Skip over the ':' and read the 3rd int. 2866 // 2867 int sign = +1; 2868 next++; // skip ':' 2869 SKIP_WS(next); 2870 scan = next; 2871 if (*next == '-') { 2872 sign = -1; 2873 next++; 2874 SKIP_WS(next); 2875 scan = next; 2876 } 2877 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2878 "bad explicit proc list"); 2879 SKIP_DIGITS(next); 2880 stride = __kmp_str_to_int(scan, *next); 2881 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2882 stride *= sign; 2883 } 2884 2885 // 2886 // Do some range checks. 2887 // 2888 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2889 if (stride > 0) { 2890 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2891 } 2892 else { 2893 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2894 } 2895 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2896 2897 // 2898 // Add the mask for each OS proc # to the list. 2899 // 2900 if (stride > 0) { 2901 do { 2902 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2903 start += stride; 2904 } while (start <= end); 2905 } 2906 else { 2907 do { 2908 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2909 start += stride; 2910 } while (start >= end); 2911 } 2912 2913 // 2914 // Skip optional comma. 2915 // 2916 SKIP_WS(next); 2917 if (*next == ',') { 2918 next++; 2919 } 2920 scan = next; 2921 } 2922 2923 *out_numMasks = nextNewMask; 2924 if (nextNewMask == 0) { 2925 *out_masks = NULL; 2926 KMP_INTERNAL_FREE(newMasks); 2927 return; 2928 } 2929 *out_masks 2930 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2931 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2932 __kmp_free(sumMask); 2933 KMP_INTERNAL_FREE(newMasks); 2934 } 2935 2936 2937 # if OMP_40_ENABLED 2938 2939 /*----------------------------------------------------------------------------- 2940 2941 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2942 places. Again, here is the grammar: 2943 2944 place_list := place 2945 place_list := place , place_list 2946 place := num 2947 place := place : num 2948 place := place : num : signed 2949 place := { subplace_list } 2950 place := !
place // (lowest priority) 2951 subplace_list := subplace 2952 subplace_list := subplace , subplace_list 2953 subplace := num 2954 subplace := num : num 2955 subplace := num : num : signed 2956 signed := num 2957 signed := + signed 2958 signed := - signed 2959 2960 -----------------------------------------------------------------------------*/ 2961 2962 static void 2963 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2964 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2965 { 2966 const char *next; 2967 2968 for (;;) { 2969 int start, count, stride, i; 2970 2971 // 2972 // Read in the starting proc id 2973 // 2974 SKIP_WS(*scan); 2975 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2976 "bad explicit places list"); 2977 next = *scan; 2978 SKIP_DIGITS(next); 2979 start = __kmp_str_to_int(*scan, *next); 2980 KMP_ASSERT(start >= 0); 2981 *scan = next; 2982 2983 // 2984 // valid follow sets are ',' ':' and '}' 2985 // 2986 SKIP_WS(*scan); 2987 if (**scan == '}' || **scan == ',') { 2988 if ((start > maxOsId) || 2989 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2990 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2991 && (__kmp_affinity_type != affinity_none))) { 2992 KMP_WARNING(AffIgnoreInvalidProcID, start); 2993 } 2994 } 2995 else { 2996 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2997 (*setSize)++; 2998 } 2999 if (**scan == '}') { 3000 break; 3001 } 3002 (*scan)++; // skip ',' 3003 continue; 3004 } 3005 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3006 (*scan)++; // skip ':' 3007 3008 // 3009 // Read count parameter 3010 // 3011 SKIP_WS(*scan); 3012 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3013 "bad explicit places list"); 3014 next = *scan; 3015 SKIP_DIGITS(next); 3016 count = __kmp_str_to_int(*scan, *next); 3017 KMP_ASSERT(count >= 0); 3018 *scan = next; 3019 3020 // 3021 // valid follow sets are ',' ':' and '}' 3022 // 3023 SKIP_WS(*scan); 3024 if (**scan == '}' || **scan == ',') { 3025 for (i = 0; i < count; i++) { 3026 if ((start > maxOsId) || 3027 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3028 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3029 && (__kmp_affinity_type != affinity_none))) { 3030 KMP_WARNING(AffIgnoreInvalidProcID, start); 3031 } 3032 break; // don't proliferate warnings for large count 3033 } 3034 else { 3035 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3036 start++; 3037 (*setSize)++; 3038 } 3039 } 3040 if (**scan == '}') { 3041 break; 3042 } 3043 (*scan)++; // skip ',' 3044 continue; 3045 } 3046 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 3047 (*scan)++; // skip ':' 3048 3049 // 3050 // Read stride parameter 3051 // 3052 int sign = +1; 3053 for (;;) { 3054 SKIP_WS(*scan); 3055 if (**scan == '+') { 3056 (*scan)++; // skip '+' 3057 continue; 3058 } 3059 if (**scan == '-') { 3060 sign *= -1; 3061 (*scan)++; // skip '-' 3062 continue; 3063 } 3064 break; 3065 } 3066 SKIP_WS(*scan); 3067 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 3068 "bad explicit places list"); 3069 next = *scan; 3070 SKIP_DIGITS(next); 3071 stride = __kmp_str_to_int(*scan, *next); 3072 KMP_ASSERT(stride >= 0); 3073 *scan = next; 3074 stride *= sign; 3075 3076 // 3077 // valid follow sets are ',' and '}' 3078 // 3079 SKIP_WS(*scan); 3080 if (**scan == '}' || **scan == ',') { 3081 for (i = 0; i < count; i++) { 3082 if ((start > maxOsId) || 3083 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 3084 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3085 && (__kmp_affinity_type != affinity_none))) { 3086 KMP_WARNING(AffIgnoreInvalidProcID, start); 3087 } 3088 break; // don't proliferate warnings for large count 3089 } 3090 else { 3091 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 3092 start += stride; 3093 (*setSize)++; 3094 } 3095 } 3096 if (**scan == '}') { 3097 break; 3098 } 3099 (*scan)++; // skip ',' 3100 continue; 3101 } 3102 3103 KMP_ASSERT2(0, "bad explicit places list"); 3104 } 3105 } 3106 3107 3108 static void 3109 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3110 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3111 { 3112 const char *next; 3113 3114 // 3115 // valid follow sets are '{' '!' and num 3116 // 3117 SKIP_WS(*scan); 3118 if (**scan == '{') { 3119 (*scan)++; // skip '{' 3120 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3121 setSize); 3122 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3123 (*scan)++; // skip '}' 3124 } 3125 else if (**scan == '!') { 3126 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3127 KMP_CPU_COMPLEMENT(tempMask); 3128 (*scan)++; // skip '!' 3129 } 3130 else if ((**scan >= '0') && (**scan <= '9')) { 3131 next = *scan; 3132 SKIP_DIGITS(next); 3133 int num = __kmp_str_to_int(*scan, *next); 3134 KMP_ASSERT(num >= 0); 3135 if ((num > maxOsId) || 3136 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3137 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3138 && (__kmp_affinity_type != affinity_none))) { 3139 KMP_WARNING(AffIgnoreInvalidProcID, num); 3140 } 3141 } 3142 else { 3143 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3144 (*setSize)++; 3145 } 3146 *scan = next; // skip num 3147 } 3148 else { 3149 KMP_ASSERT2(0, "bad explicit places list"); 3150 } 3151 } 3152 3153 3154 //static void 3155 void 3156 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3157 unsigned int *out_numMasks, const char *placelist, 3158 kmp_affin_mask_t *osId2Mask, int maxOsId) 3159 { 3160 const char *scan = placelist; 3161 const char *next = placelist; 3162 3163 numNewMasks = 2; 3164 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 3165 * __kmp_affin_mask_size); 3166 nextNewMask = 0; 3167 3168 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 3169 __kmp_affin_mask_size); 3170 KMP_CPU_ZERO(tempMask); 3171 int setSize = 0; 3172 3173 for (;;) { 3174 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3175 3176 // 3177 // valid follow sets are ',' ':' and EOL 3178 // 3179 SKIP_WS(scan); 3180 if (*scan == '\0' || *scan == ',') { 3181 if (setSize > 0) { 3182 ADD_MASK(tempMask); 3183 } 3184 KMP_CPU_ZERO(tempMask); 3185 setSize = 0; 3186 if (*scan == '\0') { 3187 break; 3188 } 3189 scan++; // skip ',' 3190 continue; 3191 } 3192 3193 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3194 scan++; // skip ':' 3195 3196 // 3197 // Read count parameter 3198 // 3199 SKIP_WS(scan); 3200 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3201 "bad explicit places list"); 3202 next = scan; 3203 SKIP_DIGITS(next); 3204 int count = __kmp_str_to_int(scan, *next); 3205 KMP_ASSERT(count >= 0); 3206 scan = next; 3207 3208 // 3209 // valid follow sets are ',' ':' and EOL 3210 // 3211 SKIP_WS(scan); 3212 int stride; 3213 if (*scan == '\0' || *scan == ',') { 3214 stride = +1; 3215 } 3216 else { 3217 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3218 
scan++; // skip ':' 3219 3220 // 3221 // Read stride parameter 3222 // 3223 int sign = +1; 3224 for (;;) { 3225 SKIP_WS(scan); 3226 if (*scan == '+') { 3227 scan++; // skip '+' 3228 continue; 3229 } 3230 if (*scan == '-') { 3231 sign *= -1; 3232 scan++; // skip '-' 3233 continue; 3234 } 3235 break; 3236 } 3237 SKIP_WS(scan); 3238 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3239 "bad explicit places list"); 3240 next = scan; 3241 SKIP_DIGITS(next); 3242 stride = __kmp_str_to_int(scan, *next); 3243 KMP_DEBUG_ASSERT(stride >= 0); 3244 scan = next; 3245 stride *= sign; 3246 } 3247 3248 if (stride > 0) { 3249 int i; 3250 for (i = 0; i < count; i++) { 3251 int j; 3252 if (setSize == 0) { 3253 break; 3254 } 3255 ADD_MASK(tempMask); 3256 setSize = 0; 3257 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 3258 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3259 KMP_CPU_CLR(j, tempMask); 3260 } 3261 else if ((j > maxOsId) || 3262 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3263 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3264 && (__kmp_affinity_type != affinity_none))) { 3265 KMP_WARNING(AffIgnoreInvalidProcID, j); 3266 } 3267 KMP_CPU_CLR(j, tempMask); 3268 } 3269 else { 3270 KMP_CPU_SET(j, tempMask); 3271 setSize++; 3272 } 3273 } 3274 for (; j >= 0; j--) { 3275 KMP_CPU_CLR(j, tempMask); 3276 } 3277 } 3278 } 3279 else { 3280 int i; 3281 for (i = 0; i < count; i++) { 3282 int j; 3283 if (setSize == 0) { 3284 break; 3285 } 3286 ADD_MASK(tempMask); 3287 setSize = 0; 3288 for (j = 0; j < ((int)__kmp_affin_mask_size * CHAR_BIT) + stride; 3289 j++) { 3290 if (! KMP_CPU_ISSET(j - stride, tempMask)) { 3291 KMP_CPU_CLR(j, tempMask); 3292 } 3293 else if ((j > maxOsId) || 3294 (! KMP_CPU_ISSET(j, KMP_CPU_INDEX(osId2Mask, j)))) { 3295 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3296 && (__kmp_affinity_type != affinity_none))) { 3297 KMP_WARNING(AffIgnoreInvalidProcID, j); 3298 } 3299 KMP_CPU_CLR(j, tempMask); 3300 } 3301 else { 3302 KMP_CPU_SET(j, tempMask); 3303 setSize++; 3304 } 3305 } 3306 for (; j < (int)__kmp_affin_mask_size * CHAR_BIT; j++) { 3307 KMP_CPU_CLR(j, tempMask); 3308 } 3309 } 3310 } 3311 KMP_CPU_ZERO(tempMask); 3312 setSize = 0; 3313 3314 // 3315 // valid follow sets are ',' and EOL 3316 // 3317 SKIP_WS(scan); 3318 if (*scan == '\0') { 3319 break; 3320 } 3321 if (*scan == ',') { 3322 scan++; // skip ',' 3323 continue; 3324 } 3325 3326 KMP_ASSERT2(0, "bad explicit places list"); 3327 } 3328 3329 *out_numMasks = nextNewMask; 3330 if (nextNewMask == 0) { 3331 *out_masks = NULL; 3332 KMP_INTERNAL_FREE(newMasks); 3333 return; 3334 } 3335 *out_masks 3336 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3337 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3338 __kmp_free(tempMask); 3339 KMP_INTERNAL_FREE(newMasks); 3340 } 3341 3342 # endif /* OMP_40_ENABLED */ 3343 3344 #undef ADD_MASK 3345 #undef ADD_MASK_OSID 3346 3347 3348 # if KMP_MIC 3349 3350 static void 3351 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3352 { 3353 if ( __kmp_place_num_cores == 0 ) { 3354 if ( __kmp_place_num_threads_per_core == 0 ) { 3355 return; // no cores limiting actions requested, exit 3356 } 3357 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3358 } 3359 if ( !__kmp_affinity_uniform_topology() ) { 3360 KMP_WARNING( AffThrPlaceNonUniform ); 3361 return; // don't support non-uniform topology 3362 } 3363 if ( depth != 3 ) { 3364 KMP_WARNING( AffThrPlaceNonThreeLevel ); 3365 return; // don't support 
topologies that are not 3-level 3366 } 3367 if ( __kmp_place_num_threads_per_core == 0 ) { 3368 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3369 } 3370 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3371 KMP_WARNING( AffThrPlaceManyCores ); 3372 return; 3373 } 3374 3375 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3376 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3377 int i, j, k, n_old = 0, n_new = 0; 3378 for ( i = 0; i < nPackages; ++i ) { 3379 for ( j = 0; j < nCoresPerPkg; ++j ) { 3380 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3381 n_old += __kmp_nThreadsPerCore; // skip non-requested core 3382 } else { 3383 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3384 if ( k < __kmp_place_num_threads_per_core ) { 3385 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core's data to new location 3386 n_new++; 3387 } 3388 n_old++; 3389 } 3390 } 3391 } 3392 } 3393 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3394 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3395 __kmp_avail_proc = n_new; // correct avail_proc 3396 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3397 3398 __kmp_free( *pAddr ); 3399 *pAddr = newAddr; // replace old topology with new one 3400 } 3401 3402 # endif /* KMP_MIC */ 3403 3404 3405 static AddrUnsPair *address2os = NULL; 3406 static int * procarr = NULL; 3407 static int __kmp_aff_depth = 0; 3408 3409 static void 3410 __kmp_aux_affinity_initialize(void) 3411 { 3412 if (__kmp_affinity_masks != NULL) { 3413 KMP_ASSERT(fullMask != NULL); 3414 return; 3415 } 3416 3417 // 3418 // Create the "full" mask - this defines all of the processors that we 3419 // consider to be in the machine model. If respect is set, then it is 3420 // the initialization thread's affinity mask. Otherwise, it is all 3421 // processors that we know about on the machine. 3422 // 3423 if (fullMask == NULL) { 3424 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3425 } 3426 if (KMP_AFFINITY_CAPABLE()) { 3427 if (__kmp_affinity_respect_mask) { 3428 __kmp_get_system_affinity(fullMask, TRUE); 3429 3430 // 3431 // Count the number of available processors. 3432 // 3433 unsigned i; 3434 __kmp_avail_proc = 0; 3435 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3436 if (! KMP_CPU_ISSET(i, fullMask)) { 3437 continue; 3438 } 3439 __kmp_avail_proc++; 3440 } 3441 if (__kmp_avail_proc > __kmp_xproc) { 3442 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3443 && (__kmp_affinity_type != affinity_none))) { 3444 KMP_WARNING(ErrorInitializeAffinity); 3445 } 3446 __kmp_affinity_type = affinity_none; 3447 __kmp_affin_mask_size = 0; 3448 return; 3449 } 3450 } 3451 else { 3452 __kmp_affinity_entire_machine_mask(fullMask); 3453 __kmp_avail_proc = __kmp_xproc; 3454 } 3455 } 3456 3457 int depth = -1; 3458 kmp_i18n_id_t msg_id = kmp_i18n_null; 3459 3460 // 3461 // For backward compatibility, setting KMP_CPUINFO_FILE => 3462 // KMP_TOPOLOGY_METHOD=cpuinfo 3463 // 3464 if ((__kmp_cpuinfo_file != NULL) && 3465 (__kmp_affinity_top_method == affinity_top_method_all)) { 3466 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3467 } 3468 3469 if (__kmp_affinity_top_method == affinity_top_method_all) { 3470 // 3471 // In the default code path, errors are not fatal - we just try using 3472 // another method.
We only emit a warning message if affinity is on, 3473 // or the verbose flag is set, and the nowarnings flag was not set. 3474 // 3475 const char *file_name = NULL; 3476 int line = 0; 3477 3478 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3479 3480 if (__kmp_affinity_verbose) { 3481 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3482 } 3483 3484 file_name = NULL; 3485 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3486 if (depth == 0) { 3487 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3488 KMP_ASSERT(address2os == NULL); 3489 return; 3490 } 3491 3492 if (depth < 0) { 3493 if (__kmp_affinity_verbose) { 3494 if (msg_id != kmp_i18n_null) { 3495 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3496 KMP_I18N_STR(DecodingLegacyAPIC)); 3497 } 3498 else { 3499 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(DecodingLegacyAPIC)); 3500 } 3501 } 3502 3503 file_name = NULL; 3504 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3505 if (depth == 0) { 3506 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3507 KMP_ASSERT(address2os == NULL); 3508 return; 3509 } 3510 } 3511 3512 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3513 3514 # if KMP_OS_LINUX 3515 3516 if (depth < 0) { 3517 if (__kmp_affinity_verbose) { 3518 if (msg_id != kmp_i18n_null) { 3519 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3520 } 3521 else { 3522 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3523 } 3524 } 3525 3526 FILE *f = fopen("/proc/cpuinfo", "r"); 3527 if (f == NULL) { 3528 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3529 } 3530 else { 3531 file_name = "/proc/cpuinfo"; 3532 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3533 fclose(f); 3534 if (depth == 0) { 3535 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3536 KMP_ASSERT(address2os == NULL); 3537 return; 3538 } 3539 } 3540 } 3541 3542 # endif /* KMP_OS_LINUX */ 3543 3544 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64 3545 3546 if ((depth < 0) && (__kmp_num_proc_groups > 1)) { 3547 if (__kmp_affinity_verbose) { 3548 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3549 } 3550 3551 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3552 KMP_ASSERT(depth != 0); 3553 } 3554 3555 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ 3556 3557 if (depth < 0) { 3558 if (__kmp_affinity_verbose && (msg_id != kmp_i18n_null)) { 3559 if (file_name == NULL) { 3560 KMP_INFORM(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3561 } 3562 else if (line == 0) { 3563 KMP_INFORM(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3564 } 3565 else { 3566 KMP_INFORM(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3567 } 3568 } 3569 // FIXME - print msg if msg_id = kmp_i18n_null ??? 3570 3571 file_name = ""; 3572 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3573 if (depth == 0) { 3574 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3575 KMP_ASSERT(address2os == NULL); 3576 return; 3577 } 3578 KMP_ASSERT(depth > 0); 3579 KMP_ASSERT(address2os != NULL); 3580 } 3581 } 3582 3583 // 3584 // If the user has specified that a particular topology discovery method 3585 // is to be used, then we abort if that method fails. The exception is 3586 // group affinity, which might have been implicitly set.
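// For example, KMP_TOPOLOGY_METHOD=cpuinfo forces the /proc/cpuinfo parser
// below and treats any failure there as fatal, whereas the default path
// above fell back from the x2APIC id method to the legacy APIC id method,
// then to /proc/cpuinfo, then (on Windows* OS with multiple processor
// groups) to the group map, and finally to a flat OS proc map.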
3587 // 3588 3589 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3590 3591 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3592 if (__kmp_affinity_verbose) { 3593 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3594 KMP_I18N_STR(Decodingx2APIC)); 3595 } 3596 3597 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3598 if (depth == 0) { 3599 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3600 KMP_ASSERT(address2os == NULL); 3601 return; 3602 } 3603 if (depth < 0) { 3604 KMP_ASSERT(msg_id != kmp_i18n_null); 3605 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3606 } 3607 } 3608 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3609 if (__kmp_affinity_verbose) { 3610 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3611 KMP_I18N_STR(DecodingLegacyAPIC)); 3612 } 3613 3614 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3615 if (depth == 0) { 3616 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3617 KMP_ASSERT(address2os == NULL); 3618 return; 3619 } 3620 if (depth < 0) { 3621 KMP_ASSERT(msg_id != kmp_i18n_null); 3622 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3623 } 3624 } 3625 3626 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3627 3628 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3629 const char *filename; 3630 if (__kmp_cpuinfo_file != NULL) { 3631 filename = __kmp_cpuinfo_file; 3632 } 3633 else { 3634 filename = "/proc/cpuinfo"; 3635 } 3636 3637 if (__kmp_affinity_verbose) { 3638 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3639 } 3640 3641 FILE *f = fopen(filename, "r"); 3642 if (f == NULL) { 3643 int code = errno; 3644 if (__kmp_cpuinfo_file != NULL) { 3645 __kmp_msg( 3646 kmp_ms_fatal, 3647 KMP_MSG(CantOpenFileForReading, filename), 3648 KMP_ERR(code), 3649 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3650 __kmp_msg_null 3651 ); 3652 } 3653 else { 3654 __kmp_msg( 3655 kmp_ms_fatal, 3656 KMP_MSG(CantOpenFileForReading, filename), 3657 KMP_ERR(code), 3658 __kmp_msg_null 3659 ); 3660 } 3661 } 3662 int line = 0; 3663 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3664 fclose(f); 3665 if (depth < 0) { 3666 KMP_ASSERT(msg_id != kmp_i18n_null); 3667 if (line > 0) { 3668 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3669 } 3670 else { 3671 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3672 } 3673 } 3674 if (__kmp_affinity_type == affinity_none) { 3675 KMP_ASSERT(depth == 0); 3676 KMP_ASSERT(address2os == NULL); 3677 return; 3678 } 3679 } 3680 3681 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64 3682 3683 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3684 if (__kmp_affinity_verbose) { 3685 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3686 } 3687 3688 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3689 KMP_ASSERT(depth != 0); 3690 if (depth < 0) { 3691 KMP_ASSERT(msg_id != kmp_i18n_null); 3692 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3693 } 3694 } 3695 3696 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ 3697 3698 else if (__kmp_affinity_top_method == affinity_top_method_flat) { 3699 if (__kmp_affinity_verbose) { 3700 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3701 } 3702 3703 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3704 if (depth == 0) { 3705 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3706 KMP_ASSERT(address2os == NULL); 3707 return; 3708 } 3709 // should not fail 3710 KMP_ASSERT(depth > 0); 3711 KMP_ASSERT(address2os != NULL); 3712 } 3713 3714 if (address2os 
== NULL) { 3715 if (KMP_AFFINITY_CAPABLE() 3716 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3717 && (__kmp_affinity_type != affinity_none)))) { 3718 KMP_WARNING(ErrorInitializeAffinity); 3719 } 3720 __kmp_affinity_type = affinity_none; 3721 __kmp_affin_mask_size = 0; 3722 return; 3723 } 3724 3725 # if KMP_MIC 3726 __kmp_apply_thread_places(&address2os, depth); 3727 # endif 3728 3729 // 3730 // Create the table of masks, indexed by thread Id. 3731 // 3732 unsigned maxIndex; 3733 unsigned numUnique; 3734 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3735 address2os, __kmp_avail_proc); 3736 if (__kmp_affinity_gran_levels == 0) { 3737 KMP_DEBUG_ASSERT((int)numUnique == __kmp_avail_proc); 3738 } 3739 3740 // 3741 // Set the childNums vector in all Address objects. This must be done 3742 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3743 // which takes into account the setting of __kmp_affinity_compact. 3744 // 3745 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3746 3747 switch (__kmp_affinity_type) { 3748 3749 case affinity_explicit: 3750 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3751 # if OMP_40_ENABLED 3752 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3753 # endif 3754 { 3755 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3756 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3757 maxIndex); 3758 } 3759 # if OMP_40_ENABLED 3760 else { 3761 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3762 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3763 maxIndex); 3764 } 3765 # endif 3766 if (__kmp_affinity_num_masks == 0) { 3767 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3768 && (__kmp_affinity_type != affinity_none))) { 3769 KMP_WARNING(AffNoValidProcID); 3770 } 3771 __kmp_affinity_type = affinity_none; 3772 return; 3773 } 3774 break; 3775 3776 // 3777 // The other affinity types rely on sorting the Addresses according 3778 // to some permutation of the machine topology tree. Set 3779 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3780 // then jump to a common code fragment to do the sort and create 3781 // the array of affinity masks. 
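// Illustrative summary of the cases below: "compact" keeps neighboring
// gtids as close together in the topology as possible (threads sharing a
// core get adjacent masks), while "scatter" inverts the permute value
// (depth - 1 - __kmp_affinity_compact) so that neighboring gtids are spread
// across packages first.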
3782 // 3783 3784 case affinity_logical: 3785 __kmp_affinity_compact = 0; 3786 if (__kmp_affinity_offset) { 3787 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3788 % __kmp_avail_proc; 3789 } 3790 goto sortAddresses; 3791 3792 case affinity_physical: 3793 if (__kmp_nThreadsPerCore > 1) { 3794 __kmp_affinity_compact = 1; 3795 if (__kmp_affinity_compact >= depth) { 3796 __kmp_affinity_compact = 0; 3797 } 3798 } else { 3799 __kmp_affinity_compact = 0; 3800 } 3801 if (__kmp_affinity_offset) { 3802 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3803 % __kmp_avail_proc; 3804 } 3805 goto sortAddresses; 3806 3807 case affinity_scatter: 3808 if (__kmp_affinity_compact >= depth) { 3809 __kmp_affinity_compact = 0; 3810 } 3811 else { 3812 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3813 } 3814 goto sortAddresses; 3815 3816 case affinity_compact: 3817 if (__kmp_affinity_compact >= depth) { 3818 __kmp_affinity_compact = depth - 1; 3819 } 3820 goto sortAddresses; 3821 3822 # if KMP_MIC 3823 case affinity_balanced: 3824 // Balanced works only for the case of a single package and uniform topology 3825 if( nPackages > 1 ) { 3826 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3827 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3828 } 3829 __kmp_affinity_type = affinity_none; 3830 return; 3831 } else if( __kmp_affinity_uniform_topology() ) { 3832 break; 3833 } else { // Non-uniform topology 3834 3835 // Save the depth for further usage 3836 __kmp_aff_depth = depth; 3837 3838 // Number of hyper threads per core in HT machine 3839 int nth_per_core = __kmp_nThreadsPerCore; 3840 3841 int core_level; 3842 if( nth_per_core > 1 ) { 3843 core_level = depth - 2; 3844 } else { 3845 core_level = depth - 1; 3846 } 3847 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3848 int nproc = nth_per_core * ncores; 3849 3850 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3851 for( int i = 0; i < nproc; i++ ) { 3852 procarr[ i ] = -1; 3853 } 3854 3855 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3856 int proc = address2os[ i ].second; 3857 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3858 // If there is only one thread per core then depth == 2: level 0 - package, 3859 // level 1 - core. 3860 int level = depth - 1; 3861 3862 // __kmp_nth_per_core == 1 3863 int thread = 0; 3864 int core = address2os[ i ].first.labels[ level ]; 3865 // If the thread level exists, that is we have more than one thread context per core 3866 if( nth_per_core > 1 ) { 3867 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3868 core = address2os[ i ].first.labels[ level - 1 ]; 3869 } 3870 procarr[ core * nth_per_core + thread ] = proc; 3871 } 3872 3873 break; 3874 } 3875 # endif 3876 3877 sortAddresses: 3878 // 3879 // Allocate the gtid->affinity mask table. 
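// Illustrative sizing example (made-up machine): with 8 available OS procs,
// 2 threads per core, and granularity=core, __kmp_create_masks() returned
// numUnique == 4. If __kmp_affinity_dups is set we still allocate 8 mask
// slots so that every OS proc has its own entry; otherwise only the 4
// distinct per-core masks are kept.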
3880 // 3881 if (__kmp_affinity_dups) { 3882 __kmp_affinity_num_masks = __kmp_avail_proc; 3883 } 3884 else { 3885 __kmp_affinity_num_masks = numUnique; 3886 } 3887 3888 # if OMP_40_ENABLED 3889 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3890 && ( __kmp_affinity_num_places > 0 ) 3891 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3892 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3893 } 3894 # endif 3895 3896 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3897 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3898 3899 // 3900 // Sort the address2os table according to the current setting of 3901 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3902 // 3903 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3904 __kmp_affinity_cmp_Address_child_num); 3905 { 3906 int i; 3907 unsigned j; 3908 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3909 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3910 continue; 3911 } 3912 unsigned osId = address2os[i].second; 3913 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3914 kmp_affin_mask_t *dest 3915 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3916 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3917 KMP_CPU_COPY(dest, src); 3918 if (++j >= __kmp_affinity_num_masks) { 3919 break; 3920 } 3921 } 3922 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3923 } 3924 break; 3925 3926 default: 3927 KMP_ASSERT2(0, "Unexpected affinity setting"); 3928 } 3929 3930 __kmp_free(osId2Mask); 3931 machine_hierarchy.init(address2os, __kmp_avail_proc); 3932 } 3933 3934 3935 void 3936 __kmp_affinity_initialize(void) 3937 { 3938 // 3939 // Much of the code above was written assumming that if a machine was not 3940 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3941 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3942 // 3943 // There are too many checks for __kmp_affinity_type == affinity_none 3944 // in this code. Instead of trying to change them all, check if 3945 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3946 // affinity_none, call the real initialization routine, then restore 3947 // __kmp_affinity_type to affinity_disabled. 3948 // 3949 int disabled = (__kmp_affinity_type == affinity_disabled); 3950 if (! KMP_AFFINITY_CAPABLE()) { 3951 KMP_ASSERT(disabled); 3952 } 3953 if (disabled) { 3954 __kmp_affinity_type = affinity_none; 3955 } 3956 __kmp_aux_affinity_initialize(); 3957 if (disabled) { 3958 __kmp_affinity_type = affinity_disabled; 3959 } 3960 } 3961 3962 3963 void 3964 __kmp_affinity_uninitialize(void) 3965 { 3966 if (__kmp_affinity_masks != NULL) { 3967 __kmp_free(__kmp_affinity_masks); 3968 __kmp_affinity_masks = NULL; 3969 } 3970 if (fullMask != NULL) { 3971 KMP_CPU_FREE(fullMask); 3972 fullMask = NULL; 3973 } 3974 __kmp_affinity_num_masks = 0; 3975 # if OMP_40_ENABLED 3976 __kmp_affinity_num_places = 0; 3977 # endif 3978 if (__kmp_affinity_proclist != NULL) { 3979 __kmp_free(__kmp_affinity_proclist); 3980 __kmp_affinity_proclist = NULL; 3981 } 3982 if( address2os != NULL ) { 3983 __kmp_free( address2os ); 3984 address2os = NULL; 3985 } 3986 if( procarr != NULL ) { 3987 __kmp_free( procarr ); 3988 procarr = NULL; 3989 } 3990 } 3991 3992 3993 void 3994 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3995 { 3996 if (! 

void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none)
# if KMP_MIC
          || (__kmp_affinity_type == affinity_balanced)
# endif
          ) {
# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(), gtid,
          buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
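    // The second argument of __kmp_set_system_affinity() appears to be an
    // abort-on-error flag: FALSE below, so a failure is tolerated when the
    // user did not ask for affinity, and TRUE otherwise so a failed
    // explicitly requested binding is reported.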
4109 // 4110 if ( __kmp_affinity_type == affinity_none ) { 4111 __kmp_set_system_affinity(th->th.th_affin_mask, FALSE); 4112 } 4113 else 4114 # endif 4115 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4116 } 4117 4118 4119 # if OMP_40_ENABLED 4120 4121 void 4122 __kmp_affinity_set_place(int gtid) 4123 { 4124 int retval; 4125 4126 if (! KMP_AFFINITY_CAPABLE()) { 4127 return; 4128 } 4129 4130 kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]); 4131 4132 KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n", 4133 gtid, th->th.th_new_place, th->th.th_current_place)); 4134 4135 // 4136 // Check that the new place is within this thread's partition. 4137 // 4138 KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL); 4139 KMP_ASSERT(th->th.th_new_place >= 0); 4140 KMP_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks); 4141 if (th->th.th_first_place <= th->th.th_last_place) { 4142 KMP_ASSERT((th->th.th_new_place >= th->th.th_first_place) 4143 && (th->th.th_new_place <= th->th.th_last_place)); 4144 } 4145 else { 4146 KMP_ASSERT((th->th.th_new_place <= th->th.th_first_place) 4147 || (th->th.th_new_place >= th->th.th_last_place)); 4148 } 4149 4150 // 4151 // Copy the thread mask to the kmp_info_t strucuture, 4152 // and set this thread's affinity. 4153 // 4154 kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks, 4155 th->th.th_new_place); 4156 KMP_CPU_COPY(th->th.th_affin_mask, mask); 4157 th->th.th_current_place = th->th.th_new_place; 4158 4159 if (__kmp_affinity_verbose) { 4160 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4161 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4162 th->th.th_affin_mask); 4163 KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", (kmp_int32)getpid(), 4164 gtid, buf); 4165 } 4166 __kmp_set_system_affinity(th->th.th_affin_mask, TRUE); 4167 } 4168 4169 # endif /* OMP_40_ENABLED */ 4170 4171 4172 int 4173 __kmp_aux_set_affinity(void **mask) 4174 { 4175 int gtid; 4176 kmp_info_t *th; 4177 int retval; 4178 4179 if (! KMP_AFFINITY_CAPABLE()) { 4180 return -1; 4181 } 4182 4183 gtid = __kmp_entry_gtid(); 4184 KA_TRACE(1000, ;{ 4185 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 4186 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 4187 (kmp_affin_mask_t *)(*mask)); 4188 __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n", 4189 gtid, buf); 4190 }); 4191 4192 if (__kmp_env_consistency_check) { 4193 if ((mask == NULL) || (*mask == NULL)) { 4194 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 4195 } 4196 else { 4197 unsigned proc; 4198 int num_procs = 0; 4199 4200 for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) { 4201 if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) { 4202 continue; 4203 } 4204 num_procs++; 4205 if (! 

int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;

    //
    // Turn off 4.0 affinity for the current thread at this parallel level.
    //
    th->th.th_current_task->td_icvs.proc_bind = proc_bind_false;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}


int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}

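//
// The unset variant below uses the same return convention as
// __kmp_aux_set_affinity_mask_proc() above: -1 for an out-of-range proc id
// (or when affinity is not supported), -2 when the proc is not in fullMask,
// and 0 on success.  The get variant simply reports such procs as not set.
//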

int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

# if KMP_MIC

// Dynamic affinity settings - Affinity balanced
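// Bind thread tid out of nthreads so that threads are spread as evenly as
// possible over the cores of a single package.  For a uniform topology the
// target core/thread is computed directly from tid; for a non-uniform
// topology the procarr[] table built in __kmp_aux_affinity_initialize() is
// used to distribute threads over the cores that actually have processors.
// The mask granularity follows __kmp_affinity_gran (thread vs. core).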
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / (chunk + 1 );
            threadID = ( tid % (chunk + 1 ) ) % __kmp_nth_per_core ;
        } else { //tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core ;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For performance gain consider the special case nthreads == __kmp_avail_proc
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be no more than
                // nth_per_core of them, and since address2os is sorted we can break when
                // cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != - 1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with # procs from x to nth_per_core
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }

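            // Greedy distribution of the nthreads over the available thread
            // contexts.  Each sweep of the while loop below gives every free
            // context at most one thread, visiting cores breadth-first (j == 1
            // touches one context per core, j == 2 a second context on cores
            // that have at least two, and so on); once every context holds a
            // thread (flag != 0) further sweeps stack additional threads onto
            // the same contexts.  newarr[] ends up with the per-context thread
            // counts, and the prefix-sum scan that follows maps tid to its
            // context.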
            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads for each thread context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ]++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", (kmp_int32)getpid(),
              tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

# endif /* KMP_MIC */

#else
// affinity not supported

kmp_uint32 mac_skipPerLevel[7];
kmp_uint32 mac_depth;
kmp_uint8 mac_leaf_kids;
void __kmp_get_hierarchy(kmp_uint32 nproc, kmp_bstate_t *thr_bar) {
    static int first = 1;
    if (first) {
        const kmp_uint32 maxLevels = 7;
        kmp_uint32 numPerLevel[maxLevels];

        for (kmp_uint32 i=0; i<maxLevels; ++i) { // init numPerLevel[*] to 1 item per level
            numPerLevel[i] = 1;
            mac_skipPerLevel[i] = 1;
        }

        mac_depth = 2;
        numPerLevel[0] = nproc;

        kmp_uint32 branch = 4;
        if (numPerLevel[0] == 1) branch = nproc/4;
        if (branch<4) branch=4;
        for (kmp_uint32 d=0; d<mac_depth-1; ++d) { // optimize hierarchy width
            while (numPerLevel[d] > branch || (d==0 && numPerLevel[d]>4)) { // max 4 on level 0!
                if (numPerLevel[d] & 1) numPerLevel[d]++;
                numPerLevel[d] = numPerLevel[d] >> 1;
                if (numPerLevel[d+1] == 1) mac_depth++;
                numPerLevel[d+1] = numPerLevel[d+1] << 1;
            }
            if(numPerLevel[0] == 1) {
                branch = branch >> 1;
                if (branch<4) branch = 4;
            }
        }

        for (kmp_uint32 i=1; i<mac_depth; ++i)
            mac_skipPerLevel[i] = numPerLevel[i-1] * mac_skipPerLevel[i-1];
        mac_leaf_kids = (kmp_uint8)numPerLevel[0]-1;
        first=0;
    }
    thr_bar->depth = mac_depth;
    thr_bar->base_leaf_kids = mac_leaf_kids;
    thr_bar->skip_per_level = mac_skipPerLevel;
}

#endif // KMP_AFFINITY_SUPPORTED