1 /* 2 * kmp_affinity.cpp -- affinity management 3 * $Revision: 42810 $ 4 * $Date: 2013-11-07 12:06:33 -0600 (Thu, 07 Nov 2013) $ 5 */ 6 7 8 //===----------------------------------------------------------------------===// 9 // 10 // The LLVM Compiler Infrastructure 11 // 12 // This file is dual licensed under the MIT and the University of Illinois Open 13 // Source Licenses. See LICENSE.txt for details. 14 // 15 //===----------------------------------------------------------------------===// 16 17 18 #include "kmp.h" 19 #include "kmp_i18n.h" 20 #include "kmp_io.h" 21 #include "kmp_str.h" 22 23 24 #if KMP_AFFINITY_SUPPORTED 25 26 // 27 // Print the affinity mask to the character array in a pretty format. 28 // 29 char * 30 __kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask) 31 { 32 KMP_ASSERT(buf_len >= 40); 33 char *scan = buf; 34 char *end = buf + buf_len - 1; 35 36 // 37 // Find first element / check for empty set. 38 // 39 size_t i; 40 for (i = 0; i < KMP_CPU_SETSIZE; i++) { 41 if (KMP_CPU_ISSET(i, mask)) { 42 break; 43 } 44 } 45 if (i == KMP_CPU_SETSIZE) { 46 sprintf(scan, "{<empty>}"); 47 while (*scan != '\0') scan++; 48 KMP_ASSERT(scan <= end); 49 return buf; 50 } 51 52 sprintf(scan, "{%ld", i); 53 while (*scan != '\0') scan++; 54 i++; 55 for (; i < KMP_CPU_SETSIZE; i++) { 56 if (! KMP_CPU_ISSET(i, mask)) { 57 continue; 58 } 59 60 // 61 // Check for buffer overflow. A string of the form ",<n>" will have 62 // at most 10 characters, plus we want to leave room to print ",...}" 63 // if the set is too large to print for a total of 15 characters. 64 // We already left room for '\0' in setting end. 65 // 66 if (end - scan < 15) { 67 break; 68 } 69 sprintf(scan, ",%-ld", i); 70 while (*scan != '\0') scan++; 71 } 72 if (i < KMP_CPU_SETSIZE) { 73 sprintf(scan, ",..."); 74 while (*scan != '\0') scan++; 75 } 76 sprintf(scan, "}"); 77 while (*scan != '\0') scan++; 78 KMP_ASSERT(scan <= end); 79 return buf; 80 } 81 82 83 void 84 __kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask) 85 { 86 KMP_CPU_ZERO(mask); 87 88 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64 89 90 if (__kmp_num_proc_groups > 1) { 91 int group; 92 struct GROUP_AFFINITY ga; 93 KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL); 94 for (group = 0; group < __kmp_num_proc_groups; group++) { 95 int i; 96 int num = __kmp_GetActiveProcessorCount(group); 97 for (i = 0; i < num; i++) { 98 KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask); 99 } 100 } 101 } 102 else 103 104 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ 105 106 { 107 int proc; 108 for (proc = 0; proc < __kmp_xproc; proc++) { 109 KMP_CPU_SET(proc, mask); 110 } 111 } 112 } 113 114 115 // 116 // In Linux* OS debug & cover (-O0) builds, we need to avoid inline member 117 // functions. 118 // 119 // The icc codegen emits sections with extremely long names, of the form 120 // ".gnu.linkonce.<mangled_name>". There seems to have been a linker bug 121 // introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving 122 // some sort of memory corruption or table overflow that is triggered by 123 // these long strings. I checked the latest version of the linker - 124 // GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not 125 // fixed. 126 // 127 // Unfortunately, my attempts to reproduce it in a smaller example have 128 // failed - I'm not sure what the prospects are of getting it fixed 129 // properly - but we need a reproducer smaller than all of libiomp. 
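
//
// Illustrative sketch (not part of the runtime; buffer size and printing
// usage are assumptions): how the mask printer above is typically driven,
// and the shape of its output -- "{0,1,2,3}" on a small machine,
// "{<empty>}" for an empty set, and "{0,1,...}" once the buffer runs low.
//
#if 0
static void example_print_entire_machine_mask()
{
    kmp_affin_mask_t *mask;
    KMP_CPU_ALLOC(mask);
    __kmp_affinity_entire_machine_mask(mask);   // set every modeled proc

    char buf[KMP_AFFIN_MASK_PRINT_LEN];         // >= 40 chars, per the assert
    __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
    __kmp_printf("full machine mask: %s\n", buf);

    KMP_CPU_FREE(mask);
}
#endif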
130 // 131 // Work around the problem by avoiding inline constructors in such builds. 132 // We do this for all platforms, not just Linux* OS - non-inline functions are 133 // more debuggable and provide better coverage into than inline functions. 134 // Use inline functions in shipping libs, for performance. 135 // 136 137 # if !defined(KMP_DEBUG) && !defined(COVER) 138 139 class Address { 140 public: 141 static const unsigned maxDepth = 32; 142 unsigned labels[maxDepth]; 143 unsigned childNums[maxDepth]; 144 unsigned depth; 145 unsigned leader; 146 Address(unsigned _depth) 147 : depth(_depth), leader(FALSE) { 148 } 149 Address &operator=(const Address &b) { 150 depth = b.depth; 151 for (unsigned i = 0; i < depth; i++) { 152 labels[i] = b.labels[i]; 153 childNums[i] = b.childNums[i]; 154 } 155 leader = FALSE; 156 return *this; 157 } 158 bool operator==(const Address &b) const { 159 if (depth != b.depth) 160 return false; 161 for (unsigned i = 0; i < depth; i++) 162 if(labels[i] != b.labels[i]) 163 return false; 164 return true; 165 } 166 bool isClose(const Address &b, int level) const { 167 if (depth != b.depth) 168 return false; 169 if ((unsigned)level >= depth) 170 return true; 171 for (unsigned i = 0; i < (depth - level); i++) 172 if(labels[i] != b.labels[i]) 173 return false; 174 return true; 175 } 176 bool operator!=(const Address &b) const { 177 return !operator==(b); 178 } 179 }; 180 181 class AddrUnsPair { 182 public: 183 Address first; 184 unsigned second; 185 AddrUnsPair(Address _first, unsigned _second) 186 : first(_first), second(_second) { 187 } 188 AddrUnsPair &operator=(const AddrUnsPair &b) 189 { 190 first = b.first; 191 second = b.second; 192 return *this; 193 } 194 }; 195 196 # else 197 198 class Address { 199 public: 200 static const unsigned maxDepth = 32; 201 unsigned labels[maxDepth]; 202 unsigned childNums[maxDepth]; 203 unsigned depth; 204 unsigned leader; 205 Address(unsigned _depth); 206 Address &operator=(const Address &b); 207 bool operator==(const Address &b) const; 208 bool isClose(const Address &b, int level) const; 209 bool operator!=(const Address &b) const; 210 }; 211 212 Address::Address(unsigned _depth) 213 { 214 depth = _depth; 215 leader = FALSE; 216 } 217 218 Address &Address::operator=(const Address &b) { 219 depth = b.depth; 220 for (unsigned i = 0; i < depth; i++) { 221 labels[i] = b.labels[i]; 222 childNums[i] = b.childNums[i]; 223 } 224 leader = FALSE; 225 return *this; 226 } 227 228 bool Address::operator==(const Address &b) const { 229 if (depth != b.depth) 230 return false; 231 for (unsigned i = 0; i < depth; i++) 232 if(labels[i] != b.labels[i]) 233 return false; 234 return true; 235 } 236 237 bool Address::isClose(const Address &b, int level) const { 238 if (depth != b.depth) 239 return false; 240 if ((unsigned)level >= depth) 241 return true; 242 for (unsigned i = 0; i < (depth - level); i++) 243 if(labels[i] != b.labels[i]) 244 return false; 245 return true; 246 } 247 248 bool Address::operator!=(const Address &b) const { 249 return !operator==(b); 250 } 251 252 class AddrUnsPair { 253 public: 254 Address first; 255 unsigned second; 256 AddrUnsPair(Address _first, unsigned _second); 257 AddrUnsPair &operator=(const AddrUnsPair &b); 258 }; 259 260 AddrUnsPair::AddrUnsPair(Address _first, unsigned _second) 261 : first(_first), second(_second) 262 { 263 } 264 265 AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b) 266 { 267 first = b.first; 268 second = b.second; 269 return *this; 270 } 271 272 # endif /* !defined(KMP_DEBUG) && 
!defined(COVER) */ 273 274 275 static int 276 __kmp_affinity_cmp_Address_labels(const void *a, const void *b) 277 { 278 const Address *aa = (const Address *)&(((AddrUnsPair *)a) 279 ->first); 280 const Address *bb = (const Address *)&(((AddrUnsPair *)b) 281 ->first); 282 unsigned depth = aa->depth; 283 unsigned i; 284 KMP_DEBUG_ASSERT(depth == bb->depth); 285 for (i = 0; i < depth; i++) { 286 if (aa->labels[i] < bb->labels[i]) return -1; 287 if (aa->labels[i] > bb->labels[i]) return 1; 288 } 289 return 0; 290 } 291 292 293 static int 294 __kmp_affinity_cmp_Address_child_num(const void *a, const void *b) 295 { 296 const Address *aa = (const Address *)&(((AddrUnsPair *)a) 297 ->first); 298 const Address *bb = (const Address *)&(((AddrUnsPair *)b) 299 ->first); 300 unsigned depth = aa->depth; 301 unsigned i; 302 KMP_DEBUG_ASSERT(depth == bb->depth); 303 KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth); 304 KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0); 305 for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) { 306 int j = depth - i - 1; 307 if (aa->childNums[j] < bb->childNums[j]) return -1; 308 if (aa->childNums[j] > bb->childNums[j]) return 1; 309 } 310 for (; i < depth; i++) { 311 int j = i - __kmp_affinity_compact; 312 if (aa->childNums[j] < bb->childNums[j]) return -1; 313 if (aa->childNums[j] > bb->childNums[j]) return 1; 314 } 315 return 0; 316 } 317 318 319 // 320 // When sorting by labels, __kmp_affinity_assign_child_nums() must first be 321 // called to renumber the labels from [0..n] and place them into the child_num 322 // vector of the address object. This is done in case the labels used for 323 // the children at one node of the hierarchy differ from those used for 324 // another node at the same level. Example: suppose the machine has 2 nodes 325 // with 2 packages each. The first node contains packages 601 and 602, and 326 // second node contains packages 603 and 604. If we try to sort the table 327 // for "scatter" affinity, the table will still be sorted 601, 602, 603, 604 328 // because we are paying attention to the labels themselves, not the ordinal 329 // child numbers. By using the child numbers in the sort, the result is 330 // {0,0}=601, {0,1}=603, {1,0}=602, {1,1}=604. 
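
//
// Illustrative sketch (assumed call sequence; the table pointer and length
// are hypothetical): how the comparator above and the renumbering routine
// below are meant to be combined when sorting for a compact/scatter
// placement.
//
#if 0
static void example_sort_by_child_nums(AddrUnsPair *table, int nAddrs)
{
    // Convert raw hardware labels (e.g. packages 601..604) into ordinal
    // child numbers first, so that siblings under different parents are
    // numbered consistently from 0.
    __kmp_affinity_assign_child_nums(table, nAddrs);

    // Sort using the ordinal child numbers; __kmp_affinity_compact controls
    // how many of the innermost levels get priority in the comparison.
    qsort(table, nAddrs, sizeof(*table),
      __kmp_affinity_cmp_Address_child_num);
}
#endif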
331 // 332 static void 333 __kmp_affinity_assign_child_nums(AddrUnsPair *address2os, 334 int numAddrs) 335 { 336 KMP_DEBUG_ASSERT(numAddrs > 0); 337 int depth = address2os->first.depth; 338 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 339 unsigned *lastLabel = (unsigned *)__kmp_allocate(depth 340 * sizeof(unsigned)); 341 int labCt; 342 for (labCt = 0; labCt < depth; labCt++) { 343 address2os[0].first.childNums[labCt] = counts[labCt] = 0; 344 lastLabel[labCt] = address2os[0].first.labels[labCt]; 345 } 346 int i; 347 for (i = 1; i < numAddrs; i++) { 348 for (labCt = 0; labCt < depth; labCt++) { 349 if (address2os[i].first.labels[labCt] != lastLabel[labCt]) { 350 int labCt2; 351 for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) { 352 counts[labCt2] = 0; 353 lastLabel[labCt2] = address2os[i].first.labels[labCt2]; 354 } 355 counts[labCt]++; 356 lastLabel[labCt] = address2os[i].first.labels[labCt]; 357 break; 358 } 359 } 360 for (labCt = 0; labCt < depth; labCt++) { 361 address2os[i].first.childNums[labCt] = counts[labCt]; 362 } 363 for (; labCt < (int)Address::maxDepth; labCt++) { 364 address2os[i].first.childNums[labCt] = 0; 365 } 366 } 367 } 368 369 370 // 371 // All of the __kmp_affinity_create_*_map() routines should set 372 // __kmp_affinity_masks to a vector of affinity mask objects of length 373 // __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and 374 // return the number of levels in the machine topology tree (zero if 375 // __kmp_affinity_type == affinity_none). 376 // 377 // All of the __kmp_affinity_create_*_map() routines should set *fullMask 378 // to the affinity mask for the initialization thread. They need to save and 379 // restore the mask, and it could be needed later, so saving it is just an 380 // optimization to avoid calling kmp_get_system_affinity() again. 381 // 382 static kmp_affin_mask_t *fullMask = NULL; 383 384 kmp_affin_mask_t * 385 __kmp_affinity_get_fullMask() { return fullMask; } 386 387 388 static int nCoresPerPkg, nPackages; 389 int __kmp_nThreadsPerCore; 390 391 // 392 // __kmp_affinity_uniform_topology() doesn't work when called from 393 // places which support arbitrarily many levels in the machine topology 394 // map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map() 395 // __kmp_affinity_create_x2apicid_map(). 396 // 397 inline static bool 398 __kmp_affinity_uniform_topology() 399 { 400 return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages); 401 } 402 403 404 // 405 // Print out the detailed machine topology map, i.e. the physical locations 406 // of each OS proc. 
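
//
// Worked example (illustrative numbers, not measured): with 2 packages,
// 4 cores per package, and 2 threads per core, the machine is uniform only
// if all 2*4*2 = 16 OS procs are available.  If the initial affinity mask
// excludes, say, one whole core, __kmp_avail_proc is 14 and the predicate
// above reports a non-uniform topology.
//
#if 0
static bool example_uniformity_check()
{
    int pkgs = 2, coresPerPkg = 4, threadsPerCore = 2;  // hypothetical
    int availProc = 16;                                 // nothing masked out
    return availProc == (threadsPerCore * coresPerPkg * pkgs);  // true
}
#endif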
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    __kmp_ht_enabled = FALSE;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}


# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (!
KMP_CPU_ISSET(i, fullMask)) { 583 continue; 584 } 585 586 Address addr(2); 587 addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR)); 588 addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR)); 589 (*address2os)[avail_ct++] = AddrUnsPair(addr,i); 590 591 if (__kmp_affinity_verbose) { 592 KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0], 593 addr.labels[1]); 594 } 595 } 596 597 if (__kmp_affinity_gran_levels < 0) { 598 if (__kmp_affinity_gran == affinity_gran_group) { 599 __kmp_affinity_gran_levels = 1; 600 } 601 else if ((__kmp_affinity_gran == affinity_gran_fine) 602 || (__kmp_affinity_gran == affinity_gran_thread)) { 603 __kmp_affinity_gran_levels = 0; 604 } 605 else { 606 const char *gran_str = NULL; 607 if (__kmp_affinity_gran == affinity_gran_core) { 608 gran_str = "core"; 609 } 610 else if (__kmp_affinity_gran == affinity_gran_package) { 611 gran_str = "package"; 612 } 613 else if (__kmp_affinity_gran == affinity_gran_node) { 614 gran_str = "node"; 615 } 616 else { 617 KMP_ASSERT(0); 618 } 619 620 // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread" 621 __kmp_affinity_gran_levels = 0; 622 } 623 } 624 return 2; 625 } 626 627 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ 628 629 630 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 631 632 static int 633 __kmp_cpuid_mask_width(int count) { 634 int r = 0; 635 636 while((1<<r) < count) 637 ++r; 638 return r; 639 } 640 641 642 class apicThreadInfo { 643 public: 644 unsigned osId; // param to __kmp_affinity_bind_thread 645 unsigned apicId; // from cpuid after binding 646 unsigned maxCoresPerPkg; // "" 647 unsigned maxThreadsPerPkg; // "" 648 unsigned pkgId; // inferred from above values 649 unsigned coreId; // "" 650 unsigned threadId; // "" 651 }; 652 653 654 static int 655 __kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b) 656 { 657 const apicThreadInfo *aa = (const apicThreadInfo *)a; 658 const apicThreadInfo *bb = (const apicThreadInfo *)b; 659 if (aa->osId < bb->osId) return -1; 660 if (aa->osId > bb->osId) return 1; 661 return 0; 662 } 663 664 665 static int 666 __kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b) 667 { 668 const apicThreadInfo *aa = (const apicThreadInfo *)a; 669 const apicThreadInfo *bb = (const apicThreadInfo *)b; 670 if (aa->pkgId < bb->pkgId) return -1; 671 if (aa->pkgId > bb->pkgId) return 1; 672 if (aa->coreId < bb->coreId) return -1; 673 if (aa->coreId > bb->coreId) return 1; 674 if (aa->threadId < bb->threadId) return -1; 675 if (aa->threadId > bb->threadId) return 1; 676 return 0; 677 } 678 679 680 // 681 // On IA-32 architecture and Intel(R) 64 architecture, we attempt to use 682 // an algorithm which cycles through the available os threads, setting 683 // the current thread's affinity mask to that thread, and then retrieves 684 // the Apic Id for each thread context using the cpuid instruction. 685 // 686 static int 687 __kmp_affinity_create_apicid_map(AddrUnsPair **address2os, 688 kmp_i18n_id_t *const msg_id) 689 { 690 int rc; 691 *address2os = NULL; 692 *msg_id = kmp_i18n_null; 693 694 # if KMP_MIC 695 { 696 // The code below will use cpuid(4). 697 // Check if cpuid(4) is supported. 698 // FIXME? - this really doesn't need to be specific to MIC. 
        kmp_cpuid buf;
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax < 4) {
            *msg_id = kmp_i18n_str_NoLeaf4Support;
            return -1;
        }
    }
# endif // KMP_MIC

    //
    // Even if __kmp_affinity_type == affinity_none, this routine is still
    // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are
    // not capable of calling
    // __kmp_get_system_affinity()/__kmp_set_system_affinity(),
    // then we need to do something else.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip).  On other OS/chip combinations
    //    supporting Intel(R) Hyper-Threading Technology, the value of this
    //    field will be 1 when Intel(R) Hyper-Threading Technology is
    //    disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg:  Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        if (!
(buf.edx >> 9) & 1) { 866 __kmp_set_system_affinity(oldMask, TRUE); 867 __kmp_free(threadInfo); 868 KMP_CPU_FREE(oldMask); 869 *msg_id = kmp_i18n_str_ApicNotPresent; 870 return -1; 871 } 872 threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff; 873 threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff; 874 if (threadInfo[nApics].maxThreadsPerPkg == 0) { 875 threadInfo[nApics].maxThreadsPerPkg = 1; 876 } 877 878 // 879 // Max cores per pkg comes from cpuid(4). 880 // 1 must be added to the encoded value. 881 // 882 // First, we need to check if cpuid(4) is supported on this chip. 883 // To see if cpuid(n) is supported, issue cpuid(0) and check if eax 884 // has the value n or greater. 885 // 886 __kmp_x86_cpuid(0, 0, &buf); 887 if (buf.eax >= 4) { 888 __kmp_x86_cpuid(4, 0, &buf); 889 threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1; 890 } 891 else { 892 threadInfo[nApics].maxCoresPerPkg = 1; 893 } 894 895 // 896 // Infer the pkgId / coreId / threadId using only the info 897 // obtained locally. 898 // 899 int widthCT = __kmp_cpuid_mask_width( 900 threadInfo[nApics].maxThreadsPerPkg); 901 threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT; 902 903 int widthC = __kmp_cpuid_mask_width( 904 threadInfo[nApics].maxCoresPerPkg); 905 int widthT = widthCT - widthC; 906 if (widthT < 0) { 907 // 908 // I've never seen this one happen, but I suppose it could, if 909 // the cpuid instruction on a chip was really screwed up. 910 // Make sure to restore the affinity mask before the tail call. 911 // 912 __kmp_set_system_affinity(oldMask, TRUE); 913 __kmp_free(threadInfo); 914 KMP_CPU_FREE(oldMask); 915 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 916 return -1; 917 } 918 919 int maskC = (1 << widthC) - 1; 920 threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT) 921 &maskC; 922 923 int maskT = (1 << widthT) - 1; 924 threadInfo[nApics].threadId = threadInfo[nApics].apicId &maskT; 925 926 nApics++; 927 } 928 929 // 930 // We've collected all the info we need. 931 // Restore the old affinity mask for this thread. 932 // 933 __kmp_set_system_affinity(oldMask, TRUE); 934 935 // 936 // If there's only one thread context to bind to, form an Address object 937 // with depth 1 and return immediately (or, if affinity is off, set 938 // address2os to NULL and return). 939 // 940 // If it is configured to omit the package level when there is only a 941 // single package, the logic at the end of this routine won't work if 942 // there is only a single thread - it would try to form an Address 943 // object with depth 0. 
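
//
// Worked example (illustrative, hypothetical values): how the width/mask
// arithmetic above splits a legacy 8-bit Apic Id.  Suppose cpuid(1) reports
// maxThreadsPerPkg = 8 and cpuid(4) reports maxCoresPerPkg = 4.  Then
// widthCT = 3, widthC = 2, widthT = 1, and an Apic Id of 0x1d (binary
// 0001 1101) decomposes as pkgId = 0x1d >> 3 = 3,
// coreId = (0x1d >> 1) & 0x3 = 2, threadId = 0x1d & 0x1 = 1.
//
#if 0
static void example_decode_legacy_apic_id()
{
    unsigned apicId = 0x1d;                      // hypothetical Apic Id
    int widthCT = __kmp_cpuid_mask_width(8);     // 3 bits for core# + thread#
    int widthC  = __kmp_cpuid_mask_width(4);     // 2 bits for core#
    int widthT  = widthCT - widthC;              // 1 bit for thread#
    unsigned pkgId    = apicId >> widthCT;                         // 3
    unsigned coreId   = (apicId >> widthT) & ((1 << widthC) - 1);  // 2
    unsigned threadId = apicId & ((1 << widthT) - 1);              // 1
    (void)pkgId; (void)coreId; (void)threadId;
}
#endif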
944 // 945 KMP_ASSERT(nApics > 0); 946 if (nApics == 1) { 947 __kmp_ncores = nPackages = 1; 948 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 949 __kmp_ht_enabled = FALSE; 950 if (__kmp_affinity_verbose) { 951 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 952 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 953 954 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 955 if (__kmp_affinity_respect_mask) { 956 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 957 } else { 958 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 959 } 960 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 961 KMP_INFORM(Uniform, "KMP_AFFINITY"); 962 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 963 __kmp_nThreadsPerCore, __kmp_ncores); 964 } 965 966 if (__kmp_affinity_type == affinity_none) { 967 __kmp_free(threadInfo); 968 KMP_CPU_FREE(oldMask); 969 return 0; 970 } 971 972 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 973 Address addr(1); 974 addr.labels[0] = threadInfo[0].pkgId; 975 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId); 976 977 if (__kmp_affinity_gran_levels < 0) { 978 __kmp_affinity_gran_levels = 0; 979 } 980 981 if (__kmp_affinity_verbose) { 982 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 983 } 984 985 __kmp_free(threadInfo); 986 KMP_CPU_FREE(oldMask); 987 return 1; 988 } 989 990 // 991 // Sort the threadInfo table by physical Id. 992 // 993 qsort(threadInfo, nApics, sizeof(*threadInfo), 994 __kmp_affinity_cmp_apicThreadInfo_phys_id); 995 996 // 997 // The table is now sorted by pkgId / coreId / threadId, but we really 998 // don't know the radix of any of the fields. pkgId's may be sparsely 999 // assigned among the chips on a system. Although coreId's are usually 1000 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 1001 // [0..threadsPerCore-1], we don't want to make any such assumptions. 1002 // 1003 // For that matter, we don't know what coresPerPkg and threadsPerCore 1004 // (or the total # packages) are at this point - we want to determine 1005 // that now. We only have an upper bound on the first two figures. 1006 // 1007 // We also perform a consistency check at this point: the values returned 1008 // by the cpuid instruction for any thread bound to a given package had 1009 // better return the same info for maxThreadsPerPkg and maxCoresPerPkg. 1010 // 1011 nPackages = 1; 1012 nCoresPerPkg = 1; 1013 __kmp_nThreadsPerCore = 1; 1014 unsigned nCores = 1; 1015 1016 unsigned pkgCt = 1; // to determine radii 1017 unsigned lastPkgId = threadInfo[0].pkgId; 1018 unsigned coreCt = 1; 1019 unsigned lastCoreId = threadInfo[0].coreId; 1020 unsigned threadCt = 1; 1021 unsigned lastThreadId = threadInfo[0].threadId; 1022 1023 // intra-pkg consist checks 1024 unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg; 1025 unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg; 1026 1027 for (i = 1; i < nApics; i++) { 1028 if (threadInfo[i].pkgId != lastPkgId) { 1029 nCores++; 1030 pkgCt++; 1031 lastPkgId = threadInfo[i].pkgId; 1032 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1033 coreCt = 1; 1034 lastCoreId = threadInfo[i].coreId; 1035 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1036 threadCt = 1; 1037 lastThreadId = threadInfo[i].threadId; 1038 1039 // 1040 // This is a different package, so go on to the next iteration 1041 // without doing any consistency checks. Reset the consistency 1042 // check vars, though. 
1043 // 1044 prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg; 1045 prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg; 1046 continue; 1047 } 1048 1049 if (threadInfo[i].coreId != lastCoreId) { 1050 nCores++; 1051 coreCt++; 1052 lastCoreId = threadInfo[i].coreId; 1053 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1054 threadCt = 1; 1055 lastThreadId = threadInfo[i].threadId; 1056 } 1057 else if (threadInfo[i].threadId != lastThreadId) { 1058 threadCt++; 1059 lastThreadId = threadInfo[i].threadId; 1060 } 1061 else { 1062 __kmp_free(threadInfo); 1063 KMP_CPU_FREE(oldMask); 1064 *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique; 1065 return -1; 1066 } 1067 1068 // 1069 // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg 1070 // fields agree between all the threads bounds to a given package. 1071 // 1072 if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg) 1073 || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) { 1074 __kmp_free(threadInfo); 1075 KMP_CPU_FREE(oldMask); 1076 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1077 return -1; 1078 } 1079 } 1080 nPackages = pkgCt; 1081 if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt; 1082 if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt; 1083 1084 // 1085 // When affinity is off, this routine will still be called to set 1086 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore, 1087 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1088 // correctly, and return now if affinity is not enabled. 1089 // 1090 __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1); 1091 __kmp_ncores = nCores; 1092 if (__kmp_affinity_verbose) { 1093 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1094 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1095 1096 KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY"); 1097 if (__kmp_affinity_respect_mask) { 1098 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1099 } else { 1100 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1101 } 1102 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1103 if (__kmp_affinity_uniform_topology()) { 1104 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1105 } else { 1106 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1107 } 1108 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1109 __kmp_nThreadsPerCore, __kmp_ncores); 1110 1111 } 1112 1113 if (__kmp_affinity_type == affinity_none) { 1114 __kmp_free(threadInfo); 1115 KMP_CPU_FREE(oldMask); 1116 return 0; 1117 } 1118 1119 // 1120 // Now that we've determined the number of packages, the number of cores 1121 // per package, and the number of threads per core, we can construct the 1122 // data structure that is to be returned. 1123 // 1124 int pkgLevel = 0; 1125 int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1; 1126 int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 
2 : 1); 1127 unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0); 1128 1129 KMP_ASSERT(depth > 0); 1130 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics); 1131 1132 for (i = 0; i < nApics; ++i) { 1133 Address addr(depth); 1134 unsigned os = threadInfo[i].osId; 1135 int d = 0; 1136 1137 if (pkgLevel >= 0) { 1138 addr.labels[d++] = threadInfo[i].pkgId; 1139 } 1140 if (coreLevel >= 0) { 1141 addr.labels[d++] = threadInfo[i].coreId; 1142 } 1143 if (threadLevel >= 0) { 1144 addr.labels[d++] = threadInfo[i].threadId; 1145 } 1146 (*address2os)[i] = AddrUnsPair(addr, os); 1147 } 1148 1149 if (__kmp_affinity_gran_levels < 0) { 1150 // 1151 // Set the granularity level based on what levels are modeled 1152 // in the machine topology map. 1153 // 1154 __kmp_affinity_gran_levels = 0; 1155 if ((threadLevel >= 0) 1156 && (__kmp_affinity_gran > affinity_gran_thread)) { 1157 __kmp_affinity_gran_levels++; 1158 } 1159 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1160 __kmp_affinity_gran_levels++; 1161 } 1162 if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) { 1163 __kmp_affinity_gran_levels++; 1164 } 1165 } 1166 1167 if (__kmp_affinity_verbose) { 1168 __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel, 1169 coreLevel, threadLevel); 1170 } 1171 1172 __kmp_free(threadInfo); 1173 KMP_CPU_FREE(oldMask); 1174 return depth; 1175 } 1176 1177 1178 // 1179 // Intel(R) microarchitecture code name Nehalem, Dunnington and later 1180 // architectures support a newer interface for specifying the x2APIC Ids, 1181 // based on cpuid leaf 11. 1182 // 1183 static int 1184 __kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os, 1185 kmp_i18n_id_t *const msg_id) 1186 { 1187 kmp_cpuid buf; 1188 1189 *address2os = NULL; 1190 *msg_id = kmp_i18n_null; 1191 1192 // 1193 // Check to see if cpuid leaf 11 is supported. 1194 // 1195 __kmp_x86_cpuid(0, 0, &buf); 1196 if (buf.eax < 11) { 1197 *msg_id = kmp_i18n_str_NoLeaf11Support; 1198 return -1; 1199 } 1200 __kmp_x86_cpuid(11, 0, &buf); 1201 if (buf.ebx == 0) { 1202 *msg_id = kmp_i18n_str_NoLeaf11Support; 1203 return -1; 1204 } 1205 1206 // 1207 // Find the number of levels in the machine topology. While we're at it, 1208 // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg. We will 1209 // try to get more accurate values later by explicitly counting them, 1210 // but get reasonable defaults now, in case we return early. 1211 // 1212 int level; 1213 int threadLevel = -1; 1214 int coreLevel = -1; 1215 int pkgLevel = -1; 1216 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 1217 1218 for (level = 0;; level++) { 1219 if (level > 31) { 1220 // 1221 // FIXME: Hack for DPD200163180 1222 // 1223 // If level is big then something went wrong -> exiting 1224 // 1225 // There could actually be 32 valid levels in the machine topology, 1226 // but so far, the only machine we have seen which does not exit 1227 // this loop before iteration 32 has fubar x2APIC settings. 1228 // 1229 // For now, just reject this case based upon loop trip count. 
1230 // 1231 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1232 return -1; 1233 } 1234 __kmp_x86_cpuid(11, level, &buf); 1235 if (buf.ebx == 0) { 1236 if (pkgLevel < 0) { 1237 // 1238 // Will infer nPackages from __kmp_xproc 1239 // 1240 pkgLevel = level; 1241 level++; 1242 } 1243 break; 1244 } 1245 int kind = (buf.ecx >> 8) & 0xff; 1246 if (kind == 1) { 1247 // 1248 // SMT level 1249 // 1250 threadLevel = level; 1251 coreLevel = -1; 1252 pkgLevel = -1; 1253 __kmp_nThreadsPerCore = buf.ebx & 0xff; 1254 if (__kmp_nThreadsPerCore == 0) { 1255 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1256 return -1; 1257 } 1258 } 1259 else if (kind == 2) { 1260 // 1261 // core level 1262 // 1263 coreLevel = level; 1264 pkgLevel = -1; 1265 nCoresPerPkg = buf.ebx & 0xff; 1266 if (nCoresPerPkg == 0) { 1267 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1268 return -1; 1269 } 1270 } 1271 else { 1272 if (level <= 0) { 1273 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1274 return -1; 1275 } 1276 if (pkgLevel >= 0) { 1277 continue; 1278 } 1279 pkgLevel = level; 1280 nPackages = buf.ebx & 0xff; 1281 if (nPackages == 0) { 1282 *msg_id = kmp_i18n_str_InvalidCpuidInfo; 1283 return -1; 1284 } 1285 } 1286 } 1287 int depth = level; 1288 1289 // 1290 // In the above loop, "level" was counted from the finest level (usually 1291 // thread) to the coarsest. The caller expects that we will place the 1292 // labels in (*address2os)[].first.labels[] in the inverse order, so 1293 // we need to invert the vars saying which level means what. 1294 // 1295 if (threadLevel >= 0) { 1296 threadLevel = depth - threadLevel - 1; 1297 } 1298 if (coreLevel >= 0) { 1299 coreLevel = depth - coreLevel - 1; 1300 } 1301 KMP_DEBUG_ASSERT(pkgLevel >= 0); 1302 pkgLevel = depth - pkgLevel - 1; 1303 1304 // 1305 // The algorithm used starts by setting the affinity to each available 1306 // thread and retrieving info from the cpuid instruction, so if we are not 1307 // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(), 1308 // then we need to do something else - use the defaults that we calculated 1309 // from issuing cpuid without binding to each proc. 1310 // 1311 if (! KMP_AFFINITY_CAPABLE()) 1312 { 1313 // 1314 // Hack to try and infer the machine topology using only the data 1315 // available from cpuid on the current thread, and __kmp_xproc. 1316 // 1317 KMP_ASSERT(__kmp_affinity_type == affinity_none); 1318 1319 __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore; 1320 nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg; 1321 __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1); 1322 if (__kmp_affinity_verbose) { 1323 KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY"); 1324 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1325 if (__kmp_affinity_uniform_topology()) { 1326 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1327 } else { 1328 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1329 } 1330 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1331 __kmp_nThreadsPerCore, __kmp_ncores); 1332 } 1333 return 0; 1334 } 1335 1336 // 1337 // 1338 // From here on, we can assume that it is safe to call 1339 // __kmp_get_system_affinity() and __kmp_set_system_affinity(), 1340 // even if __kmp_affinity_type = affinity_none. 1341 // 1342 1343 // 1344 // Save the affinity mask for the current thread. 1345 // 1346 kmp_affin_mask_t *oldMask; 1347 KMP_CPU_ALLOC(oldMask); 1348 __kmp_get_system_affinity(oldMask, TRUE); 1349 1350 // 1351 // Allocate the data structure to be returned. 
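
//
// Illustrative sketch (hedged; register fields as used by the loop above):
// dumping the cpuid leaf 11 sub-leaves.  On a hypothetical 2-socket machine
// with 8 cores per package and 2 SMT threads per core one would expect
// roughly: level 0 -> kind 1 (SMT), count 2, shift 1; level 1 -> kind 2
// (core), count 16, shift 5; level 2 -> ebx == 0, which ends the
// enumeration and leaves the package level to be inferred.
//
#if 0
static void example_dump_leaf11()
{
    kmp_cpuid buf;
    int level;
    for (level = 0; level < 32; level++) {
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            break;                               // enumeration finished
        }
        int kind  = (buf.ecx >> 8) & 0xff;       // 1 = SMT, 2 = core
        int count = buf.ebx & 0xff;              // logical procs at this level
        int shift = buf.eax & 0x1f;              // x2APIC id shift width
        __kmp_printf("leaf 11, level %d: kind %d count %d shift %d\n",
          level, kind, count, shift);
    }
}
#endif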
1352 // 1353 AddrUnsPair *retval = (AddrUnsPair *) 1354 __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc); 1355 1356 // 1357 // Run through each of the available contexts, binding the current thread 1358 // to it, and obtaining the pertinent information using the cpuid instr. 1359 // 1360 unsigned int proc; 1361 int nApics = 0; 1362 for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) { 1363 // 1364 // Skip this proc if it is not included in the machine model. 1365 // 1366 if (! KMP_CPU_ISSET(proc, fullMask)) { 1367 continue; 1368 } 1369 KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc); 1370 1371 __kmp_affinity_bind_thread(proc); 1372 1373 // 1374 // Extrach the labels for each level in the machine topology map 1375 // from the Apic ID. 1376 // 1377 Address addr(depth); 1378 int prev_shift = 0; 1379 1380 for (level = 0; level < depth; level++) { 1381 __kmp_x86_cpuid(11, level, &buf); 1382 unsigned apicId = buf.edx; 1383 if (buf.ebx == 0) { 1384 if (level != depth - 1) { 1385 KMP_CPU_FREE(oldMask); 1386 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1387 return -1; 1388 } 1389 addr.labels[depth - level - 1] = apicId >> prev_shift; 1390 level++; 1391 break; 1392 } 1393 int shift = buf.eax & 0x1f; 1394 int mask = (1 << shift) - 1; 1395 addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift; 1396 prev_shift = shift; 1397 } 1398 if (level != depth) { 1399 KMP_CPU_FREE(oldMask); 1400 *msg_id = kmp_i18n_str_InconsistentCpuidInfo; 1401 return -1; 1402 } 1403 1404 retval[nApics] = AddrUnsPair(addr, proc); 1405 nApics++; 1406 } 1407 1408 // 1409 // We've collected all the info we need. 1410 // Restore the old affinity mask for this thread. 1411 // 1412 __kmp_set_system_affinity(oldMask, TRUE); 1413 1414 // 1415 // If there's only one thread context to bind to, return now. 1416 // 1417 KMP_ASSERT(nApics > 0); 1418 if (nApics == 1) { 1419 __kmp_ncores = nPackages = 1; 1420 __kmp_nThreadsPerCore = nCoresPerPkg = 1; 1421 __kmp_ht_enabled = FALSE; 1422 if (__kmp_affinity_verbose) { 1423 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 1424 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1425 1426 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1427 if (__kmp_affinity_respect_mask) { 1428 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 1429 } else { 1430 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 1431 } 1432 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1433 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1434 KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg, 1435 __kmp_nThreadsPerCore, __kmp_ncores); 1436 } 1437 1438 if (__kmp_affinity_type == affinity_none) { 1439 __kmp_free(retval); 1440 KMP_CPU_FREE(oldMask); 1441 return 0; 1442 } 1443 1444 // 1445 // Form an Address object which only includes the package level. 1446 // 1447 Address addr(1); 1448 addr.labels[0] = retval[0].first.labels[pkgLevel]; 1449 retval[0].first = addr; 1450 1451 if (__kmp_affinity_gran_levels < 0) { 1452 __kmp_affinity_gran_levels = 0; 1453 } 1454 1455 if (__kmp_affinity_verbose) { 1456 __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1); 1457 } 1458 1459 *address2os = retval; 1460 KMP_CPU_FREE(oldMask); 1461 return 1; 1462 } 1463 1464 // 1465 // Sort the table by physical Id. 1466 // 1467 qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels); 1468 1469 // 1470 // Find the radix at each of the levels. 
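
//
// Worked example (illustrative machine) of the label extraction performed
// by the loop above: with an SMT shift of 1 and a core shift of 5, an
// x2APIC id of 0x47 (binary 0100 0111) yields thread label 0x47 & 0x1 = 1,
// core label (0x47 & 0x1f) >> 1 = 3, and package label 0x47 >> 5 = 2, which
// are stored coarsest-first as labels[] = { 2, 3, 1 }.
//
#if 0
static void example_split_x2apic_id(unsigned apicId)
{
    unsigned smtShift  = 1;                     // from cpuid(11, 0).eax & 0x1f
    unsigned coreShift = 5;                     // from cpuid(11, 1).eax & 0x1f
    unsigned threadLabel = apicId & ((1u << smtShift) - 1);
    unsigned coreLabel   = (apicId & ((1u << coreShift) - 1)) >> smtShift;
    unsigned pkgLabel    = apicId >> coreShift;
    (void)threadLabel; (void)coreLabel; (void)pkgLabel;
}
#endif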
1471 // 1472 unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1473 unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1474 unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1475 unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned)); 1476 for (level = 0; level < depth; level++) { 1477 totals[level] = 1; 1478 maxCt[level] = 1; 1479 counts[level] = 1; 1480 last[level] = retval[0].first.labels[level]; 1481 } 1482 1483 // 1484 // From here on, the iteration variable "level" runs from the finest 1485 // level to the coarsest, i.e. we iterate forward through 1486 // (*address2os)[].first.labels[] - in the previous loops, we iterated 1487 // backwards. 1488 // 1489 for (proc = 1; (int)proc < nApics; proc++) { 1490 int level; 1491 for (level = 0; level < depth; level++) { 1492 if (retval[proc].first.labels[level] != last[level]) { 1493 int j; 1494 for (j = level + 1; j < depth; j++) { 1495 totals[j]++; 1496 counts[j] = 1; 1497 // The line below causes printing incorrect topology information 1498 // in case the max value for some level (maxCt[level]) is encountered earlier than 1499 // some less value while going through the array. 1500 // For example, let pkg0 has 4 cores and pkg1 has 2 cores. Then maxCt[1] == 2 1501 // whereas it must be 4. 1502 // TODO!!! Check if it can be commented safely 1503 //maxCt[j] = 1; 1504 last[j] = retval[proc].first.labels[j]; 1505 } 1506 totals[level]++; 1507 counts[level]++; 1508 if (counts[level] > maxCt[level]) { 1509 maxCt[level] = counts[level]; 1510 } 1511 last[level] = retval[proc].first.labels[level]; 1512 break; 1513 } 1514 else if (level == depth - 1) { 1515 __kmp_free(last); 1516 __kmp_free(maxCt); 1517 __kmp_free(counts); 1518 __kmp_free(totals); 1519 __kmp_free(retval); 1520 KMP_CPU_FREE(oldMask); 1521 *msg_id = kmp_i18n_str_x2ApicIDsNotUnique; 1522 return -1; 1523 } 1524 } 1525 } 1526 1527 // 1528 // When affinity is off, this routine will still be called to set 1529 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore, 1530 // nCoresPerPkg, & nPackages. Make sure all these vars are set 1531 // correctly, and return if affinity is not enabled. 1532 // 1533 if (threadLevel >= 0) { 1534 __kmp_nThreadsPerCore = maxCt[threadLevel]; 1535 } 1536 else { 1537 __kmp_nThreadsPerCore = 1; 1538 } 1539 __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1); 1540 1541 nPackages = totals[pkgLevel]; 1542 1543 if (coreLevel >= 0) { 1544 __kmp_ncores = totals[coreLevel]; 1545 nCoresPerPkg = maxCt[coreLevel]; 1546 } 1547 else { 1548 __kmp_ncores = nPackages; 1549 nCoresPerPkg = 1; 1550 } 1551 1552 // 1553 // Check to see if the machine topology is uniform 1554 // 1555 unsigned prod = maxCt[0]; 1556 for (level = 1; level < depth; level++) { 1557 prod *= maxCt[level]; 1558 } 1559 bool uniform = (prod == totals[level - 1]); 1560 1561 // 1562 // Print the machine topology summary. 
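
//
// Worked example (illustrative numbers): the uniformity test just above
// multiplies the largest radix seen at each level and compares the product
// with the total leaf count.  With maxCt = { 2 packages, 4 cores/pkg,
// 2 threads/core } the product is 16; if totals[depth - 1] is also 16 the
// machine is uniform.  A machine with one 2-core package and one 4-core
// package (2 threads/core) gives totals[depth - 1] = 12 != 16 and is
// reported as non-uniform.
//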
1563 // 1564 if (__kmp_affinity_verbose) { 1565 char mask[KMP_AFFIN_MASK_PRINT_LEN]; 1566 __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask); 1567 1568 KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY"); 1569 if (__kmp_affinity_respect_mask) { 1570 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask); 1571 } else { 1572 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask); 1573 } 1574 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 1575 if (uniform) { 1576 KMP_INFORM(Uniform, "KMP_AFFINITY"); 1577 } else { 1578 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 1579 } 1580 1581 kmp_str_buf_t buf; 1582 __kmp_str_buf_init(&buf); 1583 1584 __kmp_str_buf_print(&buf, "%d", totals[0]); 1585 for (level = 1; level <= pkgLevel; level++) { 1586 __kmp_str_buf_print(&buf, " x %d", maxCt[level]); 1587 } 1588 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg, 1589 __kmp_nThreadsPerCore, __kmp_ncores); 1590 1591 __kmp_str_buf_free(&buf); 1592 } 1593 1594 if (__kmp_affinity_type == affinity_none) { 1595 __kmp_free(last); 1596 __kmp_free(maxCt); 1597 __kmp_free(counts); 1598 __kmp_free(totals); 1599 __kmp_free(retval); 1600 KMP_CPU_FREE(oldMask); 1601 return 0; 1602 } 1603 1604 // 1605 // Find any levels with radiix 1, and remove them from the map 1606 // (except for the package level). 1607 // 1608 int new_depth = 0; 1609 for (level = 0; level < depth; level++) { 1610 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1611 continue; 1612 } 1613 new_depth++; 1614 } 1615 1616 // 1617 // If we are removing any levels, allocate a new vector to return, 1618 // and copy the relevant information to it. 1619 // 1620 if (new_depth != depth) { 1621 AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate( 1622 sizeof(AddrUnsPair) * nApics); 1623 for (proc = 0; (int)proc < nApics; proc++) { 1624 Address addr(new_depth); 1625 new_retval[proc] = AddrUnsPair(addr, retval[proc].second); 1626 } 1627 int new_level = 0; 1628 for (level = 0; level < depth; level++) { 1629 if ((maxCt[level] == 1) && (level != pkgLevel)) { 1630 if (level == threadLevel) { 1631 threadLevel = -1; 1632 } 1633 else if ((threadLevel >= 0) && (level < threadLevel)) { 1634 threadLevel--; 1635 } 1636 if (level == coreLevel) { 1637 coreLevel = -1; 1638 } 1639 else if ((coreLevel >= 0) && (level < coreLevel)) { 1640 coreLevel--; 1641 } 1642 if (level < pkgLevel) { 1643 pkgLevel--; 1644 } 1645 continue; 1646 } 1647 for (proc = 0; (int)proc < nApics; proc++) { 1648 new_retval[proc].first.labels[new_level] 1649 = retval[proc].first.labels[level]; 1650 } 1651 new_level++; 1652 } 1653 1654 __kmp_free(retval); 1655 retval = new_retval; 1656 depth = new_depth; 1657 } 1658 1659 if (__kmp_affinity_gran_levels < 0) { 1660 // 1661 // Set the granularity level based on what levels are modeled 1662 // in the machine topology map. 
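
//
// Illustrative example (hedged): on a machine without SMT the thread level
// has maxCt == 1 and is squeezed out by the copy above, so a depth-3 label
// such as { pkg = 1, core = 3, thread = 0 } becomes the depth-2 label
// { 1, 3 }; threadLevel becomes -1 while pkgLevel/coreLevel keep positions
// 0/1.  The package level is always kept, even when there is only one
// package.
//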
1663 // 1664 __kmp_affinity_gran_levels = 0; 1665 if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) { 1666 __kmp_affinity_gran_levels++; 1667 } 1668 if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) { 1669 __kmp_affinity_gran_levels++; 1670 } 1671 if (__kmp_affinity_gran > affinity_gran_package) { 1672 __kmp_affinity_gran_levels++; 1673 } 1674 } 1675 1676 if (__kmp_affinity_verbose) { 1677 __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel, 1678 coreLevel, threadLevel); 1679 } 1680 1681 __kmp_free(last); 1682 __kmp_free(maxCt); 1683 __kmp_free(counts); 1684 __kmp_free(totals); 1685 KMP_CPU_FREE(oldMask); 1686 *address2os = retval; 1687 return depth; 1688 } 1689 1690 1691 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 1692 1693 1694 #define osIdIndex 0 1695 #define threadIdIndex 1 1696 #define coreIdIndex 2 1697 #define pkgIdIndex 3 1698 #define nodeIdIndex 4 1699 1700 typedef unsigned *ProcCpuInfo; 1701 static unsigned maxIndex = pkgIdIndex; 1702 1703 1704 static int 1705 __kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b) 1706 { 1707 const unsigned *aa = (const unsigned *)a; 1708 const unsigned *bb = (const unsigned *)b; 1709 if (aa[osIdIndex] < bb[osIdIndex]) return -1; 1710 if (aa[osIdIndex] > bb[osIdIndex]) return 1; 1711 return 0; 1712 }; 1713 1714 1715 static int 1716 __kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b) 1717 { 1718 unsigned i; 1719 const unsigned *aa = *((const unsigned **)a); 1720 const unsigned *bb = *((const unsigned **)b); 1721 for (i = maxIndex; ; i--) { 1722 if (aa[i] < bb[i]) return -1; 1723 if (aa[i] > bb[i]) return 1; 1724 if (i == osIdIndex) break; 1725 } 1726 return 0; 1727 } 1728 1729 1730 // 1731 // Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the 1732 // affinity map. 1733 // 1734 static int 1735 __kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line, 1736 kmp_i18n_id_t *const msg_id, FILE *f) 1737 { 1738 *address2os = NULL; 1739 *msg_id = kmp_i18n_null; 1740 1741 // 1742 // Scan of the file, and count the number of "processor" (osId) fields, 1743 // and find the highest value of <n> for a node_<n> field. 1744 // 1745 char buf[256]; 1746 unsigned num_records = 0; 1747 while (! feof(f)) { 1748 buf[sizeof(buf) - 1] = 1; 1749 if (! fgets(buf, sizeof(buf), f)) { 1750 // 1751 // Read errors presumably because of EOF 1752 // 1753 break; 1754 } 1755 1756 char s1[] = "processor"; 1757 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1758 num_records++; 1759 continue; 1760 } 1761 1762 // 1763 // FIXME - this will match "node_<n> <garbage>" 1764 // 1765 unsigned level; 1766 if (sscanf(buf, "node_%d id", &level) == 1) { 1767 if (nodeIdIndex + level >= maxIndex) { 1768 maxIndex = nodeIdIndex + level; 1769 } 1770 continue; 1771 } 1772 } 1773 1774 // 1775 // Check for empty file / no valid processor records, or too many. 1776 // The number of records can't exceed the number of valid bits in the 1777 // affinity mask. 1778 // 1779 if (num_records == 0) { 1780 *line = 0; 1781 *msg_id = kmp_i18n_str_NoProcRecords; 1782 return -1; 1783 } 1784 if (num_records > (unsigned)__kmp_xproc) { 1785 *line = 0; 1786 *msg_id = kmp_i18n_str_TooManyProcRecords; 1787 return -1; 1788 } 1789 1790 // 1791 // Set the file pointer back to the begginning, so that we can scan the 1792 // file again, this time performing a full parse of the data. 1793 // Allocate a vector of ProcCpuInfo object, where we will place the data. 
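
//
// Illustrative /proc/cpuinfo fragment (hedged; field spellings follow the
// parser below) showing the kind of record the first scan counts and the
// second pass decodes -- one record per logical processor, terminated by a
// blank line, with optional "node_<n> id" lines adding NUMA levels above
// the package:
//
//   processor       : 4
//   physical id     : 1
//   core id         : 2
//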
1794 // Adding an extra element at the end allows us to remove a lot of extra 1795 // checks for termination conditions. 1796 // 1797 if (fseek(f, 0, SEEK_SET) != 0) { 1798 *line = 0; 1799 *msg_id = kmp_i18n_str_CantRewindCpuinfo; 1800 return -1; 1801 } 1802 1803 // 1804 // Allocate the array of records to store the proc info in. The dummy 1805 // element at the end makes the logic in filling them out easier to code. 1806 // 1807 unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1) 1808 * sizeof(unsigned *)); 1809 unsigned i; 1810 for (i = 0; i <= num_records; i++) { 1811 threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1) 1812 * sizeof(unsigned)); 1813 } 1814 1815 #define CLEANUP_THREAD_INFO \ 1816 for (i = 0; i <= num_records; i++) { \ 1817 __kmp_free(threadInfo[i]); \ 1818 } \ 1819 __kmp_free(threadInfo); 1820 1821 // 1822 // A value of UINT_MAX means that we didn't find the field 1823 // 1824 unsigned __index; 1825 1826 #define INIT_PROC_INFO(p) \ 1827 for (__index = 0; __index <= maxIndex; __index++) { \ 1828 (p)[__index] = UINT_MAX; \ 1829 } 1830 1831 for (i = 0; i <= num_records; i++) { 1832 INIT_PROC_INFO(threadInfo[i]); 1833 } 1834 1835 unsigned num_avail = 0; 1836 *line = 0; 1837 while (! feof(f)) { 1838 // 1839 // Create an inner scoping level, so that all the goto targets at the 1840 // end of the loop appear in an outer scoping level. This avoids 1841 // warnings about jumping past an initialization to a target in the 1842 // same block. 1843 // 1844 { 1845 buf[sizeof(buf) - 1] = 1; 1846 bool long_line = false; 1847 if (! fgets(buf, sizeof(buf), f)) { 1848 // 1849 // Read errors presumably because of EOF 1850 // 1851 // If there is valid data in threadInfo[num_avail], then fake 1852 // a blank line in ensure that the last address gets parsed. 1853 // 1854 bool valid = false; 1855 for (i = 0; i <= maxIndex; i++) { 1856 if (threadInfo[num_avail][i] != UINT_MAX) { 1857 valid = true; 1858 } 1859 } 1860 if (! valid) { 1861 break; 1862 } 1863 buf[0] = 0; 1864 } else if (!buf[sizeof(buf) - 1]) { 1865 // 1866 // The line is longer than the buffer. Set a flag and don't 1867 // emit an error if we were going to ignore the line, anyway. 
1868 // 1869 long_line = true; 1870 1871 #define CHECK_LINE \ 1872 if (long_line) { \ 1873 CLEANUP_THREAD_INFO; \ 1874 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1875 return -1; \ 1876 } 1877 } 1878 (*line)++; 1879 1880 char s1[] = "processor"; 1881 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1882 CHECK_LINE; 1883 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1884 unsigned val; 1885 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1886 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 1887 threadInfo[num_avail][osIdIndex] = val; 1888 #if KMP_OS_LINUX && USE_SYSFS_INFO 1889 char path[256]; 1890 snprintf(path, sizeof(path), 1891 "/sys/devices/system/cpu/cpu%u/topology/physical_package_id", 1892 threadInfo[num_avail][osIdIndex]); 1893 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][pkgIdIndex]); 1894 1895 snprintf(path, sizeof(path), 1896 "/sys/devices/system/cpu/cpu%u/topology/core_id", 1897 threadInfo[num_avail][osIdIndex]); 1898 __kmp_read_from_file(path, "%u", &threadInfo[num_avail][coreIdIndex]); 1899 continue; 1900 #else 1901 } 1902 char s2[] = "physical id"; 1903 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1904 CHECK_LINE; 1905 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1906 unsigned val; 1907 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1908 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 1909 threadInfo[num_avail][pkgIdIndex] = val; 1910 continue; 1911 } 1912 char s3[] = "core id"; 1913 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 1914 CHECK_LINE; 1915 char *p = strchr(buf + sizeof(s3) - 1, ':'); 1916 unsigned val; 1917 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1918 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 1919 threadInfo[num_avail][coreIdIndex] = val; 1920 continue; 1921 #endif // KMP_OS_LINUX && USE_SYSFS_INFO 1922 } 1923 char s4[] = "thread id"; 1924 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 1925 CHECK_LINE; 1926 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1927 unsigned val; 1928 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1929 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 1930 threadInfo[num_avail][threadIdIndex] = val; 1931 continue; 1932 } 1933 unsigned level; 1934 if (sscanf(buf, "node_%d id", &level) == 1) { 1935 CHECK_LINE; 1936 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1937 unsigned val; 1938 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1939 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 1940 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 1941 threadInfo[num_avail][nodeIdIndex + level] = val; 1942 continue; 1943 } 1944 1945 // 1946 // We didn't recognize the leading token on the line. 1947 // There are lots of leading tokens that we don't recognize - 1948 // if the line isn't empty, go on to the next line. 1949 // 1950 if ((*buf != 0) && (*buf != '\n')) { 1951 // 1952 // If the line is longer than the buffer, read characters 1953 // until we find a newline. 1954 // 1955 if (long_line) { 1956 int ch; 1957 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 1958 } 1959 continue; 1960 } 1961 1962 // 1963 // A newline has signalled the end of the processor record. 1964 // Check that there aren't too many procs specified. 1965 // 1966 if (num_avail == __kmp_xproc) { 1967 CLEANUP_THREAD_INFO; 1968 *msg_id = kmp_i18n_str_TooManyEntries; 1969 return -1; 1970 } 1971 1972 // 1973 // Check for missing fields. 
The osId field must be there, and we 1974 // currently require that the physical id field is specified, also. 1975 // 1976 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 1977 CLEANUP_THREAD_INFO; 1978 *msg_id = kmp_i18n_str_MissingProcField; 1979 return -1; 1980 } 1981 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 1982 CLEANUP_THREAD_INFO; 1983 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 1984 return -1; 1985 } 1986 1987 // 1988 // Skip this proc if it is not included in the machine model. 1989 // 1990 if (! KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 1991 INIT_PROC_INFO(threadInfo[num_avail]); 1992 continue; 1993 } 1994 1995 // 1996 // We have a successful parse of this proc's info. 1997 // Increment the counter, and prepare for the next proc. 1998 // 1999 num_avail++; 2000 KMP_ASSERT(num_avail <= num_records); 2001 INIT_PROC_INFO(threadInfo[num_avail]); 2002 } 2003 continue; 2004 2005 no_val: 2006 CLEANUP_THREAD_INFO; 2007 *msg_id = kmp_i18n_str_MissingValCpuinfo; 2008 return -1; 2009 2010 dup_field: 2011 CLEANUP_THREAD_INFO; 2012 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2013 return -1; 2014 } 2015 *line = 0; 2016 2017 # if KMP_MIC && REDUCE_TEAM_SIZE 2018 unsigned teamSize = 0; 2019 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2020 2021 // check for num_records == __kmp_xproc ??? 2022 2023 // 2024 // If there's only one thread context to bind to, form an Address object 2025 // with depth 1 and return immediately (or, if affinity is off, set 2026 // address2os to NULL and return). 2027 // 2028 // If it is configured to omit the package level when there is only a 2029 // single package, the logic at the end of this routine won't work if 2030 // there is only a single thread - it would try to form an Address 2031 // object with depth 0. 2032 // 2033 KMP_ASSERT(num_avail > 0); 2034 KMP_ASSERT(num_avail <= num_records); 2035 if (num_avail == 1) { 2036 __kmp_ncores = 1; 2037 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2038 __kmp_ht_enabled = FALSE; 2039 if (__kmp_affinity_verbose) { 2040 if (! 
KMP_AFFINITY_CAPABLE()) { 2041 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2042 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2043 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2044 } 2045 else { 2046 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2047 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2048 fullMask); 2049 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2050 if (__kmp_affinity_respect_mask) { 2051 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2052 } else { 2053 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2054 } 2055 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2056 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2057 } 2058 int index; 2059 kmp_str_buf_t buf; 2060 __kmp_str_buf_init(&buf); 2061 __kmp_str_buf_print(&buf, "1"); 2062 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2063 __kmp_str_buf_print(&buf, " x 1"); 2064 } 2065 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2066 __kmp_str_buf_free(&buf); 2067 } 2068 2069 if (__kmp_affinity_type == affinity_none) { 2070 CLEANUP_THREAD_INFO; 2071 return 0; 2072 } 2073 2074 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2075 Address addr(1); 2076 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2077 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2078 2079 if (__kmp_affinity_gran_levels < 0) { 2080 __kmp_affinity_gran_levels = 0; 2081 } 2082 2083 if (__kmp_affinity_verbose) { 2084 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2085 } 2086 2087 CLEANUP_THREAD_INFO; 2088 return 1; 2089 } 2090 2091 // 2092 // Sort the threadInfo table by physical Id. 2093 // 2094 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2095 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2096 2097 // 2098 // The table is now sorted by pkgId / coreId / threadId, but we really 2099 // don't know the radix of any of the fields. pkgId's may be sparsely 2100 // assigned among the chips on a system. Although coreId's are usually 2101 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2102 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2103 // 2104 // For that matter, we don't know what coresPerPkg and threadsPerCore 2105 // (or the total # packages) are at this point - we want to determine 2106 // that now. We only have an upper bound on the first two figures. 2107 // 2108 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2109 * sizeof(unsigned)); 2110 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2111 * sizeof(unsigned)); 2112 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2113 * sizeof(unsigned)); 2114 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2115 * sizeof(unsigned)); 2116 2117 bool assign_thread_ids = false; 2118 unsigned threadIdCt; 2119 unsigned index; 2120 2121 restart_radix_check: 2122 threadIdCt = 0; 2123 2124 // 2125 // Initialize the counter arrays with data from threadInfo[0]. 2126 // 2127 if (assign_thread_ids) { 2128 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2129 threadInfo[0][threadIdIndex] = threadIdCt++; 2130 } 2131 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2132 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2133 } 2134 } 2135 for (index = 0; index <= maxIndex; index++) { 2136 counts[index] = 1; 2137 maxCt[index] = 1; 2138 totals[index] = 1; 2139 lastId[index] = threadInfo[0][index];; 2140 } 2141 2142 // 2143 // Run through the rest of the OS procs. 
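// Illustrative example (hypothetical machine, ignoring any node_<n> levels): with 2 packages x 2 cores x 2 threads, the sorted table holds the (pkgId, coreId, threadId) triples 000, 001, 010, 011, 100, 101, 110, 111. Once this pass (plus the final maxCt update below) completes we would expect totals[pkgIdIndex] == 2, maxCt[coreIdIndex] == 2, and maxCt[threadIdIndex] == 2, i.e. nPackages = 2, nCoresPerPkg = 2, __kmp_nThreadsPerCore = 2, and __kmp_ncores = totals[coreIdIndex] = 4.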
2144 // 2145 for (i = 1; i < num_avail; i++) { 2146 // 2147 // Find the most significant index whose id differs 2148 // from the id for the previous OS proc. 2149 // 2150 for (index = maxIndex; index >= threadIdIndex; index--) { 2151 if (assign_thread_ids && (index == threadIdIndex)) { 2152 // 2153 // Auto-assign the thread id field if it wasn't specified. 2154 // 2155 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2156 threadInfo[i][threadIdIndex] = threadIdCt++; 2157 } 2158 2159 // 2160 // Apparently the thread id field was specified for some 2161 // entries and not others. Start the thread id counter 2162 // off at the next higher thread id. 2163 // 2164 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2165 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2166 } 2167 } 2168 if (threadInfo[i][index] != lastId[index]) { 2169 // 2170 // Run through all indices which are less significant, 2171 // and reset the counts to 1. 2172 // 2173 // At all levels up to and including index, we need to 2174 // increment the totals and record the last id. 2175 // 2176 unsigned index2; 2177 for (index2 = threadIdIndex; index2 < index; index2++) { 2178 totals[index2]++; 2179 if (counts[index2] > maxCt[index2]) { 2180 maxCt[index2] = counts[index2]; 2181 } 2182 counts[index2] = 1; 2183 lastId[index2] = threadInfo[i][index2]; 2184 } 2185 counts[index]++; 2186 totals[index]++; 2187 lastId[index] = threadInfo[i][index]; 2188 2189 if (assign_thread_ids && (index > threadIdIndex)) { 2190 2191 # if KMP_MIC && REDUCE_TEAM_SIZE 2192 // 2193 // The default team size is the total #threads in the machine 2194 // minus 1 thread for every core that has 3 or more threads. 2195 // 2196 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2197 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2198 2199 // 2200 // Restart the thread counter, as we are on a new core. 2201 // 2202 threadIdCt = 0; 2203 2204 // 2205 // Auto-assign the thread id field if it wasn't specified. 2206 // 2207 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2208 threadInfo[i][threadIdIndex] = threadIdCt++; 2209 } 2210 2211 // 2212 // Apparently the thread id field was specified for some 2213 // entries and not others. Start the thread id counter 2214 // off at the next higher thread id. 2215 // 2216 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2217 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2218 } 2219 } 2220 break; 2221 } 2222 } 2223 if (index < threadIdIndex) { 2224 // 2225 // If thread ids were specified, it is an error if they are not 2226 // unique. Also, check that we haven't already restarted the 2227 // loop (to be safe - shouldn't need to). 2228 // 2229 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2230 || assign_thread_ids) { 2231 __kmp_free(lastId); 2232 __kmp_free(totals); 2233 __kmp_free(maxCt); 2234 __kmp_free(counts); 2235 CLEANUP_THREAD_INFO; 2236 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2237 return -1; 2238 } 2239 2240 // 2241 // If the thread ids were not specified and we see entries 2242 // that are duplicates, start the loop over and 2243 // assign the thread ids manually. 2244 // 2245 assign_thread_ids = true; 2246 goto restart_radix_check; 2247 } 2248 } 2249 2250 # if KMP_MIC && REDUCE_TEAM_SIZE 2251 // 2252 // The default team size is the total #threads in the machine 2253 // minus 1 thread for every core that has 3 or more threads. 2254 // 2255 teamSize += ( threadIdCt <= 2 ) ?
( threadIdCt ) : ( threadIdCt - 1 ); 2256 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2257 2258 for (index = threadIdIndex; index <= maxIndex; index++) { 2259 if (counts[index] > maxCt[index]) { 2260 maxCt[index] = counts[index]; 2261 } 2262 } 2263 2264 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2265 nCoresPerPkg = maxCt[coreIdIndex]; 2266 nPackages = totals[pkgIdIndex]; 2267 2268 // 2269 // Check to see if the machine topology is uniform 2270 // 2271 unsigned prod = totals[maxIndex]; 2272 for (index = threadIdIndex; index < maxIndex; index++) { 2273 prod *= maxCt[index]; 2274 } 2275 bool uniform = (prod == totals[threadIdIndex]); 2276 2277 // 2278 // When affinity is off, this routine will still be called to set 2279 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore, 2280 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2281 // correctly, and return now if affinity is not enabled. 2282 // 2283 __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1 2284 __kmp_ncores = totals[coreIdIndex]; 2285 2286 if (__kmp_affinity_verbose) { 2287 if (! KMP_AFFINITY_CAPABLE()) { 2288 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2289 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2290 if (uniform) { 2291 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2292 } else { 2293 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2294 } 2295 } 2296 else { 2297 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2298 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2299 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2300 if (__kmp_affinity_respect_mask) { 2301 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2302 } else { 2303 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2304 } 2305 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2306 if (uniform) { 2307 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2308 } else { 2309 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2310 } 2311 } 2312 kmp_str_buf_t buf; 2313 __kmp_str_buf_init(&buf); 2314 2315 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2316 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2317 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2318 } 2319 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2320 maxCt[threadIdIndex], __kmp_ncores); 2321 2322 __kmp_str_buf_free(&buf); 2323 } 2324 2325 # if KMP_MIC && REDUCE_TEAM_SIZE 2326 // 2327 // Set the default team size. 2328 // 2329 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2330 __kmp_dflt_team_nth = teamSize; 2331 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2332 __kmp_dflt_team_nth)); 2333 } 2334 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2335 2336 if (__kmp_affinity_type == affinity_none) { 2337 __kmp_free(lastId); 2338 __kmp_free(totals); 2339 __kmp_free(maxCt); 2340 __kmp_free(counts); 2341 CLEANUP_THREAD_INFO; 2342 return 0; 2343 } 2344 2345 // 2346 // Count the number of levels which have more nodes at that level than 2347 // at the parent's level (with there being an implicit root node of 2348 // the top level). This is equivalent to saying that there is at least 2349 // one node at this level which has a sibling. These levels are in the 2350 // map, and the package level is always in the map. 
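// For example (hypothetical, with maxIndex == pkgIdIndex): a single package with 4 cores and 2 threads per core gives totals of 8 / 4 / 1 at the thread / core / package levels, so the thread and core levels each have siblings and land in the map, the package level is forced in, and depth == 3. With only 1 thread per core the thread level drops out and depth == 2.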
2351 // 2352 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2353 int level = 0; 2354 for (index = threadIdIndex; index < maxIndex; index++) { 2355 KMP_ASSERT(totals[index] >= totals[index + 1]); 2356 inMap[index] = (totals[index] > totals[index + 1]); 2357 } 2358 inMap[maxIndex] = (totals[maxIndex] > 1); 2359 inMap[pkgIdIndex] = true; 2360 2361 int depth = 0; 2362 for (index = threadIdIndex; index <= maxIndex; index++) { 2363 if (inMap[index]) { 2364 depth++; 2365 } 2366 } 2367 KMP_ASSERT(depth > 0); 2368 2369 // 2370 // Construct the data structure that is to be returned. 2371 // 2372 *address2os = (AddrUnsPair*) 2373 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2374 int pkgLevel = -1; 2375 int coreLevel = -1; 2376 int threadLevel = -1; 2377 2378 for (i = 0; i < num_avail; ++i) { 2379 Address addr(depth); 2380 unsigned os = threadInfo[i][osIdIndex]; 2381 int src_index; 2382 int dst_index = 0; 2383 2384 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2385 if (! inMap[src_index]) { 2386 continue; 2387 } 2388 addr.labels[dst_index] = threadInfo[i][src_index]; 2389 if (src_index == pkgIdIndex) { 2390 pkgLevel = dst_index; 2391 } 2392 else if (src_index == coreIdIndex) { 2393 coreLevel = dst_index; 2394 } 2395 else if (src_index == threadIdIndex) { 2396 threadLevel = dst_index; 2397 } 2398 dst_index++; 2399 } 2400 (*address2os)[i] = AddrUnsPair(addr, os); 2401 } 2402 2403 if (__kmp_affinity_gran_levels < 0) { 2404 // 2405 // Set the granularity level based on what levels are modeled 2406 // in the machine topology map. 2407 // 2408 unsigned src_index; 2409 __kmp_affinity_gran_levels = 0; 2410 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2411 if (! inMap[src_index]) { 2412 continue; 2413 } 2414 switch (src_index) { 2415 case threadIdIndex: 2416 if (__kmp_affinity_gran > affinity_gran_thread) { 2417 __kmp_affinity_gran_levels++; 2418 } 2419 2420 break; 2421 case coreIdIndex: 2422 if (__kmp_affinity_gran > affinity_gran_core) { 2423 __kmp_affinity_gran_levels++; 2424 } 2425 break; 2426 2427 case pkgIdIndex: 2428 if (__kmp_affinity_gran > affinity_gran_package) { 2429 __kmp_affinity_gran_levels++; 2430 } 2431 break; 2432 } 2433 } 2434 } 2435 2436 if (__kmp_affinity_verbose) { 2437 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2438 coreLevel, threadLevel); 2439 } 2440 2441 __kmp_free(inMap); 2442 __kmp_free(lastId); 2443 __kmp_free(totals); 2444 __kmp_free(maxCt); 2445 __kmp_free(counts); 2446 CLEANUP_THREAD_INFO; 2447 return depth; 2448 } 2449 2450 2451 // 2452 // Create and return a table of affinity masks, indexed by OS thread ID. 2453 // This routine handles OR'ing together all the affinity masks of threads 2454 // that are sufficiently close, if granularity > fine. 2455 // 2456 static kmp_affin_mask_t * 2457 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2458 AddrUnsPair *address2os, unsigned numAddrs) 2459 { 2460 // 2461 // First form a table of affinity masks in order of OS thread id. 2462 // 2463 unsigned depth; 2464 unsigned maxOsId; 2465 unsigned i; 2466 2467 KMP_ASSERT(numAddrs > 0); 2468 depth = address2os[0].first.depth; 2469 2470 maxOsId = 0; 2471 for (i = 0; i < numAddrs; i++) { 2472 unsigned osId = address2os[i].second; 2473 if (osId > maxOsId) { 2474 maxOsId = osId; 2475 } 2476 } 2477 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2478 (maxOsId + 1) * __kmp_affin_mask_size); 2479 2480 // 2481 // Sort the address2os table according to physical order. 
Doing so 2482 // will put all threads on the same core/package/node in consecutive 2483 // locations. 2484 // 2485 qsort(address2os, numAddrs, sizeof(*address2os), 2486 __kmp_affinity_cmp_Address_labels); 2487 2488 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2489 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2490 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2491 } 2492 if (__kmp_affinity_gran_levels >= (int)depth) { 2493 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2494 && (__kmp_affinity_type != affinity_none))) { 2495 KMP_WARNING(AffThreadsMayMigrate); 2496 } 2497 } 2498 2499 // 2500 // Run through the table, forming the masks for all threads on each 2501 // core. Threads on the same core will have identical "Address" 2502 // objects, not considering the last level, which must be the thread 2503 // id. All threads on a core will appear consecutively. 2504 // 2505 unsigned unique = 0; 2506 unsigned j = 0; // index of 1st thread on core 2507 unsigned leader = 0; 2508 Address *leaderAddr = &(address2os[0].first); 2509 kmp_affin_mask_t *sum 2510 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size); 2511 KMP_CPU_ZERO(sum); 2512 KMP_CPU_SET(address2os[0].second, sum); 2513 for (i = 1; i < numAddrs; i++) { 2514 // 2515 // If this thread is sufficiently close to the leader (within the 2516 // granularity setting), then set the bit for this os thread in the 2517 // affinity mask for this group, and go on to the next thread. 2518 // 2519 if (leaderAddr->isClose(address2os[i].first, 2520 __kmp_affinity_gran_levels)) { 2521 KMP_CPU_SET(address2os[i].second, sum); 2522 continue; 2523 } 2524 2525 // 2526 // For every thread in this group, copy the mask to the thread's 2527 // entry in the osId2Mask table. Mark the first address as a 2528 // leader. 2529 // 2530 for (; j < i; j++) { 2531 unsigned osId = address2os[j].second; 2532 KMP_DEBUG_ASSERT(osId <= maxOsId); 2533 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2534 KMP_CPU_COPY(mask, sum); 2535 address2os[j].first.leader = (j == leader); 2536 } 2537 unique++; 2538 2539 // 2540 // Start a new mask. 2541 // 2542 leader = i; 2543 leaderAddr = &(address2os[i].first); 2544 KMP_CPU_ZERO(sum); 2545 KMP_CPU_SET(address2os[i].second, sum); 2546 } 2547 2548 // 2549 // For every thread in last group, copy the mask to the thread's 2550 // entry in the osId2Mask table. 2551 // 2552 for (; j < i; j++) { 2553 unsigned osId = address2os[j].second; 2554 KMP_DEBUG_ASSERT(osId <= maxOsId); 2555 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2556 KMP_CPU_COPY(mask, sum); 2557 address2os[j].first.leader = (j == leader); 2558 } 2559 unique++; 2560 2561 *maxIndex = maxOsId; 2562 *numUnique = unique; 2563 return osId2Mask; 2564 } 2565 2566 2567 // 2568 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2569 // as file-static than to try and pass them through the calling sequence of 2570 // the recursive-descent OMP_PLACES parser. 2571 // 2572 static kmp_affin_mask_t *newMasks; 2573 static int numNewMasks; 2574 static int nextNewMask; 2575 2576 #define ADD_MASK(_mask) \ 2577 { \ 2578 if (nextNewMask >= numNewMasks) { \ 2579 numNewMasks *= 2; \ 2580 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2581 numNewMasks * __kmp_affin_mask_size); \ 2582 } \ 2583 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2584 nextNewMask++; \ 2585 } 2586 2587 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2588 { \ 2589 if (((_osId) > _maxOsId) || \ 2590 (! 
KMP_CPU_ISSET((_osId), KMP_CPU_INDEX(_osId2Mask, (_osId))))) {\ 2591 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2592 && (__kmp_affinity_type != affinity_none))) { \ 2593 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2594 } \ 2595 } \ 2596 else { \ 2597 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2598 } \ 2599 } 2600 2601 2602 // 2603 // Re-parse the proclist (for the explicit affinity type), and form the list 2604 // of affinity newMasks indexed by gtid. 2605 // 2606 static void 2607 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2608 unsigned int *out_numMasks, const char *proclist, 2609 kmp_affin_mask_t *osId2Mask, int maxOsId) 2610 { 2611 const char *scan = proclist; 2612 const char *next = proclist; 2613 2614 // 2615 // We use malloc() for the temporary mask vector, 2616 // so that we can use realloc() to extend it. 2617 // 2618 numNewMasks = 2; 2619 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2620 * __kmp_affin_mask_size); 2621 nextNewMask = 0; 2622 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2623 __kmp_affin_mask_size); 2624 int setSize = 0; 2625 2626 for (;;) { 2627 int start, end, stride; 2628 2629 SKIP_WS(scan); 2630 next = scan; 2631 if (*next == '\0') { 2632 break; 2633 } 2634 2635 if (*next == '{') { 2636 int num; 2637 setSize = 0; 2638 next++; // skip '{' 2639 SKIP_WS(next); 2640 scan = next; 2641 2642 // 2643 // Read the first integer in the set. 2644 // 2645 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2646 "bad proclist"); 2647 SKIP_DIGITS(next); 2648 num = __kmp_str_to_int(scan, *next); 2649 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2650 2651 // 2652 // Copy the mask for that osId to the sum (union) mask. 2653 // 2654 if ((num > maxOsId) || 2655 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2656 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2657 && (__kmp_affinity_type != affinity_none))) { 2658 KMP_WARNING(AffIgnoreInvalidProcID, num); 2659 } 2660 KMP_CPU_ZERO(sumMask); 2661 } 2662 else { 2663 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2664 setSize = 1; 2665 } 2666 2667 for (;;) { 2668 // 2669 // Check for end of set. 2670 // 2671 SKIP_WS(next); 2672 if (*next == '}') { 2673 next++; // skip '}' 2674 break; 2675 } 2676 2677 // 2678 // Skip optional comma. 2679 // 2680 if (*next == ',') { 2681 next++; 2682 } 2683 SKIP_WS(next); 2684 2685 // 2686 // Read the next integer in the set. 2687 // 2688 scan = next; 2689 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2690 "bad explicit proc list"); 2691 2692 SKIP_DIGITS(next); 2693 num = __kmp_str_to_int(scan, *next); 2694 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2695 2696 // 2697 // Add the mask for that osId to the sum mask. 2698 // 2699 if ((num > maxOsId) || 2700 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2701 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2702 && (__kmp_affinity_type != affinity_none))) { 2703 KMP_WARNING(AffIgnoreInvalidProcID, num); 2704 } 2705 } 2706 else { 2707 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2708 setSize++; 2709 } 2710 } 2711 if (setSize > 0) { 2712 ADD_MASK(sumMask); 2713 } 2714 2715 SKIP_WS(next); 2716 if (*next == ',') { 2717 next++; 2718 } 2719 scan = next; 2720 continue; 2721 } 2722 2723 // 2724 // Read the first integer. 
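// For illustration: in a proclist such as "0,2-6:2,{8,9}" the braced set is handled by the branch above (one mask containing both 8 and 9), while the plain token "0" and the range "2-6:2" fall through to the code below and yield the masks {0}, {2}, {4} and {6} -- assuming all of those OS proc ids are present in the machine model.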
2725 // 2726 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2727 SKIP_DIGITS(next); 2728 start = __kmp_str_to_int(scan, *next); 2729 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2730 SKIP_WS(next); 2731 2732 // 2733 // If this isn't a range, then add a mask to the list and go on. 2734 // 2735 if (*next != '-') { 2736 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2737 2738 // 2739 // Skip optional comma. 2740 // 2741 if (*next == ',') { 2742 next++; 2743 } 2744 scan = next; 2745 continue; 2746 } 2747 2748 // 2749 // This is a range. Skip over the '-' and read in the 2nd int. 2750 // 2751 next++; // skip '-' 2752 SKIP_WS(next); 2753 scan = next; 2754 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2755 SKIP_DIGITS(next); 2756 end = __kmp_str_to_int(scan, *next); 2757 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2758 2759 // 2760 // Check for a stride parameter 2761 // 2762 stride = 1; 2763 SKIP_WS(next); 2764 if (*next == ':') { 2765 // 2766 // A stride is specified. Skip over the ':" and read the 3rd int. 2767 // 2768 int sign = +1; 2769 next++; // skip ':' 2770 SKIP_WS(next); 2771 scan = next; 2772 if (*next == '-') { 2773 sign = -1; 2774 next++; 2775 SKIP_WS(next); 2776 scan = next; 2777 } 2778 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2779 "bad explicit proc list"); 2780 SKIP_DIGITS(next); 2781 stride = __kmp_str_to_int(scan, *next); 2782 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2783 stride *= sign; 2784 } 2785 2786 // 2787 // Do some range checks. 2788 // 2789 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2790 if (stride > 0) { 2791 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2792 } 2793 else { 2794 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2795 } 2796 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2797 2798 // 2799 // Add the mask for each OS proc # to the list. 2800 // 2801 if (stride > 0) { 2802 do { 2803 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2804 start += stride; 2805 } while (start <= end); 2806 } 2807 else { 2808 do { 2809 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2810 start += stride; 2811 } while (start >= end); 2812 } 2813 2814 // 2815 // Skip optional comma. 2816 // 2817 SKIP_WS(next); 2818 if (*next == ',') { 2819 next++; 2820 } 2821 scan = next; 2822 } 2823 2824 *out_numMasks = nextNewMask; 2825 if (nextNewMask == 0) { 2826 *out_masks = NULL; 2827 KMP_INTERNAL_FREE(newMasks); 2828 return; 2829 } 2830 *out_masks 2831 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2832 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2833 __kmp_free(sumMask); 2834 KMP_INTERNAL_FREE(newMasks); 2835 } 2836 2837 2838 # if OMP_40_ENABLED 2839 2840 /*----------------------------------------------------------------------------- 2841 2842 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2843 places. Again, Here is the grammar: 2844 2845 place_list := place 2846 place_list := place , place_list 2847 place := num 2848 place := place : num 2849 place := place : num : signed 2850 place := { subplacelist } 2851 place := ! 
place // (lowest priority) 2852 subplace_list := subplace 2853 subplace_list := subplace , subplace_list 2854 subplace := num 2855 subplace := num : num 2856 subplace := num : num : signed 2857 signed := num 2858 signed := + signed 2859 signed := - signed 2860 2861 -----------------------------------------------------------------------------*/ 2862 2863 static void 2864 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2865 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2866 { 2867 const char *next; 2868 2869 for (;;) { 2870 int start, count, stride, i; 2871 2872 // 2873 // Read in the starting proc id 2874 // 2875 SKIP_WS(*scan); 2876 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2877 "bad explicit places list"); 2878 next = *scan; 2879 SKIP_DIGITS(next); 2880 start = __kmp_str_to_int(*scan, *next); 2881 KMP_ASSERT(start >= 0); 2882 *scan = next; 2883 2884 // 2885 // valid follow sets are ',' ':' and '}' 2886 // 2887 SKIP_WS(*scan); 2888 if (**scan == '}' || **scan == ',') { 2889 if ((start > maxOsId) || 2890 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2891 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2892 && (__kmp_affinity_type != affinity_none))) { 2893 KMP_WARNING(AffIgnoreInvalidProcID, start); 2894 } 2895 } 2896 else { 2897 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2898 (*setSize)++; 2899 } 2900 if (**scan == '}') { 2901 break; 2902 } 2903 (*scan)++; // skip ',' 2904 continue; 2905 } 2906 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2907 (*scan)++; // skip ':' 2908 2909 // 2910 // Read count parameter 2911 // 2912 SKIP_WS(*scan); 2913 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2914 "bad explicit places list"); 2915 next = *scan; 2916 SKIP_DIGITS(next); 2917 count = __kmp_str_to_int(*scan, *next); 2918 KMP_ASSERT(count >= 0); 2919 *scan = next; 2920 2921 // 2922 // valid follow sets are ',' ':' and '}' 2923 // 2924 SKIP_WS(*scan); 2925 if (**scan == '}' || **scan == ',') { 2926 for (i = 0; i < count; i++) { 2927 if ((start > maxOsId) || 2928 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2929 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2930 && (__kmp_affinity_type != affinity_none))) { 2931 KMP_WARNING(AffIgnoreInvalidProcID, start); 2932 } 2933 break; // don't proliferate warnings for large count 2934 } 2935 else { 2936 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2937 start++; 2938 (*setSize)++; 2939 } 2940 } 2941 if (**scan == '}') { 2942 break; 2943 } 2944 (*scan)++; // skip ',' 2945 continue; 2946 } 2947 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2948 (*scan)++; // skip ':' 2949 2950 // 2951 // Read stride parameter 2952 // 2953 int sign = +1; 2954 for (;;) { 2955 SKIP_WS(*scan); 2956 if (**scan == '+') { 2957 (*scan)++; // skip '+' 2958 continue; 2959 } 2960 if (**scan == '-') { 2961 sign *= -1; 2962 (*scan)++; // skip '-' 2963 continue; 2964 } 2965 break; 2966 } 2967 SKIP_WS(*scan); 2968 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2969 "bad explicit places list"); 2970 next = *scan; 2971 SKIP_DIGITS(next); 2972 stride = __kmp_str_to_int(*scan, *next); 2973 KMP_ASSERT(stride >= 0); 2974 *scan = next; 2975 stride *= sign; 2976 2977 // 2978 // valid follow sets are ',' and '}' 2979 // 2980 SKIP_WS(*scan); 2981 if (**scan == '}' || **scan == ',') { 2982 for (i = 0; i < count; i++) { 2983 if ((start > maxOsId) || 2984 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2985 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2986 && (__kmp_affinity_type != affinity_none))) { 2987 KMP_WARNING(AffIgnoreInvalidProcID, start); 2988 } 2989 break; // don't proliferate warnings for large count 2990 } 2991 else { 2992 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2993 start += stride; 2994 (*setSize)++; 2995 } 2996 } 2997 if (**scan == '}') { 2998 break; 2999 } 3000 (*scan)++; // skip ',' 3001 continue; 3002 } 3003 3004 KMP_ASSERT2(0, "bad explicit places list"); 3005 } 3006 } 3007 3008 3009 static void 3010 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 3011 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 3012 { 3013 const char *next; 3014 3015 // 3016 // valid follow sets are '{' '!' and num 3017 // 3018 SKIP_WS(*scan); 3019 if (**scan == '{') { 3020 (*scan)++; // skip '{' 3021 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3022 setSize); 3023 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3024 (*scan)++; // skip '}' 3025 } 3026 else if (**scan == '!') { 3027 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3028 KMP_CPU_COMPLEMENT(tempMask); 3029 (*scan)++; // skip '!' 3030 } 3031 else if ((**scan >= '0') && (**scan <= '9')) { 3032 next = *scan; 3033 SKIP_DIGITS(next); 3034 int num = __kmp_str_to_int(*scan, *next); 3035 KMP_ASSERT(num >= 0); 3036 if ((num > maxOsId) || 3037 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3038 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3039 && (__kmp_affinity_type != affinity_none))) { 3040 KMP_WARNING(AffIgnoreInvalidProcID, num); 3041 } 3042 } 3043 else { 3044 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3045 (*setSize)++; 3046 } 3047 *scan = next; // skip num 3048 } 3049 else { 3050 KMP_ASSERT2(0, "bad explicit places list"); 3051 } 3052 } 3053 3054 3055 static void 3056 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3057 unsigned int *out_numMasks, const char *placelist, 3058 kmp_affin_mask_t *osId2Mask, int maxOsId) 3059 { 3060 const char *scan = placelist; 3061 const char *next = placelist; 3062 3063 numNewMasks = 2; 3064 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 3065 * __kmp_affin_mask_size); 3066 nextNewMask = 0; 3067 3068 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 3069 __kmp_affin_mask_size); 3070 KMP_CPU_ZERO(tempMask); 3071 int setSize = 0; 3072 3073 for (;;) { 3074 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3075 3076 // 3077 // valid follow sets are ',' ':' and EOL 3078 // 3079 SKIP_WS(scan); 3080 if (*scan == '\0' || *scan == ',') { 3081 if (setSize > 0) { 3082 ADD_MASK(tempMask); 3083 } 3084 KMP_CPU_ZERO(tempMask); 3085 setSize = 0; 3086 if (*scan == '\0') { 3087 break; 3088 } 3089 scan++; // skip ',' 3090 continue; 3091 } 3092 3093 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3094 scan++; // skip ':' 3095 3096 // 3097 // Read count parameter 3098 // 3099 SKIP_WS(scan); 3100 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3101 "bad explicit places list"); 3102 next = scan; 3103 SKIP_DIGITS(next); 3104 int count = __kmp_str_to_int(scan, *next); 3105 KMP_ASSERT(count >= 0); 3106 scan = next; 3107 3108 // 3109 // valid follow sets are ',' ':' and EOL 3110 // 3111 SKIP_WS(scan); 3112 if (*scan == '\0' || *scan == ',') { 3113 int i; 3114 for (i = 0; i < count; i++) { 3115 int j; 3116 if (setSize == 0) { 3117 break; 3118 } 3119 ADD_MASK(tempMask); 3120 
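// (Illustrative note: the current place has just been committed; the loop below then forms the next place by shifting every bit in tempMask up by one OS proc, so a place list entry such as "{0,1}:4" coming through this branch produces the places {0,1}, {1,2}, {2,3} and {3,4}.)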
setSize = 0; 3121 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j > 0; j--) { 3122 // 3123 // Use a temp var in case macro is changed to evaluate 3124 // args multiple times. 3125 // 3126 if (KMP_CPU_ISSET(j - 1, tempMask)) { 3127 KMP_CPU_SET(j, tempMask); 3128 setSize++; 3129 } 3130 else { 3131 KMP_CPU_CLR(j, tempMask); 3132 } 3133 } 3134 for (; j >= 0; j--) { 3135 KMP_CPU_CLR(j, tempMask); 3136 } 3137 } 3138 KMP_CPU_ZERO(tempMask); 3139 setSize = 0; 3140 3141 if (*scan == '\0') { 3142 break; 3143 } 3144 scan++; // skip ',' 3145 continue; 3146 } 3147 3148 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3149 scan++; // skip ':' 3150 3151 // 3152 // Read stride parameter 3153 // 3154 int sign = +1; 3155 for (;;) { 3156 SKIP_WS(scan); 3157 if (*scan == '+') { 3158 scan++; // skip '+' 3159 continue; 3160 } 3161 if (*scan == '-') { 3162 sign *= -1; 3163 scan++; // skip '-' 3164 continue; 3165 } 3166 break; 3167 } 3168 SKIP_WS(scan); 3169 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3170 "bad explicit places list"); 3171 next = scan; 3172 SKIP_DIGITS(next); 3173 int stride = __kmp_str_to_int(scan, *next); 3174 KMP_DEBUG_ASSERT(stride >= 0); 3175 scan = next; 3176 stride *= sign; 3177 3178 if (stride > 0) { 3179 int i; 3180 for (i = 0; i < count; i++) { 3181 int j; 3182 if (setSize == 0) { 3183 break; 3184 } 3185 ADD_MASK(tempMask); 3186 setSize = 0; 3187 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 3188 if (KMP_CPU_ISSET(j - stride, tempMask)) { 3189 KMP_CPU_SET(j, tempMask); 3190 setSize++; 3191 } 3192 else { 3193 KMP_CPU_CLR(j, tempMask); 3194 } 3195 } 3196 for (; j >= 0; j--) { 3197 KMP_CPU_CLR(j, tempMask); 3198 } 3199 } 3200 } 3201 else { 3202 int i; 3203 for (i = 0; i < count; i++) { 3204 unsigned j; 3205 if (setSize == 0) { 3206 break; 3207 } 3208 ADD_MASK(tempMask); 3209 setSize = 0; 3210 for (j = 0; j < (__kmp_affin_mask_size * CHAR_BIT) + stride; 3211 j++) { 3212 if (KMP_CPU_ISSET(j - stride, tempMask)) { 3213 KMP_CPU_SET(j, tempMask); 3214 setSize++; 3215 } 3216 else { 3217 KMP_CPU_CLR(j, tempMask); 3218 } 3219 } 3220 for (; j < __kmp_affin_mask_size * CHAR_BIT; j++) { 3221 KMP_CPU_CLR(j, tempMask); 3222 } 3223 } 3224 } 3225 KMP_CPU_ZERO(tempMask); 3226 setSize = 0; 3227 3228 // 3229 // valid follow sets are ',' and EOL 3230 // 3231 SKIP_WS(scan); 3232 if (*scan == '\0') { 3233 break; 3234 } 3235 if (*scan == ',') { 3236 scan++; // skip ',' 3237 continue; 3238 } 3239 3240 KMP_ASSERT2(0, "bad explicit places list"); 3241 } 3242 3243 *out_numMasks = nextNewMask; 3244 if (nextNewMask == 0) { 3245 *out_masks = NULL; 3246 KMP_INTERNAL_FREE(newMasks); 3247 return; 3248 } 3249 *out_masks 3250 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3251 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3252 __kmp_free(tempMask); 3253 KMP_INTERNAL_FREE(newMasks); 3254 } 3255 3256 # endif /* OMP_40_ENABLED */ 3257 3258 #undef ADD_MASK 3259 #undef ADD_MASK_OSID 3260 3261 3262 # if KMP_MIC 3263 3264 static void 3265 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3266 { 3267 if ( __kmp_place_num_cores == 0 ) { 3268 if ( __kmp_place_num_threads_per_core == 0 ) { 3269 return; // no cores limiting actions requested, exit 3270 } 3271 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3272 } 3273 if ( !__kmp_affinity_uniform_topology() || depth != 3 ) { 3274 KMP_WARNING( AffThrPlaceUnsupported ); 3275 return; // don't support non-uniform topology or not-3-level architecture 3276 } 3277 if ( 
__kmp_place_num_threads_per_core == 0 ) { 3278 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3279 } 3280 if ( __kmp_place_core_offset + __kmp_place_num_cores > nCoresPerPkg ) { 3281 KMP_WARNING( AffThrPlaceManyCores ); 3282 return; 3283 } 3284 3285 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3286 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3287 int i, j, k, n_old = 0, n_new = 0; 3288 for ( i = 0; i < nPackages; ++i ) { 3289 for ( j = 0; j < nCoresPerPkg; ++j ) { 3290 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3291 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3292 } else { 3293 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3294 if ( k < __kmp_place_num_threads_per_core ) { 3295 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location 3296 n_new++; 3297 } 3298 n_old++; 3299 } 3300 } 3301 } 3302 } 3303 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3304 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3305 __kmp_avail_proc = n_new; // correct avail_proc 3306 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3307 3308 __kmp_free( *pAddr ); 3309 *pAddr = newAddr; // replace old topology with new one 3310 } 3311 3312 # endif /* KMP_MIC */ 3313 3314 3315 static AddrUnsPair *address2os = NULL; 3316 static int * procarr = NULL; 3317 static int __kmp_aff_depth = 0; 3318 3319 static void 3320 __kmp_aux_affinity_initialize(void) 3321 { 3322 if (__kmp_affinity_masks != NULL) { 3323 KMP_ASSERT(fullMask != NULL); 3324 return; 3325 } 3326 3327 // 3328 // Create the "full" mask - this defines all of the processors that we 3329 // consider to be in the machine model. If respect is set, then it is 3330 // the initialization thread's affinity mask. Otherwise, it is all 3331 // processors that we know about on the machine. 3332 // 3333 if (fullMask == NULL) { 3334 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3335 } 3336 if (KMP_AFFINITY_CAPABLE()) { 3337 if (__kmp_affinity_respect_mask) { 3338 __kmp_get_system_affinity(fullMask, TRUE); 3339 3340 // 3341 // Count the number of available processors. 3342 // 3343 unsigned i; 3344 __kmp_avail_proc = 0; 3345 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3346 if (! KMP_CPU_ISSET(i, fullMask)) { 3347 continue; 3348 } 3349 __kmp_avail_proc++; 3350 } 3351 if (__kmp_avail_proc > __kmp_xproc) { 3352 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3353 && (__kmp_affinity_type != affinity_none))) { 3354 KMP_WARNING(ErrorInitializeAffinity); 3355 } 3356 __kmp_affinity_type = affinity_none; 3357 __kmp_affin_mask_size = 0; 3358 return; 3359 } 3360 } 3361 else { 3362 __kmp_affinity_entire_machine_mask(fullMask); 3363 __kmp_avail_proc = __kmp_xproc; 3364 } 3365 } 3366 3367 int depth = -1; 3368 kmp_i18n_id_t msg_id = kmp_i18n_null; 3369 3370 // 3371 // For backward compatibility, setting KMP_CPUINFO_FILE => 3372 // KMP_TOPOLOGY_METHOD=cpuinfo 3373 // 3374 if ((__kmp_cpuinfo_file != NULL) && 3375 (__kmp_affinity_top_method == affinity_top_method_all)) { 3376 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3377 } 3378 3379 if (__kmp_affinity_top_method == affinity_top_method_all) { 3380 // 3381 // In the default code path, errors are not fatal - we just try using 3382 // another method. 
We only emit a warning message if affinity is on, 3383 // or the verbose flag is set, an the nowarnings flag was not set. 3384 // 3385 const char *file_name = NULL; 3386 int line = 0; 3387 3388 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3389 3390 if (__kmp_affinity_verbose) { 3391 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3392 } 3393 3394 file_name = NULL; 3395 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3396 if (depth == 0) { 3397 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3398 KMP_ASSERT(address2os == NULL); 3399 return; 3400 } 3401 3402 if (depth < 0) { 3403 if ((msg_id != kmp_i18n_null) 3404 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3405 && (__kmp_affinity_type != affinity_none)))) { 3406 # if KMP_MIC 3407 if (__kmp_affinity_verbose) { 3408 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3409 KMP_I18N_STR(DecodingLegacyAPIC)); 3410 } 3411 # else 3412 KMP_WARNING(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3413 KMP_I18N_STR(DecodingLegacyAPIC)); 3414 # endif 3415 } 3416 3417 file_name = NULL; 3418 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3419 if (depth == 0) { 3420 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3421 KMP_ASSERT(address2os == NULL); 3422 return; 3423 } 3424 } 3425 3426 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3427 3428 # if KMP_OS_LINUX 3429 3430 if (depth < 0) { 3431 if ((msg_id != kmp_i18n_null) 3432 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3433 && (__kmp_affinity_type != affinity_none)))) { 3434 # if KMP_MIC 3435 if (__kmp_affinity_verbose) { 3436 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3437 } 3438 # else 3439 KMP_WARNING(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3440 # endif 3441 } 3442 else if (__kmp_affinity_verbose) { 3443 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3444 } 3445 3446 FILE *f = fopen("/proc/cpuinfo", "r"); 3447 if (f == NULL) { 3448 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3449 } 3450 else { 3451 file_name = "/proc/cpuinfo"; 3452 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3453 fclose(f); 3454 if (depth == 0) { 3455 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3456 KMP_ASSERT(address2os == NULL); 3457 return; 3458 } 3459 } 3460 } 3461 3462 # endif /* KMP_OS_LINUX */ 3463 3464 if (depth < 0) { 3465 if (msg_id != kmp_i18n_null 3466 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3467 && (__kmp_affinity_type != affinity_none)))) { 3468 if (file_name == NULL) { 3469 KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3470 } 3471 else if (line == 0) { 3472 KMP_WARNING(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3473 } 3474 else { 3475 KMP_WARNING(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3476 } 3477 } 3478 3479 file_name = ""; 3480 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3481 if (depth == 0) { 3482 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3483 KMP_ASSERT(address2os == NULL); 3484 return; 3485 } 3486 KMP_ASSERT(depth > 0); 3487 KMP_ASSERT(address2os != NULL); 3488 } 3489 } 3490 3491 // 3492 // If the user has specified that a paricular topology discovery method 3493 // is to be used, then we abort if that method fails. The exception is 3494 // group affinity, which might have been implicitly set. 
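// For reference (an illustrative sketch of the dispatch below, keyed off the affinity_top_method_* enum): explicitly requesting x2apicid, apicid, cpuinfo, group, or flat selects the corresponding map builder, and a failure is fatal (except for group affinity, as noted). For example, setting KMP_TOPOLOGY_METHOD=cpuinfo together with KMP_CPUINFO_FILE pointing at a saved cpuinfo dump (a hypothetical /tmp/machine.cpuinfo, say) forces the cpuinfo parser above to run on that file instead of /proc/cpuinfo.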
3495 // 3496 3497 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3498 3499 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3500 if (__kmp_affinity_verbose) { 3501 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3502 KMP_I18N_STR(Decodingx2APIC)); 3503 } 3504 3505 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3506 if (depth == 0) { 3507 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3508 KMP_ASSERT(address2os == NULL); 3509 return; 3510 } 3511 3512 if (depth < 0) { 3513 KMP_ASSERT(msg_id != kmp_i18n_null); 3514 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3515 } 3516 } 3517 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3518 if (__kmp_affinity_verbose) { 3519 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3520 KMP_I18N_STR(DecodingLegacyAPIC)); 3521 } 3522 3523 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3524 if (depth == 0) { 3525 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3526 KMP_ASSERT(address2os == NULL); 3527 return; 3528 } 3529 3530 if (depth < 0) { 3531 KMP_ASSERT(msg_id != kmp_i18n_null); 3532 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3533 } 3534 } 3535 3536 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3537 3538 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3539 const char *filename; 3540 if (__kmp_cpuinfo_file != NULL) { 3541 filename = __kmp_cpuinfo_file; 3542 } 3543 else { 3544 filename = "/proc/cpuinfo"; 3545 } 3546 3547 if (__kmp_affinity_verbose) { 3548 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3549 } 3550 3551 FILE *f = fopen(filename, "r"); 3552 if (f == NULL) { 3553 int code = errno; 3554 if (__kmp_cpuinfo_file != NULL) { 3555 __kmp_msg( 3556 kmp_ms_fatal, 3557 KMP_MSG(CantOpenFileForReading, filename), 3558 KMP_ERR(code), 3559 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3560 __kmp_msg_null 3561 ); 3562 } 3563 else { 3564 __kmp_msg( 3565 kmp_ms_fatal, 3566 KMP_MSG(CantOpenFileForReading, filename), 3567 KMP_ERR(code), 3568 __kmp_msg_null 3569 ); 3570 } 3571 } 3572 int line = 0; 3573 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3574 fclose(f); 3575 if (depth < 0) { 3576 KMP_ASSERT(msg_id != kmp_i18n_null); 3577 if (line > 0) { 3578 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3579 } 3580 else { 3581 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3582 } 3583 } 3584 if (__kmp_affinity_type == affinity_none) { 3585 KMP_ASSERT(depth == 0); 3586 KMP_ASSERT(address2os == NULL); 3587 return; 3588 } 3589 } 3590 3591 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64 3592 3593 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3594 if (__kmp_affinity_verbose) { 3595 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3596 } 3597 3598 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3599 KMP_ASSERT(depth != 0); 3600 3601 if (depth < 0) { 3602 if ((msg_id != kmp_i18n_null) 3603 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3604 && (__kmp_affinity_type != affinity_none)))) { 3605 KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3606 } 3607 3608 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3609 if (depth == 0) { 3610 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3611 KMP_ASSERT(address2os == NULL); 3612 return; 3613 } 3614 // should not fail 3615 KMP_ASSERT(depth > 0); 3616 KMP_ASSERT(address2os != NULL); 3617 } 3618 } 3619 3620 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ 3621 3622 else if (__kmp_affinity_top_method == 
affinity_top_method_flat) { 3623 if (__kmp_affinity_verbose) { 3624 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3625 } 3626 3627 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3628 if (depth == 0) { 3629 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3630 KMP_ASSERT(address2os == NULL); 3631 return; 3632 } 3633 // should not fail 3634 KMP_ASSERT(depth > 0); 3635 KMP_ASSERT(address2os != NULL); 3636 } 3637 3638 if (address2os == NULL) { 3639 if (KMP_AFFINITY_CAPABLE() 3640 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3641 && (__kmp_affinity_type != affinity_none)))) { 3642 KMP_WARNING(ErrorInitializeAffinity); 3643 } 3644 __kmp_affinity_type = affinity_none; 3645 __kmp_affin_mask_size = 0; 3646 return; 3647 } 3648 3649 # if KMP_MIC 3650 __kmp_apply_thread_places(&address2os, depth); 3651 # endif 3652 3653 // 3654 // Create the table of masks, indexed by thread Id. 3655 // 3656 unsigned maxIndex; 3657 unsigned numUnique; 3658 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3659 address2os, __kmp_avail_proc); 3660 if (__kmp_affinity_gran_levels == 0) { 3661 KMP_DEBUG_ASSERT(numUnique == __kmp_avail_proc); 3662 } 3663 3664 // 3665 // Set the childNums vector in all Address objects. This must be done 3666 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3667 // which takes into account the setting of __kmp_affinity_compact. 3668 // 3669 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3670 3671 switch (__kmp_affinity_type) { 3672 3673 case affinity_explicit: 3674 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3675 # if OMP_40_ENABLED 3676 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3677 # endif 3678 { 3679 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3680 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3681 maxIndex); 3682 } 3683 # if OMP_40_ENABLED 3684 else { 3685 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3686 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3687 maxIndex); 3688 } 3689 # endif 3690 if (__kmp_affinity_num_masks == 0) { 3691 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3692 && (__kmp_affinity_type != affinity_none))) { 3693 KMP_WARNING(AffNoValidProcID); 3694 } 3695 __kmp_affinity_type = affinity_none; 3696 return; 3697 } 3698 break; 3699 3700 // 3701 // The other affinity types rely on sorting the Addresses according 3702 // to some permutation of the machine topology tree. Set 3703 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3704 // then jump to a common code fragment to do the sort and create 3705 // the array of affinity masks. 
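// Rough intuition (illustrative): with affinity_compact the permutation keeps the topology order largely as-is, so consecutive gtids land on neighbouring hardware thread contexts, while affinity_scatter flips the value (depth - 1 - compact below) so that consecutive gtids are spread across packages before filling cores and thread contexts.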
3706 // 3707 3708 case affinity_logical: 3709 __kmp_affinity_compact = 0; 3710 if (__kmp_affinity_offset) { 3711 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3712 % __kmp_avail_proc; 3713 } 3714 goto sortAddresses; 3715 3716 case affinity_physical: 3717 if (__kmp_nThreadsPerCore > 1) { 3718 __kmp_affinity_compact = 1; 3719 if (__kmp_affinity_compact >= depth) { 3720 __kmp_affinity_compact = 0; 3721 } 3722 } else { 3723 __kmp_affinity_compact = 0; 3724 } 3725 if (__kmp_affinity_offset) { 3726 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3727 % __kmp_avail_proc; 3728 } 3729 goto sortAddresses; 3730 3731 case affinity_scatter: 3732 if (__kmp_affinity_compact >= depth) { 3733 __kmp_affinity_compact = 0; 3734 } 3735 else { 3736 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3737 } 3738 goto sortAddresses; 3739 3740 case affinity_compact: 3741 if (__kmp_affinity_compact >= depth) { 3742 __kmp_affinity_compact = depth - 1; 3743 } 3744 goto sortAddresses; 3745 3746 # if KMP_MIC 3747 case affinity_balanced: 3748 // Balanced works only for the case of a single package and uniform topology 3749 if( nPackages > 1 ) { 3750 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3751 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3752 } 3753 __kmp_affinity_type = affinity_none; 3754 return; 3755 } else if( __kmp_affinity_uniform_topology() ) { 3756 break; 3757 } else { // Non-uniform topology 3758 3759 // Save the depth for further usage 3760 __kmp_aff_depth = depth; 3761 3762 // Number of hyper threads per core in HT machine 3763 int nth_per_core = __kmp_nThreadsPerCore; 3764 3765 int core_level; 3766 if( nth_per_core > 1 ) { 3767 core_level = depth - 2; 3768 } else { 3769 core_level = depth - 1; 3770 } 3771 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3772 int nproc = nth_per_core * ncores; 3773 3774 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3775 for( int i = 0; i < nproc; i++ ) { 3776 procarr[ i ] = -1; 3777 } 3778 3779 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3780 int proc = address2os[ i ].second; 3781 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3782 // If there is only one thread per core then depth == 2: level 0 - package, 3783 // level 1 - core. 3784 int level = depth - 1; 3785 3786 // __kmp_nth_per_core == 1 3787 int thread = 0; 3788 int core = address2os[ i ].first.labels[ level ]; 3789 // If the thread level exists, that is we have more than one thread context per core 3790 if( nth_per_core > 1 ) { 3791 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3792 core = address2os[ i ].first.labels[ level - 1 ]; 3793 } 3794 procarr[ core * nth_per_core + thread ] = proc; 3795 } 3796 3797 break; 3798 } 3799 # endif 3800 3801 sortAddresses: 3802 // 3803 // Allocate the gtid->affinity mask table. 
3804 // 3805 if (__kmp_affinity_dups) { 3806 __kmp_affinity_num_masks = __kmp_avail_proc; 3807 } 3808 else { 3809 __kmp_affinity_num_masks = numUnique; 3810 } 3811 3812 # if OMP_40_ENABLED 3813 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3814 && ( __kmp_affinity_num_places > 0 ) 3815 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3816 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3817 } 3818 # endif 3819 3820 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3821 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3822 3823 // 3824 // Sort the address2os table according to the current setting of 3825 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3826 // 3827 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3828 __kmp_affinity_cmp_Address_child_num); 3829 { 3830 int i; 3831 unsigned j; 3832 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3833 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3834 continue; 3835 } 3836 unsigned osId = address2os[i].second; 3837 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3838 kmp_affin_mask_t *dest 3839 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3840 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3841 KMP_CPU_COPY(dest, src); 3842 if (++j >= __kmp_affinity_num_masks) { 3843 break; 3844 } 3845 } 3846 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3847 } 3848 break; 3849 3850 default: 3851 KMP_ASSERT2(0, "Unexpected affinity setting"); 3852 } 3853 3854 __kmp_free(osId2Mask); 3855 } 3856 3857 3858 void 3859 __kmp_affinity_initialize(void) 3860 { 3861 // 3862 // Much of the code above was written assumming that if a machine was not 3863 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3864 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3865 // 3866 // There are too many checks for __kmp_affinity_type == affinity_none 3867 // in this code. Instead of trying to change them all, check if 3868 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3869 // affinity_none, call the real initialization routine, then restore 3870 // __kmp_affinity_type to affinity_disabled. 3871 // 3872 int disabled = (__kmp_affinity_type == affinity_disabled); 3873 if (! KMP_AFFINITY_CAPABLE()) { 3874 KMP_ASSERT(disabled); 3875 } 3876 if (disabled) { 3877 __kmp_affinity_type = affinity_none; 3878 } 3879 __kmp_aux_affinity_initialize(); 3880 if (disabled) { 3881 __kmp_affinity_type = affinity_disabled; 3882 } 3883 } 3884 3885 3886 void 3887 __kmp_affinity_uninitialize(void) 3888 { 3889 if (__kmp_affinity_masks != NULL) { 3890 __kmp_free(__kmp_affinity_masks); 3891 __kmp_affinity_masks = NULL; 3892 } 3893 if (fullMask != NULL) { 3894 KMP_CPU_FREE(fullMask); 3895 fullMask = NULL; 3896 } 3897 __kmp_affinity_num_masks = 0; 3898 # if OMP_40_ENABLED 3899 __kmp_affinity_num_places = 0; 3900 # endif 3901 if (__kmp_affinity_proclist != NULL) { 3902 __kmp_free(__kmp_affinity_proclist); 3903 __kmp_affinity_proclist = NULL; 3904 } 3905 if( address2os != NULL ) { 3906 __kmp_free( address2os ); 3907 address2os = NULL; 3908 } 3909 if( procarr != NULL ) { 3910 __kmp_free( procarr ); 3911 procarr = NULL; 3912 } 3913 } 3914 3915 3916 void 3917 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3918 { 3919 if (! 

void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set, or if __kmp_affinity_respect_mask
    // is set, then the full mask is the same as the mask of the initialization
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none)
# if KMP_MIC
          || (__kmp_affinity_type == affinity_balanced)
# endif
          ) {
# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = -1;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", gtid, buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}
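
//
// Note on place partitions (illustrative): a thread's partition
// [th_first_place, th_last_place] may wrap around the end of the place list.
// For example, with 8 places, first_place == 6 and last_place == 1 would
// denote the places {6, 7, 0, 1}; the assertions in __kmp_affinity_set_place
// below cover both the wrapped and the non-wrapped ordering.
//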

# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_DEBUG_ASSERT(th->th.th_new_place >= 0);
    KMP_DEBUG_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_DEBUG_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_DEBUG_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */
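
//
// The __kmp_aux_* routines below back the user-visible Intel affinity API
// (kmp_set_affinity, kmp_get_affinity, kmp_set/unset/get_affinity_mask_proc).
// A minimal usage sketch, assuming the declarations exported via omp.h:
//
//     kmp_affinity_mask_t m;
//     kmp_create_affinity_mask(&m);
//     kmp_set_affinity_mask_proc(2, &m);     // add OS proc 2 to the mask
//     if (kmp_set_affinity(&m) != 0) {
//         // affinity not capable, or the mask was rejected
//     }
//     kmp_destroy_affinity_mask(&m);
//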

int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}


int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}
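
//
// The unset/get variants below follow the same return convention as
// __kmp_aux_set_affinity_mask_proc above: -1 when affinity is not capable or
// the proc id is out of range, -2 when the proc is not in fullMask, and 0 on
// success.  The "get" variant differs slightly: it returns 0 (not an error
// code) for procs it cannot query, and otherwise returns whether the proc is
// set in the supplied mask.
//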

int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}
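
//
// Worked example for the uniform-topology branch of __kmp_balanced_affinity
// below (illustrative numbers): with nthreads == 10 and ncores == 4,
// chunk == 10 / 4 == 2 and big_cores == 10 % 4 == 2, so big_nth == 6.
// Threads 0-2 land on core 0 and threads 3-5 on core 1 (the "big" cores get
// chunk + 1 == 3 threads each), while threads 6-7 land on core 2 and
// threads 8-9 on core 3 (chunk == 2 threads each).
//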

# if KMP_MIC

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / ( chunk + 1 );
            threadID = ( tid % ( chunk + 1 ) ) % __kmp_nth_per_core;
        } else { // tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core;
        }

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; trailing cores with 0 available processors are not counted
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For a performance gain, handle the special case nthreads == __kmp_avail_proc separately
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most
                // nth_per_core of them, and since address2os is sorted we can stop
                // as soon as cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with exactly "x" available processors
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with at least x and at most nth_per_core available processors
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }
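
            // The loop further below spreads the nthreads threads over the
            // available contexts: on the first pass (flag == 0) each context
            // receives at most one thread; on later passes (flag == 1)
            // contexts may take additional threads until all nthreads have
            // been placed.  The counts gathered above bound how many contexts
            // are touched per inner pass.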

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // An array to keep the number of threads bound to each context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with 0 available processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ]++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

# endif /* KMP_MIC */

#endif // KMP_AFFINITY_SUPPORTED