/*
 * kmp_affinity.cpp -- affinity management
 * $Revision: 42613 $
 * $Date: 2013-08-23 13:29:50 -0500 (Fri, 23 Aug 2013) $
 */


//===----------------------------------------------------------------------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses.  See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//


#include "kmp.h"
#include "kmp_i18n.h"
#include "kmp_io.h"
#include "kmp_str.h"


#if KMP_OS_WINDOWS || KMP_OS_LINUX

//
// Print the affinity mask to the character array in a pretty format.
//
char *
__kmp_affinity_print_mask(char *buf, int buf_len, kmp_affin_mask_t *mask)
{
    KMP_ASSERT(buf_len >= 40);
    char *scan = buf;
    char *end = buf + buf_len - 1;

    //
    // Find first element / check for empty set.
    //
    size_t i;
    for (i = 0; i < KMP_CPU_SETSIZE; i++) {
        if (KMP_CPU_ISSET(i, mask)) {
            break;
        }
    }
    if (i == KMP_CPU_SETSIZE) {
        sprintf(scan, "{<empty>}");
        while (*scan != '\0') scan++;
        KMP_ASSERT(scan <= end);
        return buf;
    }

    sprintf(scan, "{%ld", i);
    while (*scan != '\0') scan++;
    i++;
    for (; i < KMP_CPU_SETSIZE; i++) {
        if (! KMP_CPU_ISSET(i, mask)) {
            continue;
        }

        //
        // Check for buffer overflow.  A string of the form ",<n>" will have
        // at most 10 characters, plus we want to leave room to print ",...}"
        // if the set is too large to print for a total of 15 characters.
        // We already left room for '\0' in setting end.
        //
        if (end - scan < 15) {
            break;
        }
        sprintf(scan, ",%-ld", i);
        while (*scan != '\0') scan++;
    }
    if (i < KMP_CPU_SETSIZE) {
        sprintf(scan, ",...");
        while (*scan != '\0') scan++;
    }
    sprintf(scan, "}");
    while (*scan != '\0') scan++;
    KMP_ASSERT(scan <= end);
    return buf;
}


void
__kmp_affinity_entire_machine_mask(kmp_affin_mask_t *mask)
{
    KMP_CPU_ZERO(mask);

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

    if (__kmp_num_proc_groups > 1) {
        int group;
        struct GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetActiveProcessorCount != NULL);
        for (group = 0; group < __kmp_num_proc_groups; group++) {
            int i;
            int num = __kmp_GetActiveProcessorCount(group);
            for (i = 0; i < num; i++) {
                KMP_CPU_SET(i + group * (CHAR_BIT * sizeof(DWORD_PTR)), mask);
            }
        }
    }
    else

# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

    {
        int proc;
        for (proc = 0; proc < __kmp_xproc; proc++) {
            KMP_CPU_SET(proc, mask);
        }
    }
}
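
//
// Illustration only (hypothetical counts): with __kmp_num_proc_groups == 2
// and 24 active procs in each group, the loop above sets bits 0..23 for
// group 0 and bits 64..87 for group 1, since each Windows* OS processor
// group occupies CHAR_BIT * sizeof(DWORD_PTR) == 64 bits of the mask, e.g.
//
//     KMP_CPU_SET(5 + 1 * (CHAR_BIT * sizeof(DWORD_PTR)), mask);  // proc 5 of group 1 -> bit 69
//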

//
// In Linux* OS debug & cover (-O0) builds, we need to avoid inline member
// functions.
//
// The icc codegen emits sections with extremely long names, of the form
// ".gnu.linkonce.<mangled_name>".  There seems to have been a linker bug
// introduced between GNU ld version 2.14.90.0.4 and 2.15.92.0.2 involving
// some sort of memory corruption or table overflow that is triggered by
// these long strings.  I checked the latest version of the linker -
// GNU ld (Linux* OS/GNU Binutils) 2.18.50.0.7.20080422 - and the bug is not
// fixed.
//
// Unfortunately, my attempts to reproduce it in a smaller example have
// failed - I'm not sure what the prospects are of getting it fixed
// properly - but we need a reproducer smaller than all of libiomp.
//
// Work around the problem by avoiding inline constructors in such builds.
// We do this for all platforms, not just Linux* OS - non-inline functions
// are more debuggable and provide better coverage than inline functions.
// Use inline functions in shipping libs, for performance.
//

# if !defined(KMP_DEBUG) && !defined(COVER)

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth)
      : depth(_depth), leader(FALSE) {
    }
    Address &operator=(const Address &b) {
        depth = b.depth;
        for (unsigned i = 0; i < depth; i++) {
            labels[i] = b.labels[i];
            childNums[i] = b.childNums[i];
        }
        leader = FALSE;
        return *this;
    }
    bool operator==(const Address &b) const {
        if (depth != b.depth)
            return false;
        for (unsigned i = 0; i < depth; i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool isClose(const Address &b, int level) const {
        if (depth != b.depth)
            return false;
        if ((unsigned)level >= depth)
            return true;
        for (unsigned i = 0; i < (depth - level); i++)
            if (labels[i] != b.labels[i])
                return false;
        return true;
    }
    bool operator!=(const Address &b) const {
        return !operator==(b);
    }
};

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {
    }
    AddrUnsPair &operator=(const AddrUnsPair &b)
    {
        first = b.first;
        second = b.second;
        return *this;
    }
};

# else

class Address {
public:
    static const unsigned maxDepth = 32;
    unsigned labels[maxDepth];
    unsigned childNums[maxDepth];
    unsigned depth;
    unsigned leader;
    Address(unsigned _depth);
    Address &operator=(const Address &b);
    bool operator==(const Address &b) const;
    bool isClose(const Address &b, int level) const;
    bool operator!=(const Address &b) const;
};

Address::Address(unsigned _depth)
{
    depth = _depth;
    leader = FALSE;
}

Address &Address::operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
        labels[i] = b.labels[i];
        childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
}

bool Address::operator==(const Address &b) const {
    if (depth != b.depth)
        return false;
    for (unsigned i = 0; i < depth; i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::isClose(const Address &b, int level) const {
    if (depth != b.depth)
        return false;
    if ((unsigned)level >= depth)
        return true;
    for (unsigned i = 0; i < (depth - level); i++)
        if (labels[i] != b.labels[i])
            return false;
    return true;
}

bool Address::operator!=(const Address &b) const {
    return !operator==(b);
}

class AddrUnsPair {
public:
    Address first;
    unsigned second;
    AddrUnsPair(Address _first, unsigned _second);
    AddrUnsPair &operator=(const AddrUnsPair &b);
};

AddrUnsPair::AddrUnsPair(Address _first, unsigned _second)
  : first(_first), second(_second)
{
}

AddrUnsPair &AddrUnsPair::operator=(const AddrUnsPair &b)
{
    first = b.first;
    second = b.second;
    return *this;
}

# endif /* !defined(KMP_DEBUG) && !defined(COVER) */


static int
__kmp_affinity_cmp_Address_labels(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    for (i = 0; i < depth; i++) {
        if (aa->labels[i] < bb->labels[i]) return -1;
        if (aa->labels[i] > bb->labels[i]) return 1;
    }
    return 0;
}


static int
__kmp_affinity_cmp_Address_child_num(const void *a, const void *b)
{
    const Address *aa = (const Address *)&(((AddrUnsPair *)a)->first);
    const Address *bb = (const Address *)&(((AddrUnsPair *)b)->first);
    unsigned depth = aa->depth;
    unsigned i;
    KMP_DEBUG_ASSERT(depth == bb->depth);
    KMP_DEBUG_ASSERT((unsigned)__kmp_affinity_compact <= depth);
    KMP_DEBUG_ASSERT(__kmp_affinity_compact >= 0);
    for (i = 0; i < (unsigned)__kmp_affinity_compact; i++) {
        int j = depth - i - 1;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    for (; i < depth; i++) {
        int j = i - __kmp_affinity_compact;
        if (aa->childNums[j] < bb->childNums[j]) return -1;
        if (aa->childNums[j] > bb->childNums[j]) return 1;
    }
    return 0;
}
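
//
// Illustration only (hypothetical labels): with depth == 3 (package, core,
// thread) and __kmp_affinity_compact == 1, the comparison above uses
// childNums[2] (the innermost, thread level) as the primary key, then
// childNums[0] and childNums[1], so {pkg,core,thread} tuples sort as
//
//     {0,0,0} < {0,1,0} < {1,0,0} < {1,1,0} < {0,0,1} < {0,1,1} < ...
//
// i.e. the innermost __kmp_affinity_compact levels are compared first, then
// the remaining levels from the outermost inward.
//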

//
// When sorting by labels, __kmp_affinity_assign_child_nums() must first be
// called to renumber the labels from [0..n] and place them into the child_num
// vector of the address object.  This is done in case the labels used for
// the children at one node of the hierarchy differ from those used for
// another node at the same level.  Example: suppose the machine has 2 nodes
// with 2 packages each.  The first node contains packages 601 and 602, and
// the second node contains packages 603 and 604.  If we try to sort the
// table for "scatter" affinity, the table will still be sorted 601, 602,
// 603, 604 because we are paying attention to the labels themselves, not
// the ordinal child numbers.  By using the child numbers in the sort, the
// result is {0,0}=601, {0,1}=603, {1,0}=602, and {1,1}=604.
//
static void
__kmp_affinity_assign_child_nums(AddrUnsPair *address2os,
  int numAddrs)
{
    KMP_DEBUG_ASSERT(numAddrs > 0);
    int depth = address2os->first.depth;
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *lastLabel = (unsigned *)__kmp_allocate(depth
      * sizeof(unsigned));
    int labCt;
    for (labCt = 0; labCt < depth; labCt++) {
        address2os[0].first.childNums[labCt] = counts[labCt] = 0;
        lastLabel[labCt] = address2os[0].first.labels[labCt];
    }
    int i;
    for (i = 1; i < numAddrs; i++) {
        for (labCt = 0; labCt < depth; labCt++) {
            if (address2os[i].first.labels[labCt] != lastLabel[labCt]) {
                int labCt2;
                for (labCt2 = labCt + 1; labCt2 < depth; labCt2++) {
                    counts[labCt2] = 0;
                    lastLabel[labCt2] = address2os[i].first.labels[labCt2];
                }
                counts[labCt]++;
                lastLabel[labCt] = address2os[i].first.labels[labCt];
                break;
            }
        }
        for (labCt = 0; labCt < depth; labCt++) {
            address2os[i].first.childNums[labCt] = counts[labCt];
        }
        for (; labCt < (int)Address::maxDepth; labCt++) {
            address2os[i].first.childNums[labCt] = 0;
        }
    }
}


//
// All of the __kmp_affinity_create_*_map() routines should set
// __kmp_affinity_masks to a vector of affinity mask objects of length
// __kmp_affinity_num_masks, if __kmp_affinity_type != affinity_none, and
// return the number of levels in the machine topology tree (zero if
// __kmp_affinity_type == affinity_none).
//
// All of the __kmp_affinity_create_*_map() routines should set *fullMask
// to the affinity mask for the initialization thread.  They need to save and
// restore the mask, and it could be needed later, so saving it is just an
// optimization to avoid calling kmp_get_system_affinity() again.
//
static kmp_affin_mask_t *fullMask = NULL;

kmp_affin_mask_t *
__kmp_affinity_get_fullMask() { return fullMask; }


static int nCoresPerPkg, nPackages;
int __kmp_nThreadsPerCore;

//
// __kmp_affinity_uniform_topology() doesn't work when called from
// places which support arbitrarily many levels in the machine topology
// map, i.e. the non-default cases in __kmp_affinity_create_cpuinfo_map()
// and __kmp_affinity_create_x2apicid_map().
//
inline static bool
__kmp_affinity_uniform_topology()
{
    return __kmp_avail_proc == (__kmp_nThreadsPerCore * nCoresPerPkg * nPackages);
}
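
//
// Illustration only (hypothetical counts): with nPackages == 2,
// nCoresPerPkg == 8 and __kmp_nThreadsPerCore == 2, the topology is reported
// as uniform only when all 2 * 8 * 2 == 32 hardware threads are available,
// i.e. __kmp_avail_proc == 32.  If one package had only 6 cores, then
// __kmp_avail_proc would be 28 and the product test above would fail.
//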


//
// Print out the detailed machine topology map, i.e. the physical locations
// of each OS proc.
//
static void
__kmp_affinity_print_topology(AddrUnsPair *address2os, int len, int depth,
  int pkgLevel, int coreLevel, int threadLevel)
{
    int proc;

    KMP_INFORM(OSProcToPhysicalThreadMap, "KMP_AFFINITY");
    for (proc = 0; proc < len; proc++) {
        int level;
        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);
        for (level = 0; level < depth; level++) {
            if (level == threadLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Thread));
            }
            else if (level == coreLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Core));
            }
            else if (level == pkgLevel) {
                __kmp_str_buf_print(&buf, "%s ", KMP_I18N_STR(Package));
            }
            else if (level > pkgLevel) {
                __kmp_str_buf_print(&buf, "%s_%d ", KMP_I18N_STR(Node),
                  level - pkgLevel - 1);
            }
            else {
                __kmp_str_buf_print(&buf, "L%d ", level);
            }
            __kmp_str_buf_print(&buf, "%d ",
              address2os[proc].first.labels[level]);
        }
        KMP_INFORM(OSProcMapToPack, "KMP_AFFINITY", address2os[proc].second,
          buf.str);
        __kmp_str_buf_free(&buf);
    }
}


//
// If we don't know how to retrieve the machine's processor topology, or
// encounter an error in doing so, this routine is called to form a "flat"
// mapping of os thread id's <-> processor id's.
//
static int
__kmp_affinity_create_flat_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Even if __kmp_affinity_type == affinity_none, this routine might still
    // be called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        KMP_ASSERT(__kmp_affinity_type == affinity_none);
        __kmp_ncores = nPackages = __kmp_xproc;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffFlatTopology, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ncores = nPackages = __kmp_avail_proc;
    __kmp_nThreadsPerCore = nCoresPerPkg = 1;
    __kmp_ht_enabled = FALSE;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask);

        KMP_INFORM(AffCapableUseFlat, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        KMP_INFORM(Uniform, "KMP_AFFINITY");
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);
    }
    if (__kmp_affinity_type == affinity_none) {
        return 0;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    unsigned int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(1);
        addr.labels[0] = i;
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);
    }
    if (__kmp_affinity_verbose) {
        KMP_INFORM(OSProcToPackage, "KMP_AFFINITY");
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Only the package level is modeled in the machine topology map,
        // so the #levels of granularity is either 0 or 1.
        //
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels = 1;
        }
        else {
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 1;
}
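
//
// Illustration only (hypothetical mask): if fullMask contains OS procs
// {0, 1, 3, 5}, the flat map above yields four Address objects of depth 1,
//
//     labels[0] = 0 -> osId 0,   labels[0] = 1 -> osId 1,
//     labels[0] = 3 -> osId 3,   labels[0] = 5 -> osId 5,
//
// i.e. only the package level is modeled and each OS proc is treated as its
// own package.
//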


# if KMP_OS_WINDOWS && KMP_ARCH_X86_64

//
// If multiple Windows* OS processor groups exist, we can create a 2-level
// topology map with the groups at level 0 and the individual procs at
// level 1.
//
// This facilitates letting the threads float among all procs in a group,
// if granularity=group (the default when there are multiple groups).
//
static int
__kmp_affinity_create_proc_group_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // If we don't have multiple processor groups, return now.
    // The flat mapping will be used.
    //
    if ((! KMP_AFFINITY_CAPABLE()) || (__kmp_get_proc_group(fullMask) >= 0)) {
        // FIXME set *msg_id
        return -1;
    }

    //
    // Construct the data structure to be returned.
    //
    *address2os = (AddrUnsPair*)
      __kmp_allocate(sizeof(**address2os) * __kmp_avail_proc);
    int avail_ct = 0;
    int i;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }

        Address addr(2);
        addr.labels[0] = i / (CHAR_BIT * sizeof(DWORD_PTR));
        addr.labels[1] = i % (CHAR_BIT * sizeof(DWORD_PTR));
        (*address2os)[avail_ct++] = AddrUnsPair(addr,i);

        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffOSProcToGroup, "KMP_AFFINITY", i, addr.labels[0],
              addr.labels[1]);
        }
    }

    if (__kmp_affinity_gran_levels < 0) {
        if (__kmp_affinity_gran == affinity_gran_group) {
            __kmp_affinity_gran_levels = 1;
        }
        else if ((__kmp_affinity_gran == affinity_gran_fine)
          || (__kmp_affinity_gran == affinity_gran_thread)) {
            __kmp_affinity_gran_levels = 0;
        }
        else {
            const char *gran_str = NULL;
            if (__kmp_affinity_gran == affinity_gran_core) {
                gran_str = "core";
            }
            else if (__kmp_affinity_gran == affinity_gran_package) {
                gran_str = "package";
            }
            else if (__kmp_affinity_gran == affinity_gran_node) {
                gran_str = "node";
            }
            else {
                KMP_ASSERT(0);
            }

            // Warning: can't use affinity granularity \"gran\" with group topology method, using "thread"
            __kmp_affinity_gran_levels = 0;
        }
    }
    return 2;
}

# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */


# if KMP_ARCH_X86 || KMP_ARCH_X86_64

static int
__kmp_cpuid_mask_width(int count) {
    int r = 0;

    while((1<<r) < count)
        ++r;
    return r;
}
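
//
// Illustration: __kmp_cpuid_mask_width() returns the number of bits needed
// to encode "count" distinct values, e.g.
//
//     __kmp_cpuid_mask_width(1)  == 0
//     __kmp_cpuid_mask_width(6)  == 3    // 1 << 3 == 8 >= 6
//     __kmp_cpuid_mask_width(16) == 4
//
// These widths are used below to split a legacy Apic Id into its package,
// core, and thread fields.
//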


class apicThreadInfo {
public:
    unsigned osId;              // param to __kmp_affinity_bind_thread
    unsigned apicId;            // from cpuid after binding
    unsigned maxCoresPerPkg;    //      ""
    unsigned maxThreadsPerPkg;  //      ""
    unsigned pkgId;             // inferred from above values
    unsigned coreId;            //      ""
    unsigned threadId;          //      ""
};


static int
__kmp_affinity_cmp_apicThreadInfo_os_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->osId < bb->osId) return -1;
    if (aa->osId > bb->osId) return 1;
    return 0;
}


static int
__kmp_affinity_cmp_apicThreadInfo_phys_id(const void *a, const void *b)
{
    const apicThreadInfo *aa = (const apicThreadInfo *)a;
    const apicThreadInfo *bb = (const apicThreadInfo *)b;
    if (aa->pkgId < bb->pkgId) return -1;
    if (aa->pkgId > bb->pkgId) return 1;
    if (aa->coreId < bb->coreId) return -1;
    if (aa->coreId > bb->coreId) return 1;
    if (aa->threadId < bb->threadId) return -1;
    if (aa->threadId > bb->threadId) return 1;
    return 0;
}


//
// On IA-32 architecture and Intel(R) 64 architecture, we attempt to use
// an algorithm which cycles through the available os threads, setting
// the current thread's affinity mask to that thread, and then retrieves
// the Apic Id for each thread context using the cpuid instruction.
//
static int
__kmp_affinity_create_apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    int rc;
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

# if KMP_MIC
    {
        // The code below will use cpuid(4).
        // Check if cpuid(4) is supported.
        // FIXME? - this really doesn't need to be specific to MIC.
        kmp_cpuid buf;
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax < 4) {
            *msg_id = kmp_i18n_str_NoLeaf4Support;
            return -1;
        }
    }
# endif // KMP_MIC

    //
    // Even if __kmp_affinity_type == affinity_none, this routine is still
    // called to set __kmp_ht_enabled, & __kmp_ncores, as well as
    // __kmp_nThreadsPerCore, nCoresPerPkg, & nPackages.
    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(),
    // then we need to do something else.
    //
    if (! KMP_AFFINITY_CAPABLE()) {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        //
        // Get an upper bound on the number of threads per package using
        // cpuid(1).
        //
        // On some OS/chip combinations where HT is supported by the chip
        // but is disabled, this value will be 2 on a single core chip.
        // Usually, it will be 2 if HT is enabled and 1 if HT is disabled.
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        int maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (maxThreadsPerPkg == 0) {
            maxThreadsPerPkg = 1;
        }

        //
        // The num cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // The author of cpu_count.cpp treated this as only an upper bound
        // on the number of cores, but I haven't seen any cases where it
        // was greater than the actual number of cores, so we will treat
        // it as exact in this block of code.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            nCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            nCoresPerPkg = 1;
        }

        //
        // There is no way to reliably tell if HT is enabled without issuing
        // the cpuid instruction from every thread, and correlating the cpuid
        // info, so if the machine is not affinity capable, we assume that HT
        // is off.  We have seen quite a few machines where maxThreadsPerPkg
        // is 2, yet the machine does not support HT.
        //
        // - Older OSes are usually found on machines with older chips, which
        //   do not support HT.
        //
        // - The performance penalty for mistakenly identifying a machine as
        //   HT when it isn't (which results in blocktime being incorrectly
        //   set to 0) is greater than the penalty for mistakenly identifying
        //   a machine as being 1 thread/core when it is really HT enabled
        //   (which results in blocktime being incorrectly set to a positive
        //   value).
        //
        __kmp_ncores = __kmp_xproc;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_nThreadsPerCore = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuid, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    KMP_ASSERT(oldMask != NULL);
    __kmp_get_system_affinity(oldMask, TRUE);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    // The relevant information is:
    //
    // Apic Id: Bits 24:31 of ebx after issuing cpuid(1) - each thread context
    //    has a unique Apic Id, which is of the form pkg# : core# : thread#.
    //
    // Max Threads Per Pkg: Bits 16:23 of ebx after issuing cpuid(1).  The
    //    value of this field determines the width of the core# + thread#
    //    fields in the Apic Id.  It is also an upper bound on the number
    //    of threads per package, but it has been verified that situations
    //    happen where it is not exact.  In particular, on certain OS/chip
    //    combinations where Intel(R) Hyper-Threading Technology is supported
    //    by the chip but has been disabled, the value of this field will be
    //    2 (for a single core chip).  On other OS/chip combinations
    //    supporting Intel(R) Hyper-Threading Technology, the value of this
    //    field will be 1 when Intel(R) Hyper-Threading Technology is
    //    disabled and 2 when it is enabled.
    //
    // Max Cores Per Pkg: Bits 26:31 of eax after issuing cpuid(4).  The
    //    value of this field (+1) determines the width of the core# field in
    //    the Apic Id.  The comments in "cpucount.cpp" say that this value is
    //    an upper bound, but the IA-32 architecture manual says that it is
    //    exactly the number of cores per package, and I haven't seen any
    //    case where it wasn't.
    //
    // From this information, deduce the package Id, core Id, and thread Id,
    // and set the corresponding fields in the apicThreadInfo struct.
    //
    unsigned i;
    apicThreadInfo *threadInfo = (apicThreadInfo *)__kmp_allocate(
      __kmp_avail_proc * sizeof(apicThreadInfo));
    unsigned nApics = 0;
    for (i = 0; i < KMP_CPU_SETSIZE; ++i) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(i, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT((int)nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(i);
        threadInfo[nApics].osId = i;

        //
        // The apic id and max threads per pkg come from cpuid(1).
        //
        kmp_cpuid buf;
        __kmp_x86_cpuid(1, 0, &buf);
        if (! ((buf.edx >> 9) & 1)) {
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_ApicNotPresent;
            return -1;
        }
        threadInfo[nApics].apicId = (buf.ebx >> 24) & 0xff;
        threadInfo[nApics].maxThreadsPerPkg = (buf.ebx >> 16) & 0xff;
        if (threadInfo[nApics].maxThreadsPerPkg == 0) {
            threadInfo[nApics].maxThreadsPerPkg = 1;
        }

        //
        // Max cores per pkg comes from cpuid(4).
        // 1 must be added to the encoded value.
        //
        // First, we need to check if cpuid(4) is supported on this chip.
        // To see if cpuid(n) is supported, issue cpuid(0) and check if eax
        // has the value n or greater.
        //
        __kmp_x86_cpuid(0, 0, &buf);
        if (buf.eax >= 4) {
            __kmp_x86_cpuid(4, 0, &buf);
            threadInfo[nApics].maxCoresPerPkg = ((buf.eax >> 26) & 0x3f) + 1;
        }
        else {
            threadInfo[nApics].maxCoresPerPkg = 1;
        }

        //
        // Infer the pkgId / coreId / threadId using only the info
        // obtained locally.
        //
        int widthCT = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxThreadsPerPkg);
        threadInfo[nApics].pkgId = threadInfo[nApics].apicId >> widthCT;

        int widthC = __kmp_cpuid_mask_width(
          threadInfo[nApics].maxCoresPerPkg);
        int widthT = widthCT - widthC;
        if (widthT < 0) {
            //
            // I've never seen this one happen, but I suppose it could, if
            // the cpuid instruction on a chip was really screwed up.
            // Make sure to restore the affinity mask before the tail call.
            //
            __kmp_set_system_affinity(oldMask, TRUE);
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }

        int maskC = (1 << widthC) - 1;
        threadInfo[nApics].coreId = (threadInfo[nApics].apicId >> widthT)
          & maskC;

        int maskT = (1 << widthT) - 1;
        threadInfo[nApics].threadId = threadInfo[nApics].apicId & maskT;

        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);
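
    //
    // Illustration only (hypothetical values): for a thread reporting
    // maxThreadsPerPkg == 16 and maxCoresPerPkg == 8, the widths computed in
    // the loop above are widthCT == 4, widthC == 3 and widthT == 1, so an
    // apicId of 0x2d (binary 10 110 1) decodes as
    //
    //     pkgId    = 0x2d >> 4          == 2
    //     coreId   = (0x2d >> 1) & 0x7  == 6
    //     threadId = 0x2d & 0x1         == 1
    //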

    //
    // If there's only one thread context to bind to, form an Address object
    // with depth 1 and return immediately (or, if affinity is off, set
    // address2os to NULL and return).
    //
    // If it is configured to omit the package level when there is only a
    // single package, the logic at the end of this routine won't work if
    // there is only a single thread - it would try to form an Address
    // object with depth 0.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair));
        Address addr(1);
        addr.labels[0] = threadInfo[0].pkgId;
        (*address2os)[0] = AddrUnsPair(addr, threadInfo[0].osId);

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1);
        }

        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the threadInfo table by physical Id.
    //
    qsort(threadInfo, nApics, sizeof(*threadInfo),
      __kmp_affinity_cmp_apicThreadInfo_phys_id);

    //
    // The table is now sorted by pkgId / coreId / threadId, but we really
    // don't know the radix of any of the fields.  pkgId's may be sparsely
    // assigned among the chips on a system.  Although coreId's are usually
    // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned
    // [0..threadsPerCore-1], we don't want to make any such assumptions.
    //
    // For that matter, we don't know what coresPerPkg and threadsPerCore
    // (or the total # packages) are at this point - we want to determine
    // that now.  We only have an upper bound on the first two figures.
    //
    // We also perform a consistency check at this point: the values returned
    // by the cpuid instruction for any thread bound to a given package had
    // better return the same info for maxThreadsPerPkg and maxCoresPerPkg.
    //
    nPackages = 1;
    nCoresPerPkg = 1;
    __kmp_nThreadsPerCore = 1;
    unsigned nCores = 1;

    unsigned pkgCt = 1;                         // to determine radii
    unsigned lastPkgId = threadInfo[0].pkgId;
    unsigned coreCt = 1;
    unsigned lastCoreId = threadInfo[0].coreId;
    unsigned threadCt = 1;
    unsigned lastThreadId = threadInfo[0].threadId;

    // intra-pkg consist checks
    unsigned prevMaxCoresPerPkg = threadInfo[0].maxCoresPerPkg;
    unsigned prevMaxThreadsPerPkg = threadInfo[0].maxThreadsPerPkg;

    for (i = 1; i < nApics; i++) {
        if (threadInfo[i].pkgId != lastPkgId) {
            nCores++;
            pkgCt++;
            lastPkgId = threadInfo[i].pkgId;
            if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
            coreCt = 1;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;

            //
            // This is a different package, so go on to the next iteration
            // without doing any consistency checks.  Reset the consistency
            // check vars, though.
            //
            prevMaxCoresPerPkg = threadInfo[i].maxCoresPerPkg;
            prevMaxThreadsPerPkg = threadInfo[i].maxThreadsPerPkg;
            continue;
        }

        if (threadInfo[i].coreId != lastCoreId) {
            nCores++;
            coreCt++;
            lastCoreId = threadInfo[i].coreId;
            if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;
            threadCt = 1;
            lastThreadId = threadInfo[i].threadId;
        }
        else if (threadInfo[i].threadId != lastThreadId) {
            threadCt++;
            lastThreadId = threadInfo[i].threadId;
        }
        else {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_LegacyApicIDsNotUnique;
            return -1;
        }

        //
        // Check to make certain that the maxCoresPerPkg and maxThreadsPerPkg
        // fields agree between all the threads bound to a given package.
        //
        if ((prevMaxCoresPerPkg != threadInfo[i].maxCoresPerPkg)
          || (prevMaxThreadsPerPkg != threadInfo[i].maxThreadsPerPkg)) {
            __kmp_free(threadInfo);
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }
    }
    nPackages = pkgCt;
    if ((int)coreCt > nCoresPerPkg) nCoresPerPkg = coreCt;
    if ((int)threadCt > __kmp_nThreadsPerCore) __kmp_nThreadsPerCore = threadCt;

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return now if affinity is not enabled.
    //
    __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
    __kmp_ncores = nCores;
    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuid, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (__kmp_affinity_uniform_topology()) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }
        KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(threadInfo);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Now that we've determined the number of packages, the number of cores
    // per package, and the number of threads per core, we can construct the
    // data structure that is to be returned.
    //
    int pkgLevel = 0;
    int coreLevel = (nCoresPerPkg <= 1) ? -1 : 1;
    int threadLevel = (__kmp_nThreadsPerCore <= 1) ? -1 : ((coreLevel >= 0) ? 2 : 1);
    unsigned depth = (pkgLevel >= 0) + (coreLevel >= 0) + (threadLevel >= 0);
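
    //
    // Illustration only (hypothetical counts): with nCoresPerPkg == 4 and
    // __kmp_nThreadsPerCore == 2, the levels computed above are
    // pkgLevel == 0, coreLevel == 1 and threadLevel == 2, giving depth == 3.
    // On a machine with __kmp_nThreadsPerCore == 1, threadLevel is -1 and
    // depth is 2.
    //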

    KMP_ASSERT(depth > 0);
    *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair) * nApics);

    for (i = 0; i < nApics; ++i) {
        Address addr(depth);
        unsigned os = threadInfo[i].osId;
        int d = 0;

        if (pkgLevel >= 0) {
            addr.labels[d++] = threadInfo[i].pkgId;
        }
        if (coreLevel >= 0) {
            addr.labels[d++] = threadInfo[i].coreId;
        }
        if (threadLevel >= 0) {
            addr.labels[d++] = threadInfo[i].threadId;
        }
        (*address2os)[i] = AddrUnsPair(addr, os);
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0)
          && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if ((pkgLevel >= 0) && (__kmp_affinity_gran > affinity_gran_package)) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(*address2os, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(threadInfo);
    KMP_CPU_FREE(oldMask);
    return depth;
}


//
// Intel(R) microarchitecture code name Nehalem, Dunnington and later
// architectures support a newer interface for specifying the x2APIC Ids,
// based on cpuid leaf 11.
//
static int
__kmp_affinity_create_x2apicid_map(AddrUnsPair **address2os,
  kmp_i18n_id_t *const msg_id)
{
    kmp_cpuid buf;

    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Check to see if cpuid leaf 11 is supported.
    //
    __kmp_x86_cpuid(0, 0, &buf);
    if (buf.eax < 11) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }
    __kmp_x86_cpuid(11, 0, &buf);
    if (buf.ebx == 0) {
        *msg_id = kmp_i18n_str_NoLeaf11Support;
        return -1;
    }

    //
    // Find the number of levels in the machine topology.  While we're at it,
    // get the default values for __kmp_nThreadsPerCore & nCoresPerPkg.  We
    // will try to get more accurate values later by explicitly counting
    // them, but get reasonable defaults now, in case we return early.
    //
    int level;
    int threadLevel = -1;
    int coreLevel = -1;
    int pkgLevel = -1;
    __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1;

    for (level = 0;; level++) {
        if (level > 31) {
            //
            // FIXME: Hack for DPD200163180
            //
            // If level is big then something went wrong -> exiting
            //
            // There could actually be 32 valid levels in the machine
            // topology, but so far, the only machine we have seen which does
            // not exit this loop before iteration 32 has fubar x2APIC
            // settings.
            //
            // For now, just reject this case based upon loop trip count.
            //
            *msg_id = kmp_i18n_str_InvalidCpuidInfo;
            return -1;
        }
        __kmp_x86_cpuid(11, level, &buf);
        if (buf.ebx == 0) {
            if (pkgLevel < 0) {
                //
                // Will infer nPackages from __kmp_xproc
                //
                pkgLevel = level;
                level++;
            }
            break;
        }
        int kind = (buf.ecx >> 8) & 0xff;
        if (kind == 1) {
            //
            // SMT level
            //
            threadLevel = level;
            coreLevel = -1;
            pkgLevel = -1;
            __kmp_nThreadsPerCore = buf.ebx & 0xff;
            if (__kmp_nThreadsPerCore == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else if (kind == 2) {
            //
            // core level
            //
            coreLevel = level;
            pkgLevel = -1;
            nCoresPerPkg = buf.ebx & 0xff;
            if (nCoresPerPkg == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
        else {
            if (level <= 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
            if (pkgLevel >= 0) {
                continue;
            }
            pkgLevel = level;
            nPackages = buf.ebx & 0xff;
            if (nPackages == 0) {
                *msg_id = kmp_i18n_str_InvalidCpuidInfo;
                return -1;
            }
        }
    }
    int depth = level;

    //
    // In the above loop, "level" was counted from the finest level (usually
    // thread) to the coarsest.  The caller expects that we will place the
    // labels in (*address2os)[].first.labels[] in the inverse order, so
    // we need to invert the vars saying which level means what.
    //
    if (threadLevel >= 0) {
        threadLevel = depth - threadLevel - 1;
    }
    if (coreLevel >= 0) {
        coreLevel = depth - coreLevel - 1;
    }
    KMP_DEBUG_ASSERT(pkgLevel >= 0);
    pkgLevel = depth - pkgLevel - 1;

    //
    // The algorithm used starts by setting the affinity to each available
    // thread and retrieving info from the cpuid instruction, so if we are not
    // capable of calling __kmp_affinity_get_map()/__kmp_affinity_get_map(),
    // then we need to do something else - use the defaults that we calculated
    // from issuing cpuid without binding to each proc.
    //
    if (! KMP_AFFINITY_CAPABLE())
    {
        //
        // Hack to try and infer the machine topology using only the data
        // available from cpuid on the current thread, and __kmp_xproc.
        //
        KMP_ASSERT(__kmp_affinity_type == affinity_none);

        __kmp_ncores = __kmp_xproc / __kmp_nThreadsPerCore;
        nPackages = (__kmp_xproc + nCoresPerPkg - 1) / nCoresPerPkg;
        __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);
        if (__kmp_affinity_verbose) {
            KMP_INFORM(AffNotCapableUseLocCpuidL11, "KMP_AFFINITY");
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            if (__kmp_affinity_uniform_topology()) {
                KMP_INFORM(Uniform, "KMP_AFFINITY");
            } else {
                KMP_INFORM(NonUniform, "KMP_AFFINITY");
            }
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }
        return 0;
    }

    //
    //
    // From here on, we can assume that it is safe to call
    // __kmp_get_system_affinity() and __kmp_set_system_affinity(),
    // even if __kmp_affinity_type = affinity_none.
    //

    //
    // Save the affinity mask for the current thread.
    //
    kmp_affin_mask_t *oldMask;
    KMP_CPU_ALLOC(oldMask);
    __kmp_get_system_affinity(oldMask, TRUE);
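
    //
    // Illustration only (hypothetical values): on a machine whose cpuid leaf
    // 11 reports an SMT level (kind 1, shift 1) and a core level (kind 2,
    // shift 5), the enumeration loop above gives depth == 3, and after the
    // inversion pkgLevel == 0, coreLevel == 1, threadLevel == 2.  The
    // per-proc loop below would then split an x2APIC id of 0x53 as
    //
    //     thread  = 0x53 & 0x1          == 1
    //     core    = (0x53 & 0x1f) >> 1  == 9
    //     package = 0x53 >> 5           == 2
    //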

    //
    // Allocate the data structure to be returned.
    //
    AddrUnsPair *retval = (AddrUnsPair *)
      __kmp_allocate(sizeof(AddrUnsPair) * __kmp_avail_proc);

    //
    // Run through each of the available contexts, binding the current thread
    // to it, and obtaining the pertinent information using the cpuid instr.
    //
    unsigned int proc;
    int nApics = 0;
    for (proc = 0; proc < KMP_CPU_SETSIZE; ++proc) {
        //
        // Skip this proc if it is not included in the machine model.
        //
        if (! KMP_CPU_ISSET(proc, fullMask)) {
            continue;
        }
        KMP_DEBUG_ASSERT(nApics < __kmp_avail_proc);

        __kmp_affinity_bind_thread(proc);

        //
        // Extract the labels for each level in the machine topology map
        // from the Apic ID.
        //
        Address addr(depth);
        int prev_shift = 0;

        for (level = 0; level < depth; level++) {
            __kmp_x86_cpuid(11, level, &buf);
            unsigned apicId = buf.edx;
            if (buf.ebx == 0) {
                if (level != depth - 1) {
                    KMP_CPU_FREE(oldMask);
                    *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
                    return -1;
                }
                addr.labels[depth - level - 1] = apicId >> prev_shift;
                level++;
                break;
            }
            int shift = buf.eax & 0x1f;
            int mask = (1 << shift) - 1;
            addr.labels[depth - level - 1] = (apicId & mask) >> prev_shift;
            prev_shift = shift;
        }
        if (level != depth) {
            KMP_CPU_FREE(oldMask);
            *msg_id = kmp_i18n_str_InconsistentCpuidInfo;
            return -1;
        }

        retval[nApics] = AddrUnsPair(addr, proc);
        nApics++;
    }

    //
    // We've collected all the info we need.
    // Restore the old affinity mask for this thread.
    //
    __kmp_set_system_affinity(oldMask, TRUE);

    //
    // If there's only one thread context to bind to, return now.
    //
    KMP_ASSERT(nApics > 0);
    if (nApics == 1) {
        __kmp_ncores = nPackages = 1;
        __kmp_nThreadsPerCore = nCoresPerPkg = 1;
        __kmp_ht_enabled = FALSE;
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

            KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
            if (__kmp_affinity_respect_mask) {
                KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf);
            } else {
                KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf);
            }
            KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
            KMP_INFORM(Uniform, "KMP_AFFINITY");
            KMP_INFORM(Topology, "KMP_AFFINITY", nPackages, nCoresPerPkg,
              __kmp_nThreadsPerCore, __kmp_ncores);
        }

        if (__kmp_affinity_type == affinity_none) {
            __kmp_free(retval);
            KMP_CPU_FREE(oldMask);
            return 0;
        }

        //
        // Form an Address object which only includes the package level.
        //
        Address addr(1);
        addr.labels[0] = retval[0].first.labels[pkgLevel];
        retval[0].first = addr;

        if (__kmp_affinity_gran_levels < 0) {
            __kmp_affinity_gran_levels = 0;
        }

        if (__kmp_affinity_verbose) {
            __kmp_affinity_print_topology(retval, 1, 1, 0, -1, -1);
        }

        *address2os = retval;
        KMP_CPU_FREE(oldMask);
        return 1;
    }

    //
    // Sort the table by physical Id.
    //
    qsort(retval, nApics, sizeof(*retval), __kmp_affinity_cmp_Address_labels);

    //
    // Find the radix at each of the levels.
    //
    unsigned *totals = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *counts = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *maxCt = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    unsigned *last = (unsigned *)__kmp_allocate(depth * sizeof(unsigned));
    for (level = 0; level < depth; level++) {
        totals[level] = 1;
        maxCt[level] = 1;
        counts[level] = 1;
        last[level] = retval[0].first.labels[level];
    }

    //
    // From here on, the iteration variable "level" runs from the finest
    // level to the coarsest, i.e. we iterate forward through
    // (*address2os)[].first.labels[] - in the previous loops, we iterated
    // backwards.
    //
    for (proc = 1; (int)proc < nApics; proc++) {
        int level;
        for (level = 0; level < depth; level++) {
            if (retval[proc].first.labels[level] != last[level]) {
                int j;
                for (j = level + 1; j < depth; j++) {
                    totals[j]++;
                    counts[j] = 1;
                    // The line below causes incorrect topology information to
                    // be printed in case the maximum value for some level
                    // (maxCt[level]) is encountered earlier than a smaller
                    // value while going through the array.
                    // For example, suppose pkg0 has 4 cores and pkg1 has 2
                    // cores.  Then maxCt[1] == 2 whereas it must be 4.
                    // TODO!!! Check if it can be commented safely
                    //maxCt[j] = 1;
                    last[j] = retval[proc].first.labels[j];
                }
                totals[level]++;
                counts[level]++;
                if (counts[level] > maxCt[level]) {
                    maxCt[level] = counts[level];
                }
                last[level] = retval[proc].first.labels[level];
                break;
            }
            else if (level == depth - 1) {
                __kmp_free(last);
                __kmp_free(maxCt);
                __kmp_free(counts);
                __kmp_free(totals);
                __kmp_free(retval);
                KMP_CPU_FREE(oldMask);
                *msg_id = kmp_i18n_str_x2ApicIDsNotUnique;
                return -1;
            }
        }
    }

    //
    // When affinity is off, this routine will still be called to set
    // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore,
    // nCoresPerPkg, & nPackages.  Make sure all these vars are set
    // correctly, and return if affinity is not enabled.
    //
    if (threadLevel >= 0) {
        __kmp_nThreadsPerCore = maxCt[threadLevel];
    }
    else {
        __kmp_nThreadsPerCore = 1;
    }
    __kmp_ht_enabled = (__kmp_nThreadsPerCore > 1);

    nPackages = totals[pkgLevel];

    if (coreLevel >= 0) {
        __kmp_ncores = totals[coreLevel];
        nCoresPerPkg = maxCt[coreLevel];
    }
    else {
        __kmp_ncores = nPackages;
        nCoresPerPkg = 1;
    }

    //
    // Check to see if the machine topology is uniform
    //
    unsigned prod = maxCt[0];
    for (level = 1; level < depth; level++) {
        prod *= maxCt[level];
    }
    bool uniform = (prod == totals[level - 1]);
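
    //
    // Illustration only (hypothetical counts): with maxCt == { 2, 4, 2 }
    // (packages, max cores per package, max threads per core), prod == 16.
    // If totals[depth - 1], the total number of hardware threads seen, is
    // also 16 the topology is uniform; if one package contributed only 3
    // cores, totals[depth - 1] would be 14 and the topology would be
    // reported as non-uniform.
    //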

    //
    // Print the machine topology summary.
    //
    if (__kmp_affinity_verbose) {
        char mask[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(mask, KMP_AFFIN_MASK_PRINT_LEN, oldMask);

        KMP_INFORM(AffUseGlobCpuidL11, "KMP_AFFINITY");
        if (__kmp_affinity_respect_mask) {
            KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", mask);
        } else {
            KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", mask);
        }
        KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc);
        if (uniform) {
            KMP_INFORM(Uniform, "KMP_AFFINITY");
        } else {
            KMP_INFORM(NonUniform, "KMP_AFFINITY");
        }

        kmp_str_buf_t buf;
        __kmp_str_buf_init(&buf);

        __kmp_str_buf_print(&buf, "%d", totals[0]);
        for (level = 1; level <= pkgLevel; level++) {
            __kmp_str_buf_print(&buf, " x %d", maxCt[level]);
        }
        KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, nCoresPerPkg,
          __kmp_nThreadsPerCore, __kmp_ncores);

        __kmp_str_buf_free(&buf);
    }

    if (__kmp_affinity_type == affinity_none) {
        __kmp_free(last);
        __kmp_free(maxCt);
        __kmp_free(counts);
        __kmp_free(totals);
        __kmp_free(retval);
        KMP_CPU_FREE(oldMask);
        return 0;
    }

    //
    // Find any levels with radix 1, and remove them from the map
    // (except for the package level).
    //
    int new_depth = 0;
    for (level = 0; level < depth; level++) {
        if ((maxCt[level] == 1) && (level != pkgLevel)) {
            continue;
        }
        new_depth++;
    }

    //
    // If we are removing any levels, allocate a new vector to return,
    // and copy the relevant information to it.
    //
    if (new_depth != depth) {
        AddrUnsPair *new_retval = (AddrUnsPair *)__kmp_allocate(
          sizeof(AddrUnsPair) * nApics);
        for (proc = 0; (int)proc < nApics; proc++) {
            Address addr(new_depth);
            new_retval[proc] = AddrUnsPair(addr, retval[proc].second);
        }
        int new_level = 0;
        for (level = 0; level < depth; level++) {
            if ((maxCt[level] == 1) && (level != pkgLevel)) {
                if (level == threadLevel) {
                    threadLevel = -1;
                }
                else if ((threadLevel >= 0) && (level < threadLevel)) {
                    threadLevel--;
                }
                if (level == coreLevel) {
                    coreLevel = -1;
                }
                else if ((coreLevel >= 0) && (level < coreLevel)) {
                    coreLevel--;
                }
                if (level < pkgLevel) {
                    pkgLevel--;
                }
                continue;
            }
            for (proc = 0; (int)proc < nApics; proc++) {
                new_retval[proc].first.labels[new_level]
                  = retval[proc].first.labels[level];
            }
            new_level++;
        }

        __kmp_free(retval);
        retval = new_retval;
        depth = new_depth;
    }

    if (__kmp_affinity_gran_levels < 0) {
        //
        // Set the granularity level based on what levels are modeled
        // in the machine topology map.
        //
        __kmp_affinity_gran_levels = 0;
        if ((threadLevel >= 0) && (__kmp_affinity_gran > affinity_gran_thread)) {
            __kmp_affinity_gran_levels++;
        }
        if ((coreLevel >= 0) && (__kmp_affinity_gran > affinity_gran_core)) {
            __kmp_affinity_gran_levels++;
        }
        if (__kmp_affinity_gran > affinity_gran_package) {
            __kmp_affinity_gran_levels++;
        }
    }

    if (__kmp_affinity_verbose) {
        __kmp_affinity_print_topology(retval, nApics, depth, pkgLevel,
          coreLevel, threadLevel);
    }

    __kmp_free(last);
    __kmp_free(maxCt);
    __kmp_free(counts);
    __kmp_free(totals);
    KMP_CPU_FREE(oldMask);
    *address2os = retval;
    return depth;
}


# endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */


#define osIdIndex       0
#define threadIdIndex   1
#define coreIdIndex     2
#define pkgIdIndex      3
#define nodeIdIndex     4

typedef unsigned *ProcCpuInfo;
static unsigned maxIndex = pkgIdIndex;


static int
__kmp_affinity_cmp_ProcCpuInfo_os_id(const void *a, const void *b)
{
    const unsigned *aa = (const unsigned *)a;
    const unsigned *bb = (const unsigned *)b;
    if (aa[osIdIndex] < bb[osIdIndex]) return -1;
    if (aa[osIdIndex] > bb[osIdIndex]) return 1;
    return 0;
};


static int
__kmp_affinity_cmp_ProcCpuInfo_phys_id(const void *a, const void *b)
{
    unsigned i;
    const unsigned *aa = *((const unsigned **)a);
    const unsigned *bb = *((const unsigned **)b);
    for (i = maxIndex; ; i--) {
        if (aa[i] < bb[i]) return -1;
        if (aa[i] > bb[i]) return 1;
        if (i == osIdIndex) break;
    }
    return 0;
}


//
// Parse /proc/cpuinfo (or an alternate file in the same format) to obtain the
// affinity map.
//
static int
__kmp_affinity_create_cpuinfo_map(AddrUnsPair **address2os, int *line,
  kmp_i18n_id_t *const msg_id, FILE *f)
{
    *address2os = NULL;
    *msg_id = kmp_i18n_null;

    //
    // Scan the file once, count the number of "processor" (osId) fields,
    // and find the highest value of <n> for a node_<n> field.
    //
    char buf[256];
    unsigned num_records = 0;
    while (! feof(f)) {
        buf[sizeof(buf) - 1] = 1;
        if (! fgets(buf, sizeof(buf), f)) {
            //
            // Read errors presumably because of EOF
            //
            break;
        }

        char s1[] = "processor";
        if (strncmp(buf, s1, sizeof(s1) - 1) == 0) {
            num_records++;
            continue;
        }

        //
        // FIXME - this will match "node_<n> <garbage>"
        //
        unsigned level;
        if (sscanf(buf, "node_%d id", &level) == 1) {
            if (nodeIdIndex + level >= maxIndex) {
                maxIndex = nodeIdIndex + level;
            }
            continue;
        }
    }

    //
    // Check for empty file / no valid processor records, or too many.
    // The number of records can't exceed the number of valid bits in the
    // affinity mask.
    //
    if (num_records == 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_NoProcRecords;
        return -1;
    }
    if (num_records > (unsigned)__kmp_xproc) {
        *line = 0;
        *msg_id = kmp_i18n_str_TooManyProcRecords;
        return -1;
    }
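
    //
    // For reference (values are illustrative), a processor record in
    // /proc/cpuinfo, or in an alternate file in the same format, carries the
    // fields recognized by the parser below:
    //
    //     processor       : 0
    //     physical id     : 0
    //     core id         : 0
    //
    // "thread id" and "node_<n> id" fields are also recognized when present,
    // and a blank line terminates each record.
    //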

    //
    // Set the file pointer back to the beginning, so that we can scan the
    // file again, this time performing a full parse of the data.
    // Allocate a vector of ProcCpuInfo objects, where we will place the data.
    // Adding an extra element at the end allows us to remove a lot of extra
    // checks for termination conditions.
    //
    if (fseek(f, 0, SEEK_SET) != 0) {
        *line = 0;
        *msg_id = kmp_i18n_str_CantRewindCpuinfo;
        return -1;
    }

    //
    // Allocate the array of records to store the proc info in.  The dummy
    // element at the end makes the logic in filling them out easier to code.
    //
    unsigned **threadInfo = (unsigned **)__kmp_allocate((num_records + 1)
      * sizeof(unsigned *));
    unsigned i;
    for (i = 0; i <= num_records; i++) {
        threadInfo[i] = (unsigned *)__kmp_allocate((maxIndex + 1)
          * sizeof(unsigned));
    }

#define CLEANUP_THREAD_INFO \
    for (i = 0; i <= num_records; i++) {            \
        __kmp_free(threadInfo[i]);                  \
    }                                               \
    __kmp_free(threadInfo);

    //
    // A value of UINT_MAX means that we didn't find the field
    //
    unsigned __index;

#define INIT_PROC_INFO(p) \
    for (__index = 0; __index <= maxIndex; __index++) { \
        (p)[__index] = UINT_MAX;                        \
    }

    for (i = 0; i <= num_records; i++) {
        INIT_PROC_INFO(threadInfo[i]);
    }

    unsigned num_avail = 0;
    *line = 0;
    while (! feof(f)) {
        //
        // Create an inner scoping level, so that all the goto targets at the
        // end of the loop appear in an outer scoping level.  This avoids
        // warnings about jumping past an initialization to a target in the
        // same block.
        //
        {
            buf[sizeof(buf) - 1] = 1;
            bool long_line = false;
            if (! fgets(buf, sizeof(buf), f)) {
                //
                // Read errors presumably because of EOF
                //
                // If there is valid data in threadInfo[num_avail], then fake
                // a blank line to ensure that the last address gets parsed.
                //
                bool valid = false;
                for (i = 0; i <= maxIndex; i++) {
                    if (threadInfo[num_avail][i] != UINT_MAX) {
                        valid = true;
                    }
                }
                if (! valid) {
                    break;
                }
                buf[0] = 0;
            } else if (!buf[sizeof(buf) - 1]) {
                //
                // The line is longer than the buffer.  Set a flag and don't
                // emit an error if we were going to ignore the line, anyway.
                //
1868 // 1869 long_line = true; 1870 1871 #define CHECK_LINE \ 1872 if (long_line) { \ 1873 CLEANUP_THREAD_INFO; \ 1874 *msg_id = kmp_i18n_str_LongLineCpuinfo; \ 1875 return -1; \ 1876 } 1877 } 1878 (*line)++; 1879 1880 char s1[] = "processor"; 1881 if (strncmp(buf, s1, sizeof(s1) - 1) == 0) { 1882 CHECK_LINE; 1883 char *p = strchr(buf + sizeof(s1) - 1, ':'); 1884 unsigned val; 1885 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1886 if (threadInfo[num_avail][osIdIndex] != UINT_MAX) goto dup_field; 1887 threadInfo[num_avail][osIdIndex] = val; 1888 continue; 1889 } 1890 char s2[] = "physical id"; 1891 if (strncmp(buf, s2, sizeof(s2) - 1) == 0) { 1892 CHECK_LINE; 1893 char *p = strchr(buf + sizeof(s2) - 1, ':'); 1894 unsigned val; 1895 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1896 if (threadInfo[num_avail][pkgIdIndex] != UINT_MAX) goto dup_field; 1897 threadInfo[num_avail][pkgIdIndex] = val; 1898 continue; 1899 } 1900 char s3[] = "core id"; 1901 if (strncmp(buf, s3, sizeof(s3) - 1) == 0) { 1902 CHECK_LINE; 1903 char *p = strchr(buf + sizeof(s3) - 1, ':'); 1904 unsigned val; 1905 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1906 if (threadInfo[num_avail][coreIdIndex] != UINT_MAX) goto dup_field; 1907 threadInfo[num_avail][coreIdIndex] = val; 1908 continue; 1909 } 1910 char s4[] = "thread id"; 1911 if (strncmp(buf, s4, sizeof(s4) - 1) == 0) { 1912 CHECK_LINE; 1913 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1914 unsigned val; 1915 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1916 if (threadInfo[num_avail][threadIdIndex] != UINT_MAX) goto dup_field; 1917 threadInfo[num_avail][threadIdIndex] = val; 1918 continue; 1919 } 1920 unsigned level; 1921 if (sscanf(buf, "node_%d id", &level) == 1) { 1922 CHECK_LINE; 1923 char *p = strchr(buf + sizeof(s4) - 1, ':'); 1924 unsigned val; 1925 if ((p == NULL) || (sscanf(p + 1, "%u\n", &val) != 1)) goto no_val; 1926 KMP_ASSERT(nodeIdIndex + level <= maxIndex); 1927 if (threadInfo[num_avail][nodeIdIndex + level] != UINT_MAX) goto dup_field; 1928 threadInfo[num_avail][nodeIdIndex + level] = val; 1929 continue; 1930 } 1931 1932 // 1933 // We didn't recognize the leading token on the line. 1934 // There are lots of leading tokens that we don't recognize - 1935 // if the line isn't empty, go on to the next line. 1936 // 1937 if ((*buf != 0) && (*buf != '\n')) { 1938 // 1939 // If the line is longer than the buffer, read characters 1940 // until we find a newline. 1941 // 1942 if (long_line) { 1943 int ch; 1944 while (((ch = fgetc(f)) != EOF) && (ch != '\n')); 1945 } 1946 continue; 1947 } 1948 1949 // 1950 // A newline has signalled the end of the processor record. 1951 // Check that there aren't too many procs specified. 1952 // 1953 if (num_avail == __kmp_xproc) { 1954 CLEANUP_THREAD_INFO; 1955 *msg_id = kmp_i18n_str_TooManyEntries; 1956 return -1; 1957 } 1958 1959 // 1960 // Check for missing fields. The osId field must be there, and we 1961 // currently require that the physical id field is specified, also. 1962 // 1963 if (threadInfo[num_avail][osIdIndex] == UINT_MAX) { 1964 CLEANUP_THREAD_INFO; 1965 *msg_id = kmp_i18n_str_MissingProcField; 1966 return -1; 1967 } 1968 if (threadInfo[0][pkgIdIndex] == UINT_MAX) { 1969 CLEANUP_THREAD_INFO; 1970 *msg_id = kmp_i18n_str_MissingPhysicalIDField; 1971 return -1; 1972 } 1973 1974 // 1975 // Skip this proc if it is not included in the machine model. 1976 // 1977 if (! 
KMP_CPU_ISSET(threadInfo[num_avail][osIdIndex], fullMask)) { 1978 INIT_PROC_INFO(threadInfo[num_avail]); 1979 continue; 1980 } 1981 1982 // 1983 // We have a successful parse of this proc's info. 1984 // Increment the counter, and prepare for the next proc. 1985 // 1986 num_avail++; 1987 KMP_ASSERT(num_avail <= num_records); 1988 INIT_PROC_INFO(threadInfo[num_avail]); 1989 } 1990 continue; 1991 1992 no_val: 1993 CLEANUP_THREAD_INFO; 1994 *msg_id = kmp_i18n_str_MissingValCpuinfo; 1995 return -1; 1996 1997 dup_field: 1998 CLEANUP_THREAD_INFO; 1999 *msg_id = kmp_i18n_str_DuplicateFieldCpuinfo; 2000 return -1; 2001 } 2002 *line = 0; 2003 2004 # if KMP_MIC && REDUCE_TEAM_SIZE 2005 unsigned teamSize = 0; 2006 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2007 2008 // check for num_records == __kmp_xproc ??? 2009 2010 // 2011 // If there's only one thread context to bind to, form an Address object 2012 // with depth 1 and return immediately (or, if affinity is off, set 2013 // address2os to NULL and return). 2014 // 2015 // If it is configured to omit the package level when there is only a 2016 // single package, the logic at the end of this routine won't work if 2017 // there is only a single thread - it would try to form an Address 2018 // object with depth 0. 2019 // 2020 KMP_ASSERT(num_avail > 0); 2021 KMP_ASSERT(num_avail <= num_records); 2022 if (num_avail == 1) { 2023 __kmp_ncores = 1; 2024 __kmp_nThreadsPerCore = nCoresPerPkg = nPackages = 1; 2025 __kmp_ht_enabled = FALSE; 2026 if (__kmp_affinity_verbose) { 2027 if (! KMP_AFFINITY_CAPABLE()) { 2028 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2029 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2030 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2031 } 2032 else { 2033 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2034 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, 2035 fullMask); 2036 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2037 if (__kmp_affinity_respect_mask) { 2038 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2039 } else { 2040 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2041 } 2042 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2043 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2044 } 2045 int index; 2046 kmp_str_buf_t buf; 2047 __kmp_str_buf_init(&buf); 2048 __kmp_str_buf_print(&buf, "1"); 2049 for (index = maxIndex - 1; index > pkgIdIndex; index--) { 2050 __kmp_str_buf_print(&buf, " x 1"); 2051 } 2052 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, 1, 1, 1); 2053 __kmp_str_buf_free(&buf); 2054 } 2055 2056 if (__kmp_affinity_type == affinity_none) { 2057 CLEANUP_THREAD_INFO; 2058 return 0; 2059 } 2060 2061 *address2os = (AddrUnsPair*)__kmp_allocate(sizeof(AddrUnsPair)); 2062 Address addr(1); 2063 addr.labels[0] = threadInfo[0][pkgIdIndex]; 2064 (*address2os)[0] = AddrUnsPair(addr, threadInfo[0][osIdIndex]); 2065 2066 if (__kmp_affinity_gran_levels < 0) { 2067 __kmp_affinity_gran_levels = 0; 2068 } 2069 2070 if (__kmp_affinity_verbose) { 2071 __kmp_affinity_print_topology(*address2os, 1, 1, 0, -1, -1); 2072 } 2073 2074 CLEANUP_THREAD_INFO; 2075 return 1; 2076 } 2077 2078 // 2079 // Sort the threadInfo table by physical Id. 2080 // 2081 qsort(threadInfo, num_avail, sizeof(*threadInfo), 2082 __kmp_affinity_cmp_ProcCpuInfo_phys_id); 2083 2084 // 2085 // The table is now sorted by pkgId / coreId / threadId, but we really 2086 // don't know the radix of any of the fields. pkgId's may be sparsely 2087 // assigned among the chips on a system. 
Although coreId's are usually 2088 // assigned [0 .. coresPerPkg-1] and threadId's are usually assigned 2089 // [0..threadsPerCore-1], we don't want to make any such assumptions. 2090 // 2091 // For that matter, we don't know what coresPerPkg and threadsPerCore 2092 // (or the total # packages) are at this point - we want to determine 2093 // that now. We only have an upper bound on the first two figures. 2094 // 2095 unsigned *counts = (unsigned *)__kmp_allocate((maxIndex + 1) 2096 * sizeof(unsigned)); 2097 unsigned *maxCt = (unsigned *)__kmp_allocate((maxIndex + 1) 2098 * sizeof(unsigned)); 2099 unsigned *totals = (unsigned *)__kmp_allocate((maxIndex + 1) 2100 * sizeof(unsigned)); 2101 unsigned *lastId = (unsigned *)__kmp_allocate((maxIndex + 1) 2102 * sizeof(unsigned)); 2103 2104 bool assign_thread_ids = false; 2105 unsigned threadIdCt; 2106 unsigned index; 2107 2108 restart_radix_check: 2109 threadIdCt = 0; 2110 2111 // 2112 // Initialize the counter arrays with data from threadInfo[0]. 2113 // 2114 if (assign_thread_ids) { 2115 if (threadInfo[0][threadIdIndex] == UINT_MAX) { 2116 threadInfo[0][threadIdIndex] = threadIdCt++; 2117 } 2118 else if (threadIdCt <= threadInfo[0][threadIdIndex]) { 2119 threadIdCt = threadInfo[0][threadIdIndex] + 1; 2120 } 2121 } 2122 for (index = 0; index <= maxIndex; index++) { 2123 counts[index] = 1; 2124 maxCt[index] = 1; 2125 totals[index] = 1; 2126 lastId[index] = threadInfo[0][index];; 2127 } 2128 2129 // 2130 // Run through the rest of the OS procs. 2131 // 2132 for (i = 1; i < num_avail; i++) { 2133 // 2134 // Find the most significant index whose id differs 2135 // from the id for the previous OS proc. 2136 // 2137 for (index = maxIndex; index >= threadIdIndex; index--) { 2138 if (assign_thread_ids && (index == threadIdIndex)) { 2139 // 2140 // Auto-assign the thread id field if it wasn't specified. 2141 // 2142 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2143 threadInfo[i][threadIdIndex] = threadIdCt++; 2144 } 2145 2146 // 2147 // Aparrently the thread id field was specified for some 2148 // entries and not others. Start the thread id counter 2149 // off at the next higher thread id. 2150 // 2151 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2152 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2153 } 2154 } 2155 if (threadInfo[i][index] != lastId[index]) { 2156 // 2157 // Run through all indices which are less significant, 2158 // and reset the counts to 1. 2159 // 2160 // At all levels up to and including index, we need to 2161 // increment the totals and record the last id. 2162 // 2163 unsigned index2; 2164 for (index2 = threadIdIndex; index2 < index; index2++) { 2165 totals[index2]++; 2166 if (counts[index2] > maxCt[index2]) { 2167 maxCt[index2] = counts[index2]; 2168 } 2169 counts[index2] = 1; 2170 lastId[index2] = threadInfo[i][index2]; 2171 } 2172 counts[index]++; 2173 totals[index]++; 2174 lastId[index] = threadInfo[i][index]; 2175 2176 if (assign_thread_ids && (index > threadIdIndex)) { 2177 2178 # if KMP_MIC && REDUCE_TEAM_SIZE 2179 // 2180 // The default team size is the total #threads in the machine 2181 // minus 1 thread for every core that has 3 or more threads. 2182 // 2183 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2184 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2185 2186 // 2187 // Restart the thread counter, as we are on a new core. 2188 // 2189 threadIdCt = 0; 2190 2191 // 2192 // Auto-assign the thread id field if it wasn't specified. 
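    // This repeats the auto-assignment logic used earlier; threadIdCt was
    // reset to zero just above because a new core has been entered.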
2193 // 2194 if (threadInfo[i][threadIdIndex] == UINT_MAX) { 2195 threadInfo[i][threadIdIndex] = threadIdCt++; 2196 } 2197 2198 // 2199 // Apparently the thread id field was specified for some 2200 // entries and not others. Start the thread id counter 2201 // off at the next higher thread id. 2202 // 2203 else if (threadIdCt <= threadInfo[i][threadIdIndex]) { 2204 threadIdCt = threadInfo[i][threadIdIndex] + 1; 2205 } 2206 } 2207 break; 2208 } 2209 } 2210 if (index < threadIdIndex) { 2211 // 2212 // If thread ids were specified, it is an error if they are not 2213 // unique. Also, check that we haven't already restarted the 2214 // loop (to be safe - shouldn't need to). 2215 // 2216 if ((threadInfo[i][threadIdIndex] != UINT_MAX) 2217 || assign_thread_ids) { 2218 __kmp_free(lastId); 2219 __kmp_free(totals); 2220 __kmp_free(maxCt); 2221 __kmp_free(counts); 2222 CLEANUP_THREAD_INFO; 2223 *msg_id = kmp_i18n_str_PhysicalIDsNotUnique; 2224 return -1; 2225 } 2226 2227 // 2228 // If the thread ids were not specified and we see 2229 // entries that are duplicates, start the loop over and 2230 // assign the thread ids manually. 2231 // 2232 assign_thread_ids = true; 2233 goto restart_radix_check; 2234 } 2235 } 2236 2237 # if KMP_MIC && REDUCE_TEAM_SIZE 2238 // 2239 // The default team size is the total #threads in the machine 2240 // minus 1 thread for every core that has 3 or more threads. 2241 // 2242 teamSize += ( threadIdCt <= 2 ) ? ( threadIdCt ) : ( threadIdCt - 1 ); 2243 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2244 2245 for (index = threadIdIndex; index <= maxIndex; index++) { 2246 if (counts[index] > maxCt[index]) { 2247 maxCt[index] = counts[index]; 2248 } 2249 } 2250 2251 __kmp_nThreadsPerCore = maxCt[threadIdIndex]; 2252 nCoresPerPkg = maxCt[coreIdIndex]; 2253 nPackages = totals[pkgIdIndex]; 2254 2255 // 2256 // Check to see if the machine topology is uniform 2257 // 2258 unsigned prod = totals[maxIndex]; 2259 for (index = threadIdIndex; index < maxIndex; index++) { 2260 prod *= maxCt[index]; 2261 } 2262 bool uniform = (prod == totals[threadIdIndex]); 2263 2264 // 2265 // When affinity is off, this routine will still be called to set 2266 // __kmp_ht_enabled, & __kmp_ncores, as well as __kmp_nThreadsPerCore, 2267 // nCoresPerPkg, & nPackages. Make sure all these vars are set 2268 // correctly, and return now if affinity is not enabled. 2269 // 2270 __kmp_ht_enabled = (maxCt[threadIdIndex] > 1); // threads per core > 1 2271 __kmp_ncores = totals[coreIdIndex]; 2272 2273 if (__kmp_affinity_verbose) { 2274 if (!
KMP_AFFINITY_CAPABLE()) { 2275 KMP_INFORM(AffNotCapableUseCpuinfo, "KMP_AFFINITY"); 2276 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2277 if (uniform) { 2278 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2279 } else { 2280 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2281 } 2282 } 2283 else { 2284 char buf[KMP_AFFIN_MASK_PRINT_LEN]; 2285 __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, fullMask); 2286 KMP_INFORM(AffCapableUseCpuinfo, "KMP_AFFINITY"); 2287 if (__kmp_affinity_respect_mask) { 2288 KMP_INFORM(InitOSProcSetRespect, "KMP_AFFINITY", buf); 2289 } else { 2290 KMP_INFORM(InitOSProcSetNotRespect, "KMP_AFFINITY", buf); 2291 } 2292 KMP_INFORM(AvailableOSProc, "KMP_AFFINITY", __kmp_avail_proc); 2293 if (uniform) { 2294 KMP_INFORM(Uniform, "KMP_AFFINITY"); 2295 } else { 2296 KMP_INFORM(NonUniform, "KMP_AFFINITY"); 2297 } 2298 } 2299 kmp_str_buf_t buf; 2300 __kmp_str_buf_init(&buf); 2301 2302 __kmp_str_buf_print(&buf, "%d", totals[maxIndex]); 2303 for (index = maxIndex - 1; index >= pkgIdIndex; index--) { 2304 __kmp_str_buf_print(&buf, " x %d", maxCt[index]); 2305 } 2306 KMP_INFORM(TopologyExtra, "KMP_AFFINITY", buf.str, maxCt[coreIdIndex], 2307 maxCt[threadIdIndex], __kmp_ncores); 2308 2309 __kmp_str_buf_free(&buf); 2310 } 2311 2312 # if KMP_MIC && REDUCE_TEAM_SIZE 2313 // 2314 // Set the default team size. 2315 // 2316 if ((__kmp_dflt_team_nth == 0) && (teamSize > 0)) { 2317 __kmp_dflt_team_nth = teamSize; 2318 KA_TRACE(20, ("__kmp_affinity_create_cpuinfo_map: setting __kmp_dflt_team_nth = %d\n", 2319 __kmp_dflt_team_nth)); 2320 } 2321 # endif // KMP_MIC && REDUCE_TEAM_SIZE 2322 2323 if (__kmp_affinity_type == affinity_none) { 2324 __kmp_free(lastId); 2325 __kmp_free(totals); 2326 __kmp_free(maxCt); 2327 __kmp_free(counts); 2328 CLEANUP_THREAD_INFO; 2329 return 0; 2330 } 2331 2332 // 2333 // Count the number of levels which have more nodes at that level than 2334 // at the parent's level (with there being an implicit root node of 2335 // the top level). This is equivalent to saying that there is at least 2336 // one node at this level which has a sibling. These levels are in the 2337 // map, and the package level is always in the map. 2338 // 2339 bool *inMap = (bool *)__kmp_allocate((maxIndex + 1) * sizeof(bool)); 2340 int level = 0; 2341 for (index = threadIdIndex; index < maxIndex; index++) { 2342 KMP_ASSERT(totals[index] >= totals[index + 1]); 2343 inMap[index] = (totals[index] > totals[index + 1]); 2344 } 2345 inMap[maxIndex] = (totals[maxIndex] > 1); 2346 inMap[pkgIdIndex] = true; 2347 2348 int depth = 0; 2349 for (index = threadIdIndex; index <= maxIndex; index++) { 2350 if (inMap[index]) { 2351 depth++; 2352 } 2353 } 2354 KMP_ASSERT(depth > 0); 2355 2356 // 2357 // Construct the data structure that is to be returned. 2358 // 2359 *address2os = (AddrUnsPair*) 2360 __kmp_allocate(sizeof(AddrUnsPair) * num_avail); 2361 int pkgLevel = -1; 2362 int coreLevel = -1; 2363 int threadLevel = -1; 2364 2365 for (i = 0; i < num_avail; ++i) { 2366 Address addr(depth); 2367 unsigned os = threadInfo[i][osIdIndex]; 2368 int src_index; 2369 int dst_index = 0; 2370 2371 for (src_index = maxIndex; src_index >= threadIdIndex; src_index--) { 2372 if (! 
inMap[src_index]) { 2373 continue; 2374 } 2375 addr.labels[dst_index] = threadInfo[i][src_index]; 2376 if (src_index == pkgIdIndex) { 2377 pkgLevel = dst_index; 2378 } 2379 else if (src_index == coreIdIndex) { 2380 coreLevel = dst_index; 2381 } 2382 else if (src_index == threadIdIndex) { 2383 threadLevel = dst_index; 2384 } 2385 dst_index++; 2386 } 2387 (*address2os)[i] = AddrUnsPair(addr, os); 2388 } 2389 2390 if (__kmp_affinity_gran_levels < 0) { 2391 // 2392 // Set the granularity level based on what levels are modeled 2393 // in the machine topology map. 2394 // 2395 unsigned src_index; 2396 __kmp_affinity_gran_levels = 0; 2397 for (src_index = threadIdIndex; src_index <= maxIndex; src_index++) { 2398 if (! inMap[src_index]) { 2399 continue; 2400 } 2401 switch (src_index) { 2402 case threadIdIndex: 2403 if (__kmp_affinity_gran > affinity_gran_thread) { 2404 __kmp_affinity_gran_levels++; 2405 } 2406 2407 break; 2408 case coreIdIndex: 2409 if (__kmp_affinity_gran > affinity_gran_core) { 2410 __kmp_affinity_gran_levels++; 2411 } 2412 break; 2413 2414 case pkgIdIndex: 2415 if (__kmp_affinity_gran > affinity_gran_package) { 2416 __kmp_affinity_gran_levels++; 2417 } 2418 break; 2419 } 2420 } 2421 } 2422 2423 if (__kmp_affinity_verbose) { 2424 __kmp_affinity_print_topology(*address2os, num_avail, depth, pkgLevel, 2425 coreLevel, threadLevel); 2426 } 2427 2428 __kmp_free(inMap); 2429 __kmp_free(lastId); 2430 __kmp_free(totals); 2431 __kmp_free(maxCt); 2432 __kmp_free(counts); 2433 CLEANUP_THREAD_INFO; 2434 return depth; 2435 } 2436 2437 2438 // 2439 // Create and return a table of affinity masks, indexed by OS thread ID. 2440 // This routine handles OR'ing together all the affinity masks of threads 2441 // that are sufficiently close, if granularity > fine. 2442 // 2443 static kmp_affin_mask_t * 2444 __kmp_create_masks(unsigned *maxIndex, unsigned *numUnique, 2445 AddrUnsPair *address2os, unsigned numAddrs) 2446 { 2447 // 2448 // First form a table of affinity masks in order of OS thread id. 2449 // 2450 unsigned depth; 2451 unsigned maxOsId; 2452 unsigned i; 2453 2454 KMP_ASSERT(numAddrs > 0); 2455 depth = address2os[0].first.depth; 2456 2457 maxOsId = 0; 2458 for (i = 0; i < numAddrs; i++) { 2459 unsigned osId = address2os[i].second; 2460 if (osId > maxOsId) { 2461 maxOsId = osId; 2462 } 2463 } 2464 kmp_affin_mask_t *osId2Mask = (kmp_affin_mask_t *)__kmp_allocate( 2465 (maxOsId + 1) * __kmp_affin_mask_size); 2466 2467 // 2468 // Sort the address2os table according to physical order. Doing so 2469 // will put all threads on the same core/package/node in consecutive 2470 // locations. 2471 // 2472 qsort(address2os, numAddrs, sizeof(*address2os), 2473 __kmp_affinity_cmp_Address_labels); 2474 2475 KMP_ASSERT(__kmp_affinity_gran_levels >= 0); 2476 if (__kmp_affinity_verbose && (__kmp_affinity_gran_levels > 0)) { 2477 KMP_INFORM(ThreadsMigrate, "KMP_AFFINITY", __kmp_affinity_gran_levels); 2478 } 2479 if (__kmp_affinity_gran_levels >= (int)depth) { 2480 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2481 && (__kmp_affinity_type != affinity_none))) { 2482 KMP_WARNING(AffThreadsMayMigrate); 2483 } 2484 } 2485 2486 // 2487 // Run through the table, forming the masks for all threads on each 2488 // core. Threads on the same core will have identical "Address" 2489 // objects, not considering the last level, which must be the thread 2490 // id. All threads on a core will appear consecutively. 
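    //
    // For example, with a granularity of "core" on a machine with two hw
    // threads per core, the two OS procs of a core compare as close (their
    // addresses differ only in the trailing thread-level label), so both of
    // their entries in osId2Mask end up holding the same two-bit mask.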
2491 // 2492 unsigned unique = 0; 2493 unsigned j = 0; // index of 1st thread on core 2494 unsigned leader = 0; 2495 Address *leaderAddr = &(address2os[0].first); 2496 kmp_affin_mask_t *sum 2497 = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size); 2498 KMP_CPU_ZERO(sum); 2499 KMP_CPU_SET(address2os[0].second, sum); 2500 for (i = 1; i < numAddrs; i++) { 2501 // 2502 // If this thread is sufficiently close to the leader (withing the 2503 // granularity setting), then set the bit for this os thread in the 2504 // affinity mask for this group, and go on to the next thread. 2505 // 2506 if (leaderAddr->isClose(address2os[i].first, 2507 __kmp_affinity_gran_levels)) { 2508 KMP_CPU_SET(address2os[i].second, sum); 2509 continue; 2510 } 2511 2512 // 2513 // For every thread in this group, copy the mask to the thread's 2514 // entry in the osId2Mask table. Mark the first address as a 2515 // leader. 2516 // 2517 for (; j < i; j++) { 2518 unsigned osId = address2os[j].second; 2519 KMP_DEBUG_ASSERT(osId <= maxOsId); 2520 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2521 KMP_CPU_COPY(mask, sum); 2522 address2os[j].first.leader = (j == leader); 2523 } 2524 unique++; 2525 2526 // 2527 // Start a new mask. 2528 // 2529 leader = i; 2530 leaderAddr = &(address2os[i].first); 2531 KMP_CPU_ZERO(sum); 2532 KMP_CPU_SET(address2os[i].second, sum); 2533 } 2534 2535 // 2536 // For every thread in last group, copy the mask to the thread's 2537 // entry in the osId2Mask table. 2538 // 2539 for (; j < i; j++) { 2540 unsigned osId = address2os[j].second; 2541 KMP_DEBUG_ASSERT(osId <= maxOsId); 2542 kmp_affin_mask_t *mask = KMP_CPU_INDEX(osId2Mask, osId); 2543 KMP_CPU_COPY(mask, sum); 2544 address2os[j].first.leader = (j == leader); 2545 } 2546 unique++; 2547 2548 *maxIndex = maxOsId; 2549 *numUnique = unique; 2550 return osId2Mask; 2551 } 2552 2553 2554 // 2555 // Stuff for the affinity proclist parsers. It's easier to declare these vars 2556 // as file-static than to try and pass them through the calling sequence of 2557 // the recursive-descent OMP_PLACES parser. 2558 // 2559 static kmp_affin_mask_t *newMasks; 2560 static int numNewMasks; 2561 static int nextNewMask; 2562 2563 #define ADD_MASK(_mask) \ 2564 { \ 2565 if (nextNewMask >= numNewMasks) { \ 2566 numNewMasks *= 2; \ 2567 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_REALLOC(newMasks, \ 2568 numNewMasks * __kmp_affin_mask_size); \ 2569 } \ 2570 KMP_CPU_COPY(KMP_CPU_INDEX(newMasks, nextNewMask), (_mask)); \ 2571 nextNewMask++; \ 2572 } 2573 2574 #define ADD_MASK_OSID(_osId,_osId2Mask,_maxOsId) \ 2575 { \ 2576 if (((_osId) > _maxOsId) || \ 2577 (! KMP_CPU_ISSET((_osId), KMP_CPU_INDEX(_osId2Mask, (_osId))))) {\ 2578 if (__kmp_affinity_verbose || (__kmp_affinity_warnings \ 2579 && (__kmp_affinity_type != affinity_none))) { \ 2580 KMP_WARNING(AffIgnoreInvalidProcID, _osId); \ 2581 } \ 2582 } \ 2583 else { \ 2584 ADD_MASK(KMP_CPU_INDEX(_osId2Mask, (_osId))); \ 2585 } \ 2586 } 2587 2588 2589 // 2590 // Re-parse the proclist (for the explicit affinity type), and form the list 2591 // of affinity newMasks indexed by gtid. 2592 // 2593 static void 2594 __kmp_affinity_process_proclist(kmp_affin_mask_t **out_masks, 2595 unsigned int *out_numMasks, const char *proclist, 2596 kmp_affin_mask_t *osId2Mask, int maxOsId) 2597 { 2598 const char *scan = proclist; 2599 const char *next = proclist; 2600 2601 // 2602 // We use malloc() for the temporary mask vector, 2603 // so that we can use realloc() to extend it. 
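    //
    // A proclist is a comma separated sequence of singletons ("3"), ranges
    // with an optional signed stride ("5-9", "16-8:-2"), and brace-enclosed
    // sets ("{0,1,2,3}") whose members are OR'd into a single mask.  For
    // example, "0,4-12:4,{15,16}" yields one mask for each of procs 0, 4, 8
    // and 12, plus one mask covering procs 15 and 16 together.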
2604 // 2605 numNewMasks = 2; 2606 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 2607 * __kmp_affin_mask_size); 2608 nextNewMask = 0; 2609 kmp_affin_mask_t *sumMask = (kmp_affin_mask_t *)__kmp_allocate( 2610 __kmp_affin_mask_size); 2611 int setSize = 0; 2612 2613 for (;;) { 2614 int start, end, stride; 2615 2616 SKIP_WS(scan); 2617 next = scan; 2618 if (*next == '\0') { 2619 break; 2620 } 2621 2622 if (*next == '{') { 2623 int num; 2624 setSize = 0; 2625 next++; // skip '{' 2626 SKIP_WS(next); 2627 scan = next; 2628 2629 // 2630 // Read the first integer in the set. 2631 // 2632 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2633 "bad proclist"); 2634 SKIP_DIGITS(next); 2635 num = __kmp_str_to_int(scan, *next); 2636 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2637 2638 // 2639 // Copy the mask for that osId to the sum (union) mask. 2640 // 2641 if ((num > maxOsId) || 2642 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2643 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2644 && (__kmp_affinity_type != affinity_none))) { 2645 KMP_WARNING(AffIgnoreInvalidProcID, num); 2646 } 2647 KMP_CPU_ZERO(sumMask); 2648 } 2649 else { 2650 KMP_CPU_COPY(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2651 setSize = 1; 2652 } 2653 2654 for (;;) { 2655 // 2656 // Check for end of set. 2657 // 2658 SKIP_WS(next); 2659 if (*next == '}') { 2660 next++; // skip '}' 2661 break; 2662 } 2663 2664 // 2665 // Skip optional comma. 2666 // 2667 if (*next == ',') { 2668 next++; 2669 } 2670 SKIP_WS(next); 2671 2672 // 2673 // Read the next integer in the set. 2674 // 2675 scan = next; 2676 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2677 "bad explicit proc list"); 2678 2679 SKIP_DIGITS(next); 2680 num = __kmp_str_to_int(scan, *next); 2681 KMP_ASSERT2(num >= 0, "bad explicit proc list"); 2682 2683 // 2684 // Add the mask for that osId to the sum mask. 2685 // 2686 if ((num > maxOsId) || 2687 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 2688 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2689 && (__kmp_affinity_type != affinity_none))) { 2690 KMP_WARNING(AffIgnoreInvalidProcID, num); 2691 } 2692 } 2693 else { 2694 KMP_CPU_UNION(sumMask, KMP_CPU_INDEX(osId2Mask, num)); 2695 setSize++; 2696 } 2697 } 2698 if (setSize > 0) { 2699 ADD_MASK(sumMask); 2700 } 2701 2702 SKIP_WS(next); 2703 if (*next == ',') { 2704 next++; 2705 } 2706 scan = next; 2707 continue; 2708 } 2709 2710 // 2711 // Read the first integer. 2712 // 2713 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2714 SKIP_DIGITS(next); 2715 start = __kmp_str_to_int(scan, *next); 2716 KMP_ASSERT2(start >= 0, "bad explicit proc list"); 2717 SKIP_WS(next); 2718 2719 // 2720 // If this isn't a range, then add a mask to the list and go on. 2721 // 2722 if (*next != '-') { 2723 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2724 2725 // 2726 // Skip optional comma. 2727 // 2728 if (*next == ',') { 2729 next++; 2730 } 2731 scan = next; 2732 continue; 2733 } 2734 2735 // 2736 // This is a range. Skip over the '-' and read in the 2nd int. 2737 // 2738 next++; // skip '-' 2739 SKIP_WS(next); 2740 scan = next; 2741 KMP_ASSERT2((*next >= '0') && (*next <= '9'), "bad explicit proc list"); 2742 SKIP_DIGITS(next); 2743 end = __kmp_str_to_int(scan, *next); 2744 KMP_ASSERT2(end >= 0, "bad explicit proc list"); 2745 2746 // 2747 // Check for a stride parameter 2748 // 2749 stride = 1; 2750 SKIP_WS(next); 2751 if (*next == ':') { 2752 // 2753 // A stride is specified. Skip over the ':" and read the 3rd int. 
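    //
    // For example, "0-12:4" expands to OS procs 0, 4, 8 and 12, while a
    // negative stride such as "12-0:-4" walks the same range downwards.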
2754 // 2755 int sign = +1; 2756 next++; // skip ':' 2757 SKIP_WS(next); 2758 scan = next; 2759 if (*next == '-') { 2760 sign = -1; 2761 next++; 2762 SKIP_WS(next); 2763 scan = next; 2764 } 2765 KMP_ASSERT2((*next >= '0') && (*next <= '9'), 2766 "bad explicit proc list"); 2767 SKIP_DIGITS(next); 2768 stride = __kmp_str_to_int(scan, *next); 2769 KMP_ASSERT2(stride >= 0, "bad explicit proc list"); 2770 stride *= sign; 2771 } 2772 2773 // 2774 // Do some range checks. 2775 // 2776 KMP_ASSERT2(stride != 0, "bad explicit proc list"); 2777 if (stride > 0) { 2778 KMP_ASSERT2(start <= end, "bad explicit proc list"); 2779 } 2780 else { 2781 KMP_ASSERT2(start >= end, "bad explicit proc list"); 2782 } 2783 KMP_ASSERT2((end - start) / stride <= 65536, "bad explicit proc list"); 2784 2785 // 2786 // Add the mask for each OS proc # to the list. 2787 // 2788 if (stride > 0) { 2789 do { 2790 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2791 start += stride; 2792 } while (start <= end); 2793 } 2794 else { 2795 do { 2796 ADD_MASK_OSID(start, osId2Mask, maxOsId); 2797 start += stride; 2798 } while (start >= end); 2799 } 2800 2801 // 2802 // Skip optional comma. 2803 // 2804 SKIP_WS(next); 2805 if (*next == ',') { 2806 next++; 2807 } 2808 scan = next; 2809 } 2810 2811 *out_numMasks = nextNewMask; 2812 if (nextNewMask == 0) { 2813 *out_masks = NULL; 2814 KMP_INTERNAL_FREE(newMasks); 2815 return; 2816 } 2817 *out_masks 2818 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 2819 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 2820 __kmp_free(sumMask); 2821 KMP_INTERNAL_FREE(newMasks); 2822 } 2823 2824 2825 # if OMP_40_ENABLED 2826 2827 /*----------------------------------------------------------------------------- 2828 2829 Re-parse the OMP_PLACES proc id list, forming the newMasks for the different 2830 places. Again, Here is the grammar: 2831 2832 place_list := place 2833 place_list := place , place_list 2834 place := num 2835 place := place : num 2836 place := place : num : signed 2837 place := { subplacelist } 2838 place := ! place // (lowest priority) 2839 subplace_list := subplace 2840 subplace_list := subplace , subplace_list 2841 subplace := num 2842 subplace := num : num 2843 subplace := num : num : signed 2844 signed := num 2845 signed := + signed 2846 signed := - signed 2847 2848 -----------------------------------------------------------------------------*/ 2849 2850 static void 2851 __kmp_process_subplace_list(const char **scan, kmp_affin_mask_t *osId2Mask, 2852 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2853 { 2854 const char *next; 2855 2856 for (;;) { 2857 int start, count, stride, i; 2858 2859 // 2860 // Read in the starting proc id 2861 // 2862 SKIP_WS(*scan); 2863 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2864 "bad explicit places list"); 2865 next = *scan; 2866 SKIP_DIGITS(next); 2867 start = __kmp_str_to_int(*scan, *next); 2868 KMP_ASSERT(start >= 0); 2869 *scan = next; 2870 2871 // 2872 // valid follow sets are ',' ':' and '}' 2873 // 2874 SKIP_WS(*scan); 2875 if (**scan == '}' || **scan == ',') { 2876 if ((start > maxOsId) || 2877 (! 
KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2878 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2879 && (__kmp_affinity_type != affinity_none))) { 2880 KMP_WARNING(AffIgnoreInvalidProcID, start); 2881 } 2882 } 2883 else { 2884 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2885 (*setSize)++; 2886 } 2887 if (**scan == '}') { 2888 break; 2889 } 2890 (*scan)++; // skip ',' 2891 continue; 2892 } 2893 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2894 (*scan)++; // skip ':' 2895 2896 // 2897 // Read count parameter 2898 // 2899 SKIP_WS(*scan); 2900 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2901 "bad explicit places list"); 2902 next = *scan; 2903 SKIP_DIGITS(next); 2904 count = __kmp_str_to_int(*scan, *next); 2905 KMP_ASSERT(count >= 0); 2906 *scan = next; 2907 2908 // 2909 // valid follow sets are ',' ':' and '}' 2910 // 2911 SKIP_WS(*scan); 2912 if (**scan == '}' || **scan == ',') { 2913 for (i = 0; i < count; i++) { 2914 if ((start > maxOsId) || 2915 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2916 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2917 && (__kmp_affinity_type != affinity_none))) { 2918 KMP_WARNING(AffIgnoreInvalidProcID, start); 2919 } 2920 break; // don't proliferate warnings for large count 2921 } 2922 else { 2923 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2924 start++; 2925 (*setSize)++; 2926 } 2927 } 2928 if (**scan == '}') { 2929 break; 2930 } 2931 (*scan)++; // skip ',' 2932 continue; 2933 } 2934 KMP_ASSERT2(**scan == ':', "bad explicit places list"); 2935 (*scan)++; // skip ':' 2936 2937 // 2938 // Read stride parameter 2939 // 2940 int sign = +1; 2941 for (;;) { 2942 SKIP_WS(*scan); 2943 if (**scan == '+') { 2944 (*scan)++; // skip '+' 2945 continue; 2946 } 2947 if (**scan == '-') { 2948 sign *= -1; 2949 (*scan)++; // skip '-' 2950 continue; 2951 } 2952 break; 2953 } 2954 SKIP_WS(*scan); 2955 KMP_ASSERT2((**scan >= '0') && (**scan <= '9'), 2956 "bad explicit places list"); 2957 next = *scan; 2958 SKIP_DIGITS(next); 2959 stride = __kmp_str_to_int(*scan, *next); 2960 KMP_ASSERT(stride >= 0); 2961 *scan = next; 2962 stride *= sign; 2963 2964 // 2965 // valid follow sets are ',' and '}' 2966 // 2967 SKIP_WS(*scan); 2968 if (**scan == '}' || **scan == ',') { 2969 for (i = 0; i < count; i++) { 2970 if ((start > maxOsId) || 2971 (! KMP_CPU_ISSET(start, KMP_CPU_INDEX(osId2Mask, start)))) { 2972 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 2973 && (__kmp_affinity_type != affinity_none))) { 2974 KMP_WARNING(AffIgnoreInvalidProcID, start); 2975 } 2976 break; // don't proliferate warnings for large count 2977 } 2978 else { 2979 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, start)); 2980 start += stride; 2981 (*setSize)++; 2982 } 2983 } 2984 if (**scan == '}') { 2985 break; 2986 } 2987 (*scan)++; // skip ',' 2988 continue; 2989 } 2990 2991 KMP_ASSERT2(0, "bad explicit places list"); 2992 } 2993 } 2994 2995 2996 static void 2997 __kmp_process_place(const char **scan, kmp_affin_mask_t *osId2Mask, 2998 int maxOsId, kmp_affin_mask_t *tempMask, int *setSize) 2999 { 3000 const char *next; 3001 3002 // 3003 // valid follow sets are '{' '!' 
and num 3004 // 3005 SKIP_WS(*scan); 3006 if (**scan == '{') { 3007 (*scan)++; // skip '{' 3008 __kmp_process_subplace_list(scan, osId2Mask, maxOsId , tempMask, 3009 setSize); 3010 KMP_ASSERT2(**scan == '}', "bad explicit places list"); 3011 (*scan)++; // skip '}' 3012 } 3013 else if (**scan == '!') { 3014 __kmp_process_place(scan, osId2Mask, maxOsId, tempMask, setSize); 3015 KMP_CPU_COMPLEMENT(tempMask); 3016 (*scan)++; // skip '!' 3017 } 3018 else if ((**scan >= '0') && (**scan <= '9')) { 3019 next = *scan; 3020 SKIP_DIGITS(next); 3021 int num = __kmp_str_to_int(*scan, *next); 3022 KMP_ASSERT(num >= 0); 3023 if ((num > maxOsId) || 3024 (! KMP_CPU_ISSET(num, KMP_CPU_INDEX(osId2Mask, num)))) { 3025 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3026 && (__kmp_affinity_type != affinity_none))) { 3027 KMP_WARNING(AffIgnoreInvalidProcID, num); 3028 } 3029 } 3030 else { 3031 KMP_CPU_UNION(tempMask, KMP_CPU_INDEX(osId2Mask, num)); 3032 (*setSize)++; 3033 } 3034 *scan = next; // skip num 3035 } 3036 else { 3037 KMP_ASSERT2(0, "bad explicit places list"); 3038 } 3039 } 3040 3041 3042 static void 3043 __kmp_affinity_process_placelist(kmp_affin_mask_t **out_masks, 3044 unsigned int *out_numMasks, const char *placelist, 3045 kmp_affin_mask_t *osId2Mask, int maxOsId) 3046 { 3047 const char *scan = placelist; 3048 const char *next = placelist; 3049 3050 numNewMasks = 2; 3051 newMasks = (kmp_affin_mask_t *)KMP_INTERNAL_MALLOC(numNewMasks 3052 * __kmp_affin_mask_size); 3053 nextNewMask = 0; 3054 3055 kmp_affin_mask_t *tempMask = (kmp_affin_mask_t *)__kmp_allocate( 3056 __kmp_affin_mask_size); 3057 KMP_CPU_ZERO(tempMask); 3058 int setSize = 0; 3059 3060 for (;;) { 3061 int start, count, stride; 3062 3063 __kmp_process_place(&scan, osId2Mask, maxOsId, tempMask, &setSize); 3064 3065 // 3066 // valid follow sets are ',' ':' and EOL 3067 // 3068 SKIP_WS(scan); 3069 if (*scan == '\0' || *scan == ',') { 3070 if (setSize > 0) { 3071 ADD_MASK(tempMask); 3072 } 3073 KMP_CPU_ZERO(tempMask); 3074 setSize = 0; 3075 if (*scan == '\0') { 3076 break; 3077 } 3078 scan++; // skip ',' 3079 continue; 3080 } 3081 3082 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3083 scan++; // skip ':' 3084 3085 // 3086 // Read count parameter 3087 // 3088 SKIP_WS(scan); 3089 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3090 "bad explicit places list"); 3091 next = scan; 3092 SKIP_DIGITS(next); 3093 count = __kmp_str_to_int(scan, *next); 3094 KMP_ASSERT(count >= 0); 3095 scan = next; 3096 3097 // 3098 // valid follow sets are ',' ':' and EOL 3099 // 3100 SKIP_WS(scan); 3101 if (*scan == '\0' || *scan == ',') { 3102 int i; 3103 for (i = 0; i < count; i++) { 3104 int j; 3105 if (setSize == 0) { 3106 break; 3107 } 3108 ADD_MASK(tempMask); 3109 setSize = 0; 3110 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j > 0; j--) { 3111 // 3112 // Use a temp var in case macro is changed to evaluate 3113 // args multiple times. 
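    //
    // Note: on this path (a count with no stride clause) `stride` has not
    // been assigned in this loop iteration; the shift below presumably
    // intends a stride of one place.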
3114 // 3115 if (KMP_CPU_ISSET(j - stride, tempMask)) { 3116 KMP_CPU_SET(j, tempMask); 3117 setSize++; 3118 } 3119 else { 3120 KMP_CPU_CLR(j, tempMask); 3121 } 3122 } 3123 for (; j >= 0; j--) { 3124 KMP_CPU_CLR(j, tempMask); 3125 } 3126 } 3127 KMP_CPU_ZERO(tempMask); 3128 setSize = 0; 3129 3130 if (*scan == '\0') { 3131 break; 3132 } 3133 scan++; // skip ',' 3134 continue; 3135 } 3136 3137 KMP_ASSERT2(*scan == ':', "bad explicit places list"); 3138 scan++; // skip ':' 3139 3140 // 3141 // Read stride parameter 3142 // 3143 int sign = +1; 3144 for (;;) { 3145 SKIP_WS(scan); 3146 if (*scan == '+') { 3147 scan++; // skip '+' 3148 continue; 3149 } 3150 if (*scan == '-') { 3151 sign *= -1; 3152 scan++; // skip '-' 3153 continue; 3154 } 3155 break; 3156 } 3157 SKIP_WS(scan); 3158 KMP_ASSERT2((*scan >= '0') && (*scan <= '9'), 3159 "bad explicit places list"); 3160 next = scan; 3161 SKIP_DIGITS(next); 3162 stride = __kmp_str_to_int(scan, *next); 3163 KMP_DEBUG_ASSERT(stride >= 0); 3164 scan = next; 3165 stride *= sign; 3166 3167 if (stride > 0) { 3168 int i; 3169 for (i = 0; i < count; i++) { 3170 int j; 3171 if (setSize == 0) { 3172 break; 3173 } 3174 ADD_MASK(tempMask); 3175 setSize = 0; 3176 for (j = __kmp_affin_mask_size * CHAR_BIT - 1; j >= stride; j--) { 3177 if (KMP_CPU_ISSET(j - stride, tempMask)) { 3178 KMP_CPU_SET(j, tempMask); 3179 setSize++; 3180 } 3181 else { 3182 KMP_CPU_CLR(j, tempMask); 3183 } 3184 } 3185 for (; j >= 0; j--) { 3186 KMP_CPU_CLR(j, tempMask); 3187 } 3188 } 3189 } 3190 else { 3191 int i; 3192 for (i = 0; i < count; i++) { 3193 unsigned j; 3194 if (setSize == 0) { 3195 break; 3196 } 3197 ADD_MASK(tempMask); 3198 setSize = 0; 3199 for (j = 0; j < (__kmp_affin_mask_size * CHAR_BIT) + stride; 3200 j++) { 3201 if (KMP_CPU_ISSET(j - stride, tempMask)) { 3202 KMP_CPU_SET(j, tempMask); 3203 setSize++; 3204 } 3205 else { 3206 KMP_CPU_CLR(j, tempMask); 3207 } 3208 } 3209 for (; j < __kmp_affin_mask_size * CHAR_BIT; j++) { 3210 KMP_CPU_CLR(j, tempMask); 3211 } 3212 } 3213 } 3214 KMP_CPU_ZERO(tempMask); 3215 setSize = 0; 3216 3217 // 3218 // valid follow sets are ',' and EOL 3219 // 3220 SKIP_WS(scan); 3221 if (*scan == '\0') { 3222 break; 3223 } 3224 if (*scan == ',') { 3225 scan++; // skip ',' 3226 continue; 3227 } 3228 3229 KMP_ASSERT2(0, "bad explicit places list"); 3230 } 3231 3232 *out_numMasks = nextNewMask; 3233 if (nextNewMask == 0) { 3234 *out_masks = NULL; 3235 KMP_INTERNAL_FREE(newMasks); 3236 return; 3237 } 3238 *out_masks 3239 = (kmp_affin_mask_t *)__kmp_allocate(nextNewMask * __kmp_affin_mask_size); 3240 memcpy(*out_masks, newMasks, nextNewMask * __kmp_affin_mask_size); 3241 __kmp_free(tempMask); 3242 KMP_INTERNAL_FREE(newMasks); 3243 } 3244 3245 # endif /* OMP_40_ENABLED */ 3246 3247 #undef ADD_MASK 3248 #undef ADD_MASK_OSID 3249 3250 3251 # if KMP_MIC 3252 3253 static void 3254 __kmp_apply_thread_places(AddrUnsPair **pAddr, int depth) 3255 { 3256 if ( __kmp_place_num_cores == 0 ) { 3257 if ( __kmp_place_num_threads_per_core == 0 ) { 3258 return; // no cores limiting actions requested, exit 3259 } 3260 __kmp_place_num_cores = nCoresPerPkg; // use all available cores 3261 } 3262 if ( !__kmp_affinity_uniform_topology() || depth != 3 ) { 3263 KMP_WARNING( AffThrPlaceUnsupported ); 3264 return; // don't support non-uniform topology or not-3-level architecture 3265 } 3266 if ( __kmp_place_num_threads_per_core == 0 ) { 3267 __kmp_place_num_threads_per_core = __kmp_nThreadsPerCore; // use all HW contexts 3268 } 3269 if ( __kmp_place_core_offset + __kmp_place_num_cores > 
nCoresPerPkg ) { 3270 KMP_WARNING( AffThrPlaceManyCores ); 3271 return; 3272 } 3273 3274 AddrUnsPair *newAddr = (AddrUnsPair *)__kmp_allocate( sizeof(AddrUnsPair) * 3275 nPackages * __kmp_place_num_cores * __kmp_place_num_threads_per_core); 3276 int i, j, k, n_old = 0, n_new = 0; 3277 for ( i = 0; i < nPackages; ++i ) { 3278 for ( j = 0; j < nCoresPerPkg; ++j ) { 3279 if ( j < __kmp_place_core_offset || j >= __kmp_place_core_offset + __kmp_place_num_cores ) { 3280 n_old += __kmp_nThreadsPerCore; // skip not-requested core 3281 } else { 3282 for ( k = 0; k < __kmp_nThreadsPerCore; ++k ) { 3283 if ( k < __kmp_place_num_threads_per_core ) { 3284 newAddr[n_new] = (*pAddr)[n_old]; // copy requested core' data to new location 3285 n_new++; 3286 } 3287 n_old++; 3288 } 3289 } 3290 } 3291 } 3292 nCoresPerPkg = __kmp_place_num_cores; // correct nCoresPerPkg 3293 __kmp_nThreadsPerCore = __kmp_place_num_threads_per_core; // correct __kmp_nThreadsPerCore 3294 __kmp_avail_proc = n_new; // correct avail_proc 3295 __kmp_ncores = nPackages * __kmp_place_num_cores; // correct ncores 3296 3297 __kmp_free( *pAddr ); 3298 *pAddr = newAddr; // replace old topology with new one 3299 } 3300 3301 # endif /* KMP_MIC */ 3302 3303 3304 static AddrUnsPair *address2os = NULL; 3305 static int * procarr = NULL; 3306 static int __kmp_aff_depth = 0; 3307 3308 static void 3309 __kmp_aux_affinity_initialize(void) 3310 { 3311 if (__kmp_affinity_masks != NULL) { 3312 KMP_ASSERT(fullMask != NULL); 3313 return; 3314 } 3315 3316 // 3317 // Create the "full" mask - this defines all of the processors that we 3318 // consider to be in the machine model. If respect is set, then it is 3319 // the initialization thread's affinity mask. Otherwise, it is all 3320 // processors that we know about on the machine. 3321 // 3322 if (fullMask == NULL) { 3323 fullMask = (kmp_affin_mask_t *)__kmp_allocate(__kmp_affin_mask_size); 3324 } 3325 if (KMP_AFFINITY_CAPABLE()) { 3326 if (__kmp_affinity_respect_mask) { 3327 __kmp_get_system_affinity(fullMask, TRUE); 3328 3329 // 3330 // Count the number of available processors. 3331 // 3332 unsigned i; 3333 __kmp_avail_proc = 0; 3334 for (i = 0; i < KMP_CPU_SETSIZE; ++i) { 3335 if (! KMP_CPU_ISSET(i, fullMask)) { 3336 continue; 3337 } 3338 __kmp_avail_proc++; 3339 } 3340 if (__kmp_avail_proc > __kmp_xproc) { 3341 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3342 && (__kmp_affinity_type != affinity_none))) { 3343 KMP_WARNING(ErrorInitializeAffinity); 3344 } 3345 __kmp_affinity_type = affinity_none; 3346 __kmp_affin_mask_size = 0; 3347 return; 3348 } 3349 } 3350 else { 3351 __kmp_affinity_entire_machine_mask(fullMask); 3352 __kmp_avail_proc = __kmp_xproc; 3353 } 3354 } 3355 3356 int depth = -1; 3357 kmp_i18n_id_t msg_id = kmp_i18n_null; 3358 3359 // 3360 // For backward compatiblity, setting KMP_CPUINFO_FILE => 3361 // KMP_TOPOLOGY_METHOD=cpuinfo 3362 // 3363 if ((__kmp_cpuinfo_file != NULL) && 3364 (__kmp_affinity_top_method == affinity_top_method_all)) { 3365 __kmp_affinity_top_method = affinity_top_method_cpuinfo; 3366 } 3367 3368 if (__kmp_affinity_top_method == affinity_top_method_all) { 3369 // 3370 // In the default code path, errors are not fatal - we just try using 3371 // another method. We only emit a warning message if affinity is on, 3372 // or the verbose flag is set, an the nowarnings flag was not set. 
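    //
    // The probes below are tried in order: x2APIC id decoding, then the
    // legacy APIC id method (x86 / x86_64 only), then parsing /proc/cpuinfo
    // (Linux* OS only), and finally the flat OS proc map.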
3373 // 3374 const char *file_name = NULL; 3375 int line = 0; 3376 3377 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3378 3379 if (__kmp_affinity_verbose) { 3380 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", KMP_I18N_STR(Decodingx2APIC)); 3381 } 3382 3383 file_name = NULL; 3384 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3385 if (depth == 0) { 3386 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3387 KMP_ASSERT(address2os == NULL); 3388 return; 3389 } 3390 3391 if (depth < 0) { 3392 if ((msg_id != kmp_i18n_null) 3393 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3394 && (__kmp_affinity_type != affinity_none)))) { 3395 # if KMP_MIC 3396 if (__kmp_affinity_verbose) { 3397 KMP_INFORM(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3398 KMP_I18N_STR(DecodingLegacyAPIC)); 3399 } 3400 # else 3401 KMP_WARNING(AffInfoStrStr, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), 3402 KMP_I18N_STR(DecodingLegacyAPIC)); 3403 # endif 3404 } 3405 3406 file_name = NULL; 3407 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3408 if (depth == 0) { 3409 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3410 KMP_ASSERT(address2os == NULL); 3411 return; 3412 } 3413 } 3414 3415 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3416 3417 # if KMP_OS_LINUX 3418 3419 if (depth < 0) { 3420 if ((msg_id != kmp_i18n_null) 3421 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3422 && (__kmp_affinity_type != affinity_none)))) { 3423 # if KMP_MIC 3424 if (__kmp_affinity_verbose) { 3425 KMP_INFORM(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3426 } 3427 # else 3428 KMP_WARNING(AffStrParseFilename, "KMP_AFFINITY", __kmp_i18n_catgets(msg_id), "/proc/cpuinfo"); 3429 # endif 3430 } 3431 else if (__kmp_affinity_verbose) { 3432 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", "/proc/cpuinfo"); 3433 } 3434 3435 FILE *f = fopen("/proc/cpuinfo", "r"); 3436 if (f == NULL) { 3437 msg_id = kmp_i18n_str_CantOpenCpuinfo; 3438 } 3439 else { 3440 file_name = "/proc/cpuinfo"; 3441 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3442 fclose(f); 3443 if (depth == 0) { 3444 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3445 KMP_ASSERT(address2os == NULL); 3446 return; 3447 } 3448 } 3449 } 3450 3451 # endif /* KMP_OS_LINUX */ 3452 3453 if (depth < 0) { 3454 if (msg_id != kmp_i18n_null 3455 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3456 && (__kmp_affinity_type != affinity_none)))) { 3457 if (file_name == NULL) { 3458 KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3459 } 3460 else if (line == 0) { 3461 KMP_WARNING(UsingFlatOSFile, file_name, __kmp_i18n_catgets(msg_id)); 3462 } 3463 else { 3464 KMP_WARNING(UsingFlatOSFileLine, file_name, line, __kmp_i18n_catgets(msg_id)); 3465 } 3466 } 3467 3468 file_name = ""; 3469 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3470 if (depth == 0) { 3471 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3472 KMP_ASSERT(address2os == NULL); 3473 return; 3474 } 3475 KMP_ASSERT(depth > 0); 3476 KMP_ASSERT(address2os != NULL); 3477 } 3478 } 3479 3480 // 3481 // If the user has specified that a paricular topology discovery method 3482 // is to be used, then we abort if that method fails. The exception is 3483 // group affinity, which might have been implicitly set. 
3484 // 3485 3486 # if KMP_ARCH_X86 || KMP_ARCH_X86_64 3487 3488 else if (__kmp_affinity_top_method == affinity_top_method_x2apicid) { 3489 if (__kmp_affinity_verbose) { 3490 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3491 KMP_I18N_STR(Decodingx2APIC)); 3492 } 3493 3494 depth = __kmp_affinity_create_x2apicid_map(&address2os, &msg_id); 3495 if (depth == 0) { 3496 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3497 KMP_ASSERT(address2os == NULL); 3498 return; 3499 } 3500 3501 if (depth < 0) { 3502 KMP_ASSERT(msg_id != kmp_i18n_null); 3503 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3504 } 3505 } 3506 else if (__kmp_affinity_top_method == affinity_top_method_apicid) { 3507 if (__kmp_affinity_verbose) { 3508 KMP_INFORM(AffInfoStr, "KMP_AFFINITY", 3509 KMP_I18N_STR(DecodingLegacyAPIC)); 3510 } 3511 3512 depth = __kmp_affinity_create_apicid_map(&address2os, &msg_id); 3513 if (depth == 0) { 3514 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3515 KMP_ASSERT(address2os == NULL); 3516 return; 3517 } 3518 3519 if (depth < 0) { 3520 KMP_ASSERT(msg_id != kmp_i18n_null); 3521 KMP_FATAL(MsgExiting, __kmp_i18n_catgets(msg_id)); 3522 } 3523 } 3524 3525 # endif /* KMP_ARCH_X86 || KMP_ARCH_X86_64 */ 3526 3527 else if (__kmp_affinity_top_method == affinity_top_method_cpuinfo) { 3528 const char *filename; 3529 if (__kmp_cpuinfo_file != NULL) { 3530 filename = __kmp_cpuinfo_file; 3531 } 3532 else { 3533 filename = "/proc/cpuinfo"; 3534 } 3535 3536 if (__kmp_affinity_verbose) { 3537 KMP_INFORM(AffParseFilename, "KMP_AFFINITY", filename); 3538 } 3539 3540 FILE *f = fopen(filename, "r"); 3541 if (f == NULL) { 3542 int code = errno; 3543 if (__kmp_cpuinfo_file != NULL) { 3544 __kmp_msg( 3545 kmp_ms_fatal, 3546 KMP_MSG(CantOpenFileForReading, filename), 3547 KMP_ERR(code), 3548 KMP_HNT(NameComesFrom_CPUINFO_FILE), 3549 __kmp_msg_null 3550 ); 3551 } 3552 else { 3553 __kmp_msg( 3554 kmp_ms_fatal, 3555 KMP_MSG(CantOpenFileForReading, filename), 3556 KMP_ERR(code), 3557 __kmp_msg_null 3558 ); 3559 } 3560 } 3561 int line = 0; 3562 depth = __kmp_affinity_create_cpuinfo_map(&address2os, &line, &msg_id, f); 3563 fclose(f); 3564 if (depth < 0) { 3565 KMP_ASSERT(msg_id != kmp_i18n_null); 3566 if (line > 0) { 3567 KMP_FATAL(FileLineMsgExiting, filename, line, __kmp_i18n_catgets(msg_id)); 3568 } 3569 else { 3570 KMP_FATAL(FileMsgExiting, filename, __kmp_i18n_catgets(msg_id)); 3571 } 3572 } 3573 if (__kmp_affinity_type == affinity_none) { 3574 KMP_ASSERT(depth == 0); 3575 KMP_ASSERT(address2os == NULL); 3576 return; 3577 } 3578 } 3579 3580 # if KMP_OS_WINDOWS && KMP_ARCH_X86_64 3581 3582 else if (__kmp_affinity_top_method == affinity_top_method_group) { 3583 if (__kmp_affinity_verbose) { 3584 KMP_INFORM(AffWindowsProcGroupMap, "KMP_AFFINITY"); 3585 } 3586 3587 depth = __kmp_affinity_create_proc_group_map(&address2os, &msg_id); 3588 KMP_ASSERT(depth != 0); 3589 3590 if (depth < 0) { 3591 if ((msg_id != kmp_i18n_null) 3592 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3593 && (__kmp_affinity_type != affinity_none)))) { 3594 KMP_WARNING(UsingFlatOS, __kmp_i18n_catgets(msg_id)); 3595 } 3596 3597 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3598 if (depth == 0) { 3599 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3600 KMP_ASSERT(address2os == NULL); 3601 return; 3602 } 3603 // should not fail 3604 KMP_ASSERT(depth > 0); 3605 KMP_ASSERT(address2os != NULL); 3606 } 3607 } 3608 3609 # endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */ 3610 3611 else if (__kmp_affinity_top_method == 
affinity_top_method_flat) { 3612 if (__kmp_affinity_verbose) { 3613 KMP_INFORM(AffUsingFlatOS, "KMP_AFFINITY"); 3614 } 3615 3616 depth = __kmp_affinity_create_flat_map(&address2os, &msg_id); 3617 if (depth == 0) { 3618 KMP_ASSERT(__kmp_affinity_type == affinity_none); 3619 KMP_ASSERT(address2os == NULL); 3620 return; 3621 } 3622 // should not fail 3623 KMP_ASSERT(depth > 0); 3624 KMP_ASSERT(address2os != NULL); 3625 } 3626 3627 if (address2os == NULL) { 3628 if (KMP_AFFINITY_CAPABLE() 3629 && (__kmp_affinity_verbose || (__kmp_affinity_warnings 3630 && (__kmp_affinity_type != affinity_none)))) { 3631 KMP_WARNING(ErrorInitializeAffinity); 3632 } 3633 __kmp_affinity_type = affinity_none; 3634 __kmp_affin_mask_size = 0; 3635 return; 3636 } 3637 3638 # if KMP_MIC 3639 __kmp_apply_thread_places(&address2os, depth); 3640 # endif 3641 3642 // 3643 // Create the table of masks, indexed by thread Id. 3644 // 3645 unsigned maxIndex; 3646 unsigned numUnique; 3647 kmp_affin_mask_t *osId2Mask = __kmp_create_masks(&maxIndex, &numUnique, 3648 address2os, __kmp_avail_proc); 3649 if (__kmp_affinity_gran_levels == 0) { 3650 KMP_DEBUG_ASSERT(numUnique == __kmp_avail_proc); 3651 } 3652 3653 // 3654 // Set the childNums vector in all Address objects. This must be done 3655 // before we can sort using __kmp_affinity_cmp_Address_child_num(), 3656 // which takes into account the setting of __kmp_affinity_compact. 3657 // 3658 __kmp_affinity_assign_child_nums(address2os, __kmp_avail_proc); 3659 3660 switch (__kmp_affinity_type) { 3661 3662 case affinity_explicit: 3663 KMP_DEBUG_ASSERT(__kmp_affinity_proclist != NULL); 3664 # if OMP_40_ENABLED 3665 if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel) 3666 # endif 3667 { 3668 __kmp_affinity_process_proclist(&__kmp_affinity_masks, 3669 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3670 maxIndex); 3671 } 3672 # if OMP_40_ENABLED 3673 else { 3674 __kmp_affinity_process_placelist(&__kmp_affinity_masks, 3675 &__kmp_affinity_num_masks, __kmp_affinity_proclist, osId2Mask, 3676 maxIndex); 3677 } 3678 # endif 3679 if (__kmp_affinity_num_masks == 0) { 3680 if (__kmp_affinity_verbose || (__kmp_affinity_warnings 3681 && (__kmp_affinity_type != affinity_none))) { 3682 KMP_WARNING(AffNoValidProcID); 3683 } 3684 __kmp_affinity_type = affinity_none; 3685 return; 3686 } 3687 break; 3688 3689 // 3690 // The other affinity types rely on sorting the Addresses according 3691 // to some permutation of the machine topology tree. Set 3692 // __kmp_affinity_compact and __kmp_affinity_offset appropriately, 3693 // then jump to a common code fragment to do the sort and create 3694 // the array of affinity masks. 
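    //
    // Roughly, "compact" with a permute value of 0 packs consecutive gtids
    // onto neighboring hw threads, while "scatter" inverts the permute value
    // (depth - 1 - compact) so that consecutive masks are spread across
    // packages first.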
3695 // 3696 3697 case affinity_logical: 3698 __kmp_affinity_compact = 0; 3699 if (__kmp_affinity_offset) { 3700 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3701 % __kmp_avail_proc; 3702 } 3703 goto sortAddresses; 3704 3705 case affinity_physical: 3706 if (__kmp_nThreadsPerCore > 1) { 3707 __kmp_affinity_compact = 1; 3708 if (__kmp_affinity_compact >= depth) { 3709 __kmp_affinity_compact = 0; 3710 } 3711 } else { 3712 __kmp_affinity_compact = 0; 3713 } 3714 if (__kmp_affinity_offset) { 3715 __kmp_affinity_offset = __kmp_nThreadsPerCore * __kmp_affinity_offset 3716 % __kmp_avail_proc; 3717 } 3718 goto sortAddresses; 3719 3720 case affinity_scatter: 3721 if (__kmp_affinity_compact >= depth) { 3722 __kmp_affinity_compact = 0; 3723 } 3724 else { 3725 __kmp_affinity_compact = depth - 1 - __kmp_affinity_compact; 3726 } 3727 goto sortAddresses; 3728 3729 case affinity_compact: 3730 if (__kmp_affinity_compact >= depth) { 3731 __kmp_affinity_compact = depth - 1; 3732 } 3733 goto sortAddresses; 3734 3735 # if KMP_MIC 3736 case affinity_balanced: 3737 // Balanced works only for the case of a single package and uniform topology 3738 if( nPackages > 1 ) { 3739 if( __kmp_affinity_verbose || __kmp_affinity_warnings ) { 3740 KMP_WARNING( AffBalancedNotAvail, "KMP_AFFINITY" ); 3741 } 3742 __kmp_affinity_type = affinity_none; 3743 return; 3744 } else if( __kmp_affinity_uniform_topology() ) { 3745 break; 3746 } else { // Non-uniform topology 3747 3748 // Save the depth for further usage 3749 __kmp_aff_depth = depth; 3750 3751 // Number of hyper threads per core in HT machine 3752 int nth_per_core = __kmp_nThreadsPerCore; 3753 3754 int core_level; 3755 if( nth_per_core > 1 ) { 3756 core_level = depth - 2; 3757 } else { 3758 core_level = depth - 1; 3759 } 3760 int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1; 3761 int nproc = nth_per_core * ncores; 3762 3763 procarr = ( int * )__kmp_allocate( sizeof( int ) * nproc ); 3764 for( int i = 0; i < nproc; i++ ) { 3765 procarr[ i ] = -1; 3766 } 3767 3768 for( int i = 0; i < __kmp_avail_proc; i++ ) { 3769 int proc = address2os[ i ].second; 3770 // If depth == 3 then level=0 - package, level=1 - core, level=2 - thread. 3771 // If there is only one thread per core then depth == 2: level 0 - package, 3772 // level 1 - core. 3773 int level = depth - 1; 3774 3775 // __kmp_nth_per_core == 1 3776 int thread = 0; 3777 int core = address2os[ i ].first.labels[ level ]; 3778 // If the thread level exists, that is we have more than one thread context per core 3779 if( nth_per_core > 1 ) { 3780 thread = address2os[ i ].first.labels[ level ] % nth_per_core; 3781 core = address2os[ i ].first.labels[ level - 1 ]; 3782 } 3783 procarr[ core * nth_per_core + thread ] = proc; 3784 } 3785 3786 break; 3787 } 3788 # endif 3789 3790 sortAddresses: 3791 // 3792 // Allocate the gtid->affinity mask table. 
3793 // 3794 if (__kmp_affinity_dups) { 3795 __kmp_affinity_num_masks = __kmp_avail_proc; 3796 } 3797 else { 3798 __kmp_affinity_num_masks = numUnique; 3799 } 3800 3801 # if OMP_40_ENABLED 3802 if ( ( __kmp_nested_proc_bind.bind_types[0] != proc_bind_intel ) 3803 && ( __kmp_affinity_num_places > 0 ) 3804 && ( (unsigned)__kmp_affinity_num_places < __kmp_affinity_num_masks ) ) { 3805 __kmp_affinity_num_masks = __kmp_affinity_num_places; 3806 } 3807 # endif 3808 3809 __kmp_affinity_masks = (kmp_affin_mask_t*)__kmp_allocate( 3810 __kmp_affinity_num_masks * __kmp_affin_mask_size); 3811 3812 // 3813 // Sort the address2os table according to the current setting of 3814 // __kmp_affinity_compact, then fill out __kmp_affinity_masks. 3815 // 3816 qsort(address2os, __kmp_avail_proc, sizeof(*address2os), 3817 __kmp_affinity_cmp_Address_child_num); 3818 { 3819 int i; 3820 unsigned j; 3821 for (i = 0, j = 0; i < __kmp_avail_proc; i++) { 3822 if ((! __kmp_affinity_dups) && (! address2os[i].first.leader)) { 3823 continue; 3824 } 3825 unsigned osId = address2os[i].second; 3826 kmp_affin_mask_t *src = KMP_CPU_INDEX(osId2Mask, osId); 3827 kmp_affin_mask_t *dest 3828 = KMP_CPU_INDEX(__kmp_affinity_masks, j); 3829 KMP_ASSERT(KMP_CPU_ISSET(osId, src)); 3830 KMP_CPU_COPY(dest, src); 3831 if (++j >= __kmp_affinity_num_masks) { 3832 break; 3833 } 3834 } 3835 KMP_DEBUG_ASSERT(j == __kmp_affinity_num_masks); 3836 } 3837 break; 3838 3839 default: 3840 KMP_ASSERT2(0, "Unexpected affinity setting"); 3841 } 3842 3843 __kmp_free(osId2Mask); 3844 } 3845 3846 3847 void 3848 __kmp_affinity_initialize(void) 3849 { 3850 // 3851 // Much of the code above was written assumming that if a machine was not 3852 // affinity capable, then __kmp_affinity_type == affinity_none. We now 3853 // explicitly represent this as __kmp_affinity_type == affinity_disabled. 3854 // 3855 // There are too many checks for __kmp_affinity_type == affinity_none 3856 // in this code. Instead of trying to change them all, check if 3857 // __kmp_affinity_type == affinity_disabled, and if so, slam it with 3858 // affinity_none, call the real initialization routine, then restore 3859 // __kmp_affinity_type to affinity_disabled. 3860 // 3861 int disabled = (__kmp_affinity_type == affinity_disabled); 3862 if (! KMP_AFFINITY_CAPABLE()) { 3863 KMP_ASSERT(disabled); 3864 } 3865 if (disabled) { 3866 __kmp_affinity_type = affinity_none; 3867 } 3868 __kmp_aux_affinity_initialize(); 3869 if (disabled) { 3870 __kmp_affinity_type = affinity_disabled; 3871 } 3872 } 3873 3874 3875 void 3876 __kmp_affinity_uninitialize(void) 3877 { 3878 if (__kmp_affinity_masks != NULL) { 3879 __kmp_free(__kmp_affinity_masks); 3880 __kmp_affinity_masks = NULL; 3881 } 3882 if (fullMask != NULL) { 3883 KMP_CPU_FREE(fullMask); 3884 fullMask = NULL; 3885 } 3886 __kmp_affinity_num_masks = 0; 3887 # if OMP_40_ENABLED 3888 __kmp_affinity_num_places = 0; 3889 # endif 3890 if (__kmp_affinity_proclist != NULL) { 3891 __kmp_free(__kmp_affinity_proclist); 3892 __kmp_affinity_proclist = NULL; 3893 } 3894 if( address2os != NULL ) { 3895 __kmp_free( address2os ); 3896 address2os = NULL; 3897 } 3898 if( procarr != NULL ) { 3899 __kmp_free( procarr ); 3900 procarr = NULL; 3901 } 3902 } 3903 3904 3905 void 3906 __kmp_affinity_set_init_mask(int gtid, int isa_root) 3907 { 3908 if (! 

void
__kmp_affinity_set_init_mask(int gtid, int isa_root)
{
    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);
    if (th->th.th_affin_mask == NULL) {
        KMP_CPU_ALLOC(th->th.th_affin_mask);
    }
    else {
        KMP_CPU_ZERO(th->th.th_affin_mask);
    }

    //
    // Copy the thread mask to the kmp_info_t structure.
    // If __kmp_affinity_type == affinity_none, copy the "full" mask, i.e. one
    // that has all of the OS proc ids set.  If __kmp_affinity_respect_mask is
    // set, the full mask is instead the same as the mask of the initializing
    // thread.
    //
    kmp_affin_mask_t *mask;
    int i;

# if OMP_40_ENABLED
    if (__kmp_nested_proc_bind.bind_types[0] == proc_bind_intel)
# endif
    {
        if ((__kmp_affinity_type == affinity_none)
# if KMP_MIC
          || (__kmp_affinity_type == affinity_balanced)
# endif
          ) {
# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = -1;
            mask = fullMask;
        }
        else {
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# if OMP_40_ENABLED
    else {
        if ((! isa_root)
          || (__kmp_nested_proc_bind.bind_types[0] == proc_bind_false)) {
# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_num_proc_groups > 1) {
                return;
            }
# endif
            KMP_ASSERT(fullMask != NULL);
            i = KMP_PLACE_ALL;
            mask = fullMask;
        }
        else {
            //
            // int i = some hash function or just a counter that doesn't
            // always start at 0.  Use gtid for now.
            //
            KMP_DEBUG_ASSERT( __kmp_affinity_num_masks > 0 );
            i = (gtid + __kmp_affinity_offset) % __kmp_affinity_num_masks;
            mask = KMP_CPU_INDEX(__kmp_affinity_masks, i);
        }
    }
# endif

# if OMP_40_ENABLED
    th->th.th_current_place = i;
    if (isa_root) {
        th->th.th_new_place = i;
        th->th.th_first_place = 0;
        th->th.th_last_place = __kmp_affinity_num_masks - 1;
    }

    if (i == KMP_PLACE_ALL) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to all places\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to place %d\n",
          gtid, i));
    }
# else
    if (i == -1) {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to fullMask\n",
          gtid));
    }
    else {
        KA_TRACE(100, ("__kmp_affinity_set_init_mask: binding T#%d to mask %d\n",
          gtid, i));
    }
# endif /* OMP_40_ENABLED */

    KMP_CPU_COPY(th->th.th_affin_mask, mask);

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", gtid, buf);
    }

# if KMP_OS_WINDOWS
    //
    // On Windows* OS, the process affinity mask might have changed.
    // If the user didn't request affinity and this call fails,
    // just continue silently.  See CQ171393.
    //
    if ( __kmp_affinity_type == affinity_none ) {
        __kmp_set_system_affinity(th->th.th_affin_mask, FALSE);
    }
    else
# endif
        __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}


# if OMP_40_ENABLED

void
__kmp_affinity_set_place(int gtid)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return;
    }

    kmp_info_t *th = (kmp_info_t *)TCR_SYNC_PTR(__kmp_threads[gtid]);

    KA_TRACE(100, ("__kmp_affinity_set_place: binding T#%d to place %d (current place = %d)\n",
      gtid, th->th.th_new_place, th->th.th_current_place));

    //
    // Check that the new place is within this thread's partition.
    //
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    KMP_DEBUG_ASSERT(th->th.th_new_place >= 0);
    KMP_DEBUG_ASSERT((unsigned)th->th.th_new_place <= __kmp_affinity_num_masks);
    if (th->th.th_first_place <= th->th.th_last_place) {
        KMP_DEBUG_ASSERT((th->th.th_new_place >= th->th.th_first_place)
          && (th->th.th_new_place <= th->th.th_last_place));
    }
    else {
        KMP_DEBUG_ASSERT((th->th.th_new_place <= th->th.th_first_place)
          || (th->th.th_new_place >= th->th.th_last_place));
    }

    //
    // Copy the thread mask to the kmp_info_t structure,
    // and set this thread's affinity.
    //
    kmp_affin_mask_t *mask = KMP_CPU_INDEX(__kmp_affinity_masks,
      th->th.th_new_place);
    KMP_CPU_COPY(th->th.th_affin_mask, mask);
    th->th.th_current_place = th->th.th_new_place;

    if (__kmp_affinity_verbose) {
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        KMP_INFORM(BoundToOSProcSet, "OMP_PROC_BIND", gtid, buf);
    }
    __kmp_set_system_affinity(th->th.th_affin_mask, TRUE);
}

# endif /* OMP_40_ENABLED */

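
//
// Usage sketch (illustrative only; assumes the kmp_* affinity API exported to
// user code, e.g. kmp_create_affinity_mask / kmp_set_affinity): a call sequence
// such as
//
//     kmp_affinity_mask_t mask;
//     kmp_create_affinity_mask(&mask);
//     kmp_set_affinity_mask_proc(2, &mask);   // request OS proc 2
//     int rc = kmp_set_affinity(&mask);       // non-zero indicates failure
//     kmp_destroy_affinity_mask(&mask);
//
// eventually reaches __kmp_aux_set_affinity() below with a pointer to the
// user's mask.
//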

int
__kmp_aux_set_affinity(void **mask)
{
    int gtid;
    kmp_info_t *th;
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity: setting affinity mask for thread %d = %s\n",
          gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
        }
        else {
            unsigned proc;
            int num_procs = 0;

            for (proc = 0; proc < KMP_CPU_SETSIZE; proc++) {
                if (! KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask))) {
                    continue;
                }
                num_procs++;
                if (! KMP_CPU_ISSET(proc, fullMask)) {
                    KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
                    break;
                }
            }
            if (num_procs == 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }

# if KMP_OS_WINDOWS && KMP_ARCH_X86_64
            if (__kmp_get_proc_group((kmp_affin_mask_t *)(*mask)) < 0) {
                KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
            }
# endif /* KMP_OS_WINDOWS && KMP_ARCH_X86_64 */

        }
    }

    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);
    retval = __kmp_set_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    if (retval == 0) {
        KMP_CPU_COPY(th->th.th_affin_mask, (kmp_affin_mask_t *)(*mask));
    }

# if OMP_40_ENABLED
    th->th.th_current_place = KMP_PLACE_UNDEFINED;
    th->th.th_new_place = KMP_PLACE_UNDEFINED;
    th->th.th_first_place = 0;
    th->th.th_last_place = __kmp_affinity_num_masks - 1;
# endif

    return retval;
}


int
__kmp_aux_get_affinity(void **mask)
{
    int gtid;
    int retval;
    kmp_info_t *th;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    gtid = __kmp_entry_gtid();
    th = __kmp_threads[gtid];
    KMP_DEBUG_ASSERT(th->th.th_affin_mask != NULL);

    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          th->th.th_affin_mask);
        __kmp_printf("kmp_get_affinity: stored affinity mask for thread %d = %s\n", gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity");
        }
    }

# if !KMP_OS_WINDOWS

    retval = __kmp_get_system_affinity((kmp_affin_mask_t *)(*mask), FALSE);
    KA_TRACE(1000, ;{
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_printf("kmp_get_affinity: system affinity mask for thread %d = %s\n", gtid, buf);
    });
    return retval;

# else

    KMP_CPU_COPY((kmp_affin_mask_t *)(*mask), th->th.th_affin_mask);
    return 0;

# endif /* KMP_OS_WINDOWS */

}

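
//
// The *_affinity_mask_proc helpers below share a return convention: -1 for an
// out-of-range proc id (or when affinity is not capable), -2 when the proc id
// is valid but not present in fullMask, and 0 on success.  The "get" variant
// instead returns the result of the membership test and treats an invalid
// proc id as "not set".
//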

int
__kmp_aux_set_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_set_affinity_mask_proc: setting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_SET(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_unset_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_unset_affinity_mask_proc: unsetting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_unset_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return -1;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return -2;
    }

    KMP_CPU_CLR(proc, (kmp_affin_mask_t *)(*mask));
    return 0;
}


int
__kmp_aux_get_affinity_mask_proc(int proc, void **mask)
{
    int retval;

    if (! KMP_AFFINITY_CAPABLE()) {
        return -1;
    }

    KA_TRACE(1000, ;{
        int gtid = __kmp_entry_gtid();
        char buf[KMP_AFFIN_MASK_PRINT_LEN];
        __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN,
          (kmp_affin_mask_t *)(*mask));
        __kmp_debug_printf("kmp_get_affinity_mask_proc: getting proc %d in affinity mask for thread %d = %s\n",
          proc, gtid, buf);
    });

    if (__kmp_env_consistency_check) {
        if ((mask == NULL) || (*mask == NULL)) {
            KMP_FATAL(AffinityInvalidMask, "kmp_get_affinity_mask_proc");
        }
    }

    if ((proc < 0) || ((unsigned)proc >= KMP_CPU_SETSIZE)) {
        return 0;
    }
    if (! KMP_CPU_ISSET(proc, fullMask)) {
        return 0;
    }

    return KMP_CPU_ISSET(proc, (kmp_affin_mask_t *)(*mask));
}

# if KMP_MIC

// Dynamic affinity settings - Affinity balanced
void __kmp_balanced_affinity( int tid, int nthreads )
{
    if( __kmp_affinity_uniform_topology() ) {
        int coreID;
        int threadID;
        // Number of hyper threads per core in HT machine
        int __kmp_nth_per_core = __kmp_avail_proc / __kmp_ncores;
        // Number of cores
        int ncores = __kmp_ncores;
        // How many threads will be bound to each core
        int chunk = nthreads / ncores;
        // How many cores will have an additional thread bound to them - "big cores"
        int big_cores = nthreads % ncores;
        // Number of threads on the big cores
        int big_nth = ( chunk + 1 ) * big_cores;
        if( tid < big_nth ) {
            coreID = tid / ( chunk + 1 );
            threadID = ( tid % ( chunk + 1 ) ) % __kmp_nth_per_core;
        } else { // tid >= big_nth
            coreID = ( tid - big_cores ) / chunk;
            threadID = ( ( tid - big_cores ) % chunk ) % __kmp_nth_per_core;
        }
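        // Worked example (illustrative numbers only): nthreads == 10 and
        // ncores == 4 give chunk == 2, big_cores == 2, big_nth == 6, so cores
        // 0-1 run 3 threads each (tids 0-5) and cores 2-3 run 2 threads each
        // (tids 6-7 and 8-9); e.g. tid == 7 yields coreID == (7 - 2) / 2 == 2.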

        KMP_DEBUG_ASSERT2(KMP_AFFINITY_CAPABLE(),
          "Illegal set affinity operation when not capable");

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Granularity == thread
        if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread ) {
            int osID = address2os[ coreID * __kmp_nth_per_core + threadID ].second;
            KMP_CPU_SET( osID, mask);
        } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
            for( int i = 0; i < __kmp_nth_per_core; i++ ) {
                int osID;
                osID = address2os[ coreID * __kmp_nth_per_core + i ].second;
                KMP_CPU_SET( osID, mask);
            }
        }
        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    } else { // Non-uniform topology

        kmp_affin_mask_t *mask = (kmp_affin_mask_t *)alloca(__kmp_affin_mask_size);
        KMP_CPU_ZERO(mask);

        // Number of hyper threads per core in HT machine
        int nth_per_core = __kmp_nThreadsPerCore;
        int core_level;
        if( nth_per_core > 1 ) {
            core_level = __kmp_aff_depth - 2;
        } else {
            core_level = __kmp_aff_depth - 1;
        }

        // Number of cores - maximum value; it does not count trailing cores with 0 processors
        int ncores = address2os[ __kmp_avail_proc - 1 ].first.labels[ core_level ] + 1;

        // For a performance gain, handle the special case nthreads == __kmp_avail_proc separately
        if( nthreads == __kmp_avail_proc ) {
            if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread ) {
                int osID = address2os[ tid ].second;
                KMP_CPU_SET( osID, mask);
            } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                int coreID = address2os[ tid ].first.labels[ core_level ];
                // Count the osIDs found for the current core; there can be at most
                // nth_per_core of them, and since address2os is sorted we can stop
                // when cnt == nth_per_core.
                int cnt = 0;
                for( int i = 0; i < __kmp_avail_proc; i++ ) {
                    int osID = address2os[ i ].second;
                    int core = address2os[ i ].first.labels[ core_level ];
                    if( core == coreID ) {
                        KMP_CPU_SET( osID, mask);
                        cnt++;
                        if( cnt == nth_per_core ) {
                            break;
                        }
                    }
                }
            }
        } else if( nthreads <= __kmp_ncores ) {

            int core = 0;
            for( int i = 0; i < ncores; i++ ) {
                // Check if this core from procarr[] is in the mask
                int in_mask = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        in_mask = 1;
                        break;
                    }
                }
                if( in_mask ) {
                    if( tid == core ) {
                        for( int j = 0; j < nth_per_core; j++ ) {
                            int osID = procarr[ i * nth_per_core + j ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask );
                                // For granularity=thread it is enough to set the first
                                // available osID for this core
                                if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread ) {
                                    break;
                                }
                            }
                        }
                        break;
                    } else {
                        core++;
                    }
                }
            }

        } else { // nthreads > __kmp_ncores

            // Array to save the number of processors at each core
            int nproc_at_core[ ncores ];
            // Array to save the number of cores with "x" available processors
            int ncores_with_x_procs[ nth_per_core + 1 ];
            // Array to save the number of cores with between x and nth_per_core procs
            int ncores_with_x_to_max_procs[ nth_per_core + 1 ];

            for( int i = 0; i <= nth_per_core; i++ ) {
                ncores_with_x_procs[ i ] = 0;
                ncores_with_x_to_max_procs[ i ] = 0;
            }

            for( int i = 0; i < ncores; i++ ) {
                int cnt = 0;
                for( int j = 0; j < nth_per_core; j++ ) {
                    if( procarr[ i * nth_per_core + j ] != -1 ) {
                        cnt++;
                    }
                }
                nproc_at_core[ i ] = cnt;
                ncores_with_x_procs[ cnt ]++;
            }

            for( int i = 0; i <= nth_per_core; i++ ) {
                for( int j = i; j <= nth_per_core; j++ ) {
                    ncores_with_x_to_max_procs[ i ] += ncores_with_x_procs[ j ];
                }
            }
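
            // At this point nproc_at_core[ i ] is the number of available contexts
            // on core i, ncores_with_x_procs[ x ] the number of cores with exactly
            // x available contexts, and ncores_with_x_to_max_procs[ x ] the number
            // of cores with at least x (i.e. x..nth_per_core) available contexts.
            // E.g. (illustrative numbers only) cores with 4, 2, 4 and 1 contexts
            // give ncores_with_x_procs = { 0, 1, 1, 0, 2 } and
            // ncores_with_x_to_max_procs = { 4, 4, 3, 2, 2 }.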

            // Max number of processors
            int nproc = nth_per_core * ncores;
            // Array to keep the number of threads assigned to each hardware thread context
            int * newarr = ( int * )__kmp_allocate( sizeof( int ) * nproc );
            for( int i = 0; i < nproc; i++ ) {
                newarr[ i ] = 0;
            }

            int nth = nthreads;
            int flag = 0;
            while( nth > 0 ) {
                for( int j = 1; j <= nth_per_core; j++ ) {
                    int cnt = ncores_with_x_to_max_procs[ j ];
                    for( int i = 0; i < ncores; i++ ) {
                        // Skip cores with 0 processors
                        if( nproc_at_core[ i ] == 0 ) {
                            continue;
                        }
                        for( int k = 0; k < nth_per_core; k++ ) {
                            if( procarr[ i * nth_per_core + k ] != -1 ) {
                                if( newarr[ i * nth_per_core + k ] == 0 ) {
                                    newarr[ i * nth_per_core + k ] = 1;
                                    cnt--;
                                    nth--;
                                    break;
                                } else {
                                    if( flag != 0 ) {
                                        newarr[ i * nth_per_core + k ]++;
                                        cnt--;
                                        nth--;
                                        break;
                                    }
                                }
                            }
                        }
                        if( cnt == 0 || nth == 0 ) {
                            break;
                        }
                    }
                    if( nth == 0 ) {
                        break;
                    }
                }
                flag = 1;
            }
            int sum = 0;
            for( int i = 0; i < nproc; i++ ) {
                sum += newarr[ i ];
                if( sum > tid ) {
                    // Granularity == thread
                    if( __kmp_affinity_gran == affinity_gran_fine || __kmp_affinity_gran == affinity_gran_thread ) {
                        int osID = procarr[ i ];
                        KMP_CPU_SET( osID, mask);
                    } else if( __kmp_affinity_gran == affinity_gran_core ) { // Granularity == core
                        int coreID = i / nth_per_core;
                        for( int ii = 0; ii < nth_per_core; ii++ ) {
                            int osID = procarr[ coreID * nth_per_core + ii ];
                            if( osID != -1 ) {
                                KMP_CPU_SET( osID, mask);
                            }
                        }
                    }
                    break;
                }
            }
            __kmp_free( newarr );
        }

        if (__kmp_affinity_verbose) {
            char buf[KMP_AFFIN_MASK_PRINT_LEN];
            __kmp_affinity_print_mask(buf, KMP_AFFIN_MASK_PRINT_LEN, mask);
            KMP_INFORM(BoundToOSProcSet, "KMP_AFFINITY", tid, buf);
        }
        __kmp_set_system_affinity( mask, TRUE );
    }
}

# endif /* KMP_MIC */

#elif KMP_OS_DARWIN
    // affinity not supported
#else
    #error "Unknown or unsupported OS"
#endif // KMP_OS_WINDOWS || KMP_OS_LINUX